Back to home page

EIC code displayed by LXR

 
 

    


Warning, file /include/unicode/stringtriebuilder.h was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).

0001 // © 2016 and later: Unicode, Inc. and others.
0002 // License & terms of use: http://www.unicode.org/copyright.html
0003 /*
0004 *******************************************************************************
0005 *   Copyright (C) 2010-2012,2014, International Business Machines
0006 *   Corporation and others.  All Rights Reserved.
0007 *******************************************************************************
0008 *   file name:  stringtriebuilder.h
0009 *   encoding:   UTF-8
0010 *   tab size:   8 (not used)
0011 *   indentation:4
0012 *
0013 *   created on: 2010dec24
0014 *   created by: Markus W. Scherer
0015 */
0016 
0017 #ifndef __STRINGTRIEBUILDER_H__
0018 #define __STRINGTRIEBUILDER_H__
0019 
0020 #include "unicode/utypes.h"
0021 
0022 #if U_SHOW_CPLUSPLUS_API
0023 
0024 #include "unicode/uobject.h"
0025 
0026 /**
0027  * \file
0028  * \brief C++ API: Builder API for trie builders
0029  */
0030 
0031 // Forward declaration.
0032 /// \cond
0033 struct UHashtable;
0034 typedef struct UHashtable UHashtable;
0035 /// \endcond
0036 
0037 /**
0038  * Build options for BytesTrieBuilder and CharsTrieBuilder.
0039  * @stable ICU 4.8
0040  */
0041 enum UStringTrieBuildOption {
0042     /**
0043      * Builds a trie quickly.
0044      * @stable ICU 4.8
0045      */
0046     USTRINGTRIE_BUILD_FAST,
0047     /**
0048      * Builds a trie more slowly, attempting to generate
0049      * a shorter but equivalent serialization.
0050      * This build option also uses more memory.
0051      *
0052      * This option can be effective when many integer values are the same
0053      * and string/byte sequence suffixes can be shared.
0054      * Runtime speed is not expected to improve.
0055      * @stable ICU 4.8
0056      */
0057     USTRINGTRIE_BUILD_SMALL
0058 };
0059 
0060 U_NAMESPACE_BEGIN
0061 
0062 /**
0063  * Base class for string trie builder classes.
0064  *
0065  * This class is not intended for public subclassing.
0066  * @stable ICU 4.8
0067  */
0068 class U_COMMON_API StringTrieBuilder : public UObject {
0069 public:
0070 #ifndef U_HIDE_INTERNAL_API
0071     /** @internal */
0072     static int32_t hashNode(const void *node);
0073     /** @internal */
0074     static UBool equalNodes(const void *left, const void *right);
0075 #endif  /* U_HIDE_INTERNAL_API */
0076 
0077 protected:
0078     // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
0079     // or else the compiler will create a public default constructor.
0080     /** @internal */
0081     StringTrieBuilder();
0082     /** @internal */
0083     virtual ~StringTrieBuilder();
0084 
0085 #ifndef U_HIDE_INTERNAL_API
0086     /** @internal */
0087     void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
0088     /** @internal */
0089     void deleteCompactBuilder();
0090 
0091     /** @internal */
0092     void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
0093 
0094     /** @internal */
0095     int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
0096     /** @internal */
0097     int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
0098 #endif  /* U_HIDE_INTERNAL_API */
0099 
0100     class Node;
0101 
0102 #ifndef U_HIDE_INTERNAL_API
0103     /** @internal */
0104     Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
0105     /** @internal */
0106     Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
0107                             int32_t length, UErrorCode &errorCode);
0108 #endif  /* U_HIDE_INTERNAL_API */
0109 
0110     /** @internal */
0111     virtual int32_t getElementStringLength(int32_t i) const = 0;
0112     /** @internal */
0113     virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0;
0114     /** @internal */
0115     virtual int32_t getElementValue(int32_t i) const = 0;
0116 
0117     // Finds the first unit index after this one where
0118     // the first and last element have different units again.
0119     /** @internal */
0120     virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
0121 
0122     // Number of different units at unitIndex.
0123     /** @internal */
0124     virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
0125     /** @internal */
0126     virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
0127     /** @internal */
0128     virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0;
0129 
0130     /** @internal */
0131     virtual UBool matchNodesCanHaveValues() const = 0;
0132 
0133     /** @internal */
0134     virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
0135     /** @internal */
0136     virtual int32_t getMinLinearMatch() const = 0;
0137     /** @internal */
0138     virtual int32_t getMaxLinearMatchLength() const = 0;
0139 
0140 #ifndef U_HIDE_INTERNAL_API
0141     // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
0142     /** @internal */
0143     static const int32_t kMaxBranchLinearSubNodeLength=5;
0144 
0145     // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units.
0146     // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
0147     /** @internal */
0148     static const int32_t kMaxSplitBranchLevels=14;
0149 
0150     /**
0151      * Makes sure that there is only one unique node registered that is
0152      * equivalent to newNode.
0153      * @param newNode Input node. The builder takes ownership.
0154      * @param errorCode ICU in/out UErrorCode.
0155                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==nullptr.
0156      * @return newNode if it is the first of its kind, or
0157      *         an equivalent node if newNode is a duplicate.
0158      * @internal
0159      */
0160     Node *registerNode(Node *newNode, UErrorCode &errorCode);
0161     /**
0162      * Makes sure that there is only one unique FinalValueNode registered
0163      * with this value.
0164      * Avoids creating a node if the value is a duplicate.
0165      * @param value A final value.
0166      * @param errorCode ICU in/out UErrorCode.
0167                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==nullptr.
0168      * @return A FinalValueNode with the given value.
0169      * @internal
0170      */
0171     Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
0172 #endif  /* U_HIDE_INTERNAL_API */
0173 
0174     /*
0175      * C++ note:
0176      * registerNode() and registerFinalValue() take ownership of their input nodes,
0177      * and only return owned nodes.
0178      * If they see a failure UErrorCode, they will delete the input node.
0179      * If they get a nullptr pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
0180      * If there is a failure, they return nullptr.
0181      *
0182      * nullptr Node pointers can be safely passed into other Nodes because
0183      * they call the static Node::hashCode() which checks for a nullptr pointer first.
0184      *
0185      * Therefore, as long as builder functions register a new node,
0186      * they need to check for failures only before explicitly dereferencing
0187      * a Node pointer, or before setting a new UErrorCode.
0188      */
0189 
0190     // Hash set of nodes, maps from nodes to integer 1.
0191     /** @internal */
0192     UHashtable *nodes;
0193 
0194     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
0195     // it is needed for layout of other objects.
0196     /**
0197      * @internal
0198      * \cond
0199      */
0200     class Node : public UObject {
0201     public:
0202         Node(int32_t initialHash) : hash(initialHash), offset(0) {}
0203         inline int32_t hashCode() const { return hash; }
0204         // Handles node==nullptr.
0205         static inline int32_t hashCode(const Node *node) { return node==nullptr ? 0 : node->hashCode(); }
0206         // Base class operator==() compares the actual class types.
0207         virtual bool operator==(const Node &other) const;
0208         inline bool operator!=(const Node &other) const { return !operator==(other); }
0209         /**
0210          * Traverses the Node graph and numbers branch edges, with rightmost edges first.
0211          * This is to avoid writing a duplicate node twice.
0212          *
0213          * Branch nodes in this trie data structure are not symmetric.
0214          * Most branch edges "jump" to other nodes but the rightmost branch edges
0215          * just continue without a jump.
0216          * Therefore, write() must write the rightmost branch edge last
0217          * (trie units are written backwards), and must write it at that point even if
0218          * it is a duplicate of a node previously written elsewhere.
0219          *
0220          * This function visits and marks right branch edges first.
0221          * Edges are numbered with increasingly negative values because we share the
0222          * offset field which gets positive values when nodes are written.
0223          * A branch edge also remembers the first number for any of its edges.
0224          *
0225          * When a further-left branch edge has a number in the range of the rightmost
0226          * edge's numbers, then it will be written as part of the required right edge
0227          * and we can avoid writing it first.
0228          *
0229          * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
0230          * edge numbers.
0231          *
0232          * @param edgeNumber The first edge number for this node and its sub-nodes.
0233          * @return An edge number that is at least the maximum-negative
0234          *         of the input edge number and the numbers of this node and all of its sub-nodes.
0235          */
0236         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
0237         // write() must set the offset to a positive value.
0238         virtual void write(StringTrieBuilder &builder) = 0;
0239         // See markRightEdgesFirst.
0240         inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
0241                                                StringTrieBuilder &builder) {
0242             // Note: Edge numbers are negative, lastRight<=firstRight.
0243             // If offset>0 then this node and its sub-nodes have been written already
0244             // and we need not write them again.
0245             // If this node is part of the unwritten right branch edge,
0246             // then we wait until that is written.
0247             if(offset<0 && (offset<lastRight || firstRight<offset)) {
0248                 write(builder);
0249             }
0250         }
0251         inline int32_t getOffset() const { return offset; }
0252     protected:
0253         int32_t hash;
0254         int32_t offset;
0255     };
0256 
0257 #ifndef U_HIDE_INTERNAL_API
0258     // This class should not be overridden because
0259     // registerFinalValue() compares a stack-allocated FinalValueNode
0260     // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
0261     // with the input node, and the
0262     // !Node::operator==(other) used inside FinalValueNode::operator==(other)
0263     // will be false if the typeid's are different.
0264     /** @internal */
0265     class FinalValueNode : public Node {
0266     public:
0267         FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
0268         virtual bool operator==(const Node &other) const override;
0269         virtual void write(StringTrieBuilder &builder) override;
0270     protected:
0271         int32_t value;
0272     };
0273 #endif  /* U_HIDE_INTERNAL_API */
0274 
0275     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
0276     // it is needed for layout of other objects.
0277     /**
0278      * @internal 
0279      */
0280     class ValueNode : public Node {
0281     public:
0282         ValueNode(int32_t initialHash) : Node(initialHash), hasValue(false), value(0) {}
0283         virtual bool operator==(const Node &other) const override;
0284         void setValue(int32_t v) {
0285             hasValue=true;
0286             value=v;
0287             hash=hash*37u+v;
0288         }
0289     protected:
0290         UBool hasValue;
0291         int32_t value;
0292     };
0293 
0294 #ifndef U_HIDE_INTERNAL_API
0295     /** 
0296      * @internal 
0297      */
0298     class IntermediateValueNode : public ValueNode {
0299     public:
0300         IntermediateValueNode(int32_t v, Node *nextNode)
0301                 : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
0302         virtual bool operator==(const Node &other) const override;
0303         virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
0304         virtual void write(StringTrieBuilder &builder) override;
0305     protected:
0306         Node *next;
0307     };
0308 #endif  /* U_HIDE_INTERNAL_API */
0309 
0310     // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
0311     // it is needed for layout of other objects.
0312     /**
0313      * @internal 
0314      */
0315     class LinearMatchNode : public ValueNode {
0316     public:
0317         LinearMatchNode(int32_t len, Node *nextNode)
0318                 : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
0319                   length(len), next(nextNode) {}
0320         virtual bool operator==(const Node &other) const override;
0321         virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
0322     protected:
0323         int32_t length;
0324         Node *next;
0325     };
0326 
0327 #ifndef U_HIDE_INTERNAL_API
0328     /**
0329      * @internal 
0330      */
0331     class BranchNode : public Node {
0332     public:
0333         BranchNode(int32_t initialHash) : Node(initialHash) {}
0334     protected:
0335         int32_t firstEdgeNumber;
0336     };
0337 
0338     /**
0339      * @internal 
0340      */
0341     class ListBranchNode : public BranchNode {
0342     public:
0343         ListBranchNode() : BranchNode(0x444444), length(0) {}
0344         virtual bool operator==(const Node &other) const override;
0345         virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
0346         virtual void write(StringTrieBuilder &builder) override;
0347         // Adds a unit with a final value.
0348         void add(int32_t c, int32_t value) {
0349             units[length]=(char16_t)c;
0350             equal[length]=nullptr;
0351             values[length]=value;
0352             ++length;
0353             hash=(hash*37u+c)*37u+value;
0354         }
0355         // Adds a unit which leads to another match node.
0356         void add(int32_t c, Node *node) {
0357             units[length]=(char16_t)c;
0358             equal[length]=node;
0359             values[length]=0;
0360             ++length;
0361             hash=(hash*37u+c)*37u+hashCode(node);
0362         }
0363     protected:
0364         Node *equal[kMaxBranchLinearSubNodeLength];  // nullptr means "has final value".
0365         int32_t length;
0366         int32_t values[kMaxBranchLinearSubNodeLength];
0367         char16_t units[kMaxBranchLinearSubNodeLength];
0368     };
0369 
0370     /**
0371      * @internal 
0372      */
0373     class SplitBranchNode : public BranchNode {
0374     public:
0375         SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
0376                 : BranchNode(((0x555555u*37u+middleUnit)*37u+
0377                               hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
0378                   unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
0379         virtual bool operator==(const Node &other) const override;
0380         virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
0381         virtual void write(StringTrieBuilder &builder) override;
0382     protected:
0383         char16_t unit;
0384         Node *lessThan;
0385         Node *greaterOrEqual;
0386     };
0387 
0388     // Branch head node, for writing the actual node lead unit.
0389     /** @internal */
0390     class BranchHeadNode : public ValueNode {
0391     public:
0392         BranchHeadNode(int32_t len, Node *subNode)
0393                 : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
0394                   length(len), next(subNode) {}
0395         virtual bool operator==(const Node &other) const override;
0396         virtual int32_t markRightEdgesFirst(int32_t edgeNumber) override;
0397         virtual void write(StringTrieBuilder &builder) override;
0398     protected:
0399         int32_t length;
0400         Node *next;  // A branch sub-node.
0401     };
0402 
0403 #endif  /* U_HIDE_INTERNAL_API */
0404     /// \endcond
0405 
0406     /** @internal */
0407     virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
0408                                         Node *nextNode) const = 0;
0409 
0410     /** @internal */
0411     virtual int32_t write(int32_t unit) = 0;
0412     /** @internal */
0413     virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
0414     /** @internal */
0415     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
0416     /** @internal */
0417     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
0418     /** @internal */
0419     virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
0420 };
0421 
0422 U_NAMESPACE_END
0423 
0424 #endif /* U_SHOW_CPLUSPLUS_API */
0425 
0426 #endif  // __STRINGTRIEBUILDER_H__