|
||||
File indexing completed on 2025-01-18 10:13:03
0001 // © 2016 and later: Unicode, Inc. and others. 0002 // License & terms of use: http://www.unicode.org/copyright.html 0003 /* 0004 ******************************************************************************* 0005 * Copyright (C) 2010-2012, International Business Machines 0006 * Corporation and others. All Rights Reserved. 0007 ******************************************************************************* 0008 * file name: bytestrie.h 0009 * encoding: UTF-8 0010 * tab size: 8 (not used) 0011 * indentation:4 0012 * 0013 * created on: 2010sep25 0014 * created by: Markus W. Scherer 0015 */ 0016 0017 #ifndef __BYTESTRIE_H__ 0018 #define __BYTESTRIE_H__ 0019 0020 /** 0021 * \file 0022 * \brief C++ API: Trie for mapping byte sequences to integer values. 0023 */ 0024 0025 #include "unicode/utypes.h" 0026 0027 #if U_SHOW_CPLUSPLUS_API 0028 0029 #include "unicode/stringpiece.h" 0030 #include "unicode/uobject.h" 0031 #include "unicode/ustringtrie.h" 0032 0033 class BytesTrieTest; 0034 0035 U_NAMESPACE_BEGIN 0036 0037 class ByteSink; 0038 class BytesTrieBuilder; 0039 class CharString; 0040 class UVector32; 0041 0042 /** 0043 * Light-weight, non-const reader class for a BytesTrie. 0044 * Traverses a byte-serialized data structure with minimal state, 0045 * for mapping byte sequences to non-negative integer values. 0046 * 0047 * This class owns the serialized trie data only if it was constructed by 0048 * the builder's build() method. 0049 * The public constructor and the copy constructor only alias the data (only copy the pointer). 0050 * There is no assignment operator. 0051 * 0052 * This class is not intended for public subclassing. 0053 * @stable ICU 4.8 0054 */ 0055 class U_COMMON_API BytesTrie : public UMemory { 0056 public: 0057 /** 0058 * Constructs a BytesTrie reader instance. 0059 * 0060 * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder, 0061 * starting with the first byte of that sequence. 0062 * The BytesTrie object will not read more bytes than 0063 * the BytesTrieBuilder generated in the corresponding build() call. 0064 * 0065 * The array is not copied/cloned and must not be modified while 0066 * the BytesTrie object is in use. 0067 * 0068 * @param trieBytes The byte array that contains the serialized trie. 0069 * @stable ICU 4.8 0070 */ 0071 BytesTrie(const void *trieBytes) 0072 : ownedArray_(nullptr), bytes_(static_cast<const uint8_t *>(trieBytes)), 0073 pos_(bytes_), remainingMatchLength_(-1) {} 0074 0075 /** 0076 * Destructor. 0077 * @stable ICU 4.8 0078 */ 0079 ~BytesTrie(); 0080 0081 /** 0082 * Copy constructor, copies the other trie reader object and its state, 0083 * but not the byte array which will be shared. (Shallow copy.) 0084 * @param other Another BytesTrie object. 0085 * @stable ICU 4.8 0086 */ 0087 BytesTrie(const BytesTrie &other) 0088 : ownedArray_(nullptr), bytes_(other.bytes_), 0089 pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {} 0090 0091 /** 0092 * Resets this trie to its initial state. 0093 * @return *this 0094 * @stable ICU 4.8 0095 */ 0096 BytesTrie &reset() { 0097 pos_=bytes_; 0098 remainingMatchLength_=-1; 0099 return *this; 0100 } 0101 0102 /** 0103 * Returns the state of this trie as a 64-bit integer. 0104 * The state value is never 0. 0105 * 0106 * @return opaque state value 0107 * @see resetToState64 0108 * @stable ICU 65 0109 */ 0110 uint64_t getState64() const { 0111 return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) | 0112 (uint64_t)(pos_ - bytes_); 0113 } 0114 0115 /** 0116 * Resets this trie to the saved state. 0117 * Unlike resetToState(State), the 64-bit state value 0118 * must be from getState64() from the same trie object or 0119 * from one initialized the exact same way. 0120 * Because of no validation, this method is faster. 0121 * 0122 * @param state The opaque trie state value from getState64(). 0123 * @return *this 0124 * @see getState64 0125 * @see resetToState 0126 * @see reset 0127 * @stable ICU 65 0128 */ 0129 BytesTrie &resetToState64(uint64_t state) { 0130 remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2; 0131 pos_ = bytes_ + (state & kState64PosMask); 0132 return *this; 0133 } 0134 0135 /** 0136 * BytesTrie state object, for saving a trie's current state 0137 * and resetting the trie back to this state later. 0138 * @stable ICU 4.8 0139 */ 0140 class State : public UMemory { 0141 public: 0142 /** 0143 * Constructs an empty State. 0144 * @stable ICU 4.8 0145 */ 0146 State() { bytes=nullptr; } 0147 private: 0148 friend class BytesTrie; 0149 0150 const uint8_t *bytes; 0151 const uint8_t *pos; 0152 int32_t remainingMatchLength; 0153 }; 0154 0155 /** 0156 * Saves the state of this trie. 0157 * @param state The State object to hold the trie's state. 0158 * @return *this 0159 * @see resetToState 0160 * @stable ICU 4.8 0161 */ 0162 const BytesTrie &saveState(State &state) const { 0163 state.bytes=bytes_; 0164 state.pos=pos_; 0165 state.remainingMatchLength=remainingMatchLength_; 0166 return *this; 0167 } 0168 0169 /** 0170 * Resets this trie to the saved state. 0171 * If the state object contains no state, or the state of a different trie, 0172 * then this trie remains unchanged. 0173 * @param state The State object which holds a saved trie state. 0174 * @return *this 0175 * @see saveState 0176 * @see reset 0177 * @stable ICU 4.8 0178 */ 0179 BytesTrie &resetToState(const State &state) { 0180 if(bytes_==state.bytes && bytes_!=nullptr) { 0181 pos_=state.pos; 0182 remainingMatchLength_=state.remainingMatchLength; 0183 } 0184 return *this; 0185 } 0186 0187 /** 0188 * Determines whether the byte sequence so far matches, whether it has a value, 0189 * and whether another input byte can continue a matching byte sequence. 0190 * @return The match/value Result. 0191 * @stable ICU 4.8 0192 */ 0193 UStringTrieResult current() const; 0194 0195 /** 0196 * Traverses the trie from the initial state for this input byte. 0197 * Equivalent to reset().next(inByte). 0198 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 0199 * Values below -0x100 and above 0xff will never match. 0200 * @return The match/value Result. 0201 * @stable ICU 4.8 0202 */ 0203 inline UStringTrieResult first(int32_t inByte) { 0204 remainingMatchLength_=-1; 0205 if(inByte<0) { 0206 inByte+=0x100; 0207 } 0208 return nextImpl(bytes_, inByte); 0209 } 0210 0211 /** 0212 * Traverses the trie from the current state for this input byte. 0213 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 0214 * Values below -0x100 and above 0xff will never match. 0215 * @return The match/value Result. 0216 * @stable ICU 4.8 0217 */ 0218 UStringTrieResult next(int32_t inByte); 0219 0220 /** 0221 * Traverses the trie from the current state for this byte sequence. 0222 * Equivalent to 0223 * \code 0224 * Result result=current(); 0225 * for(each c in s) 0226 * if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH; 0227 * result=next(c); 0228 * return result; 0229 * \endcode 0230 * @param s A string or byte sequence. Can be nullptr if length is 0. 0231 * @param length The length of the byte sequence. Can be -1 if NUL-terminated. 0232 * @return The match/value Result. 0233 * @stable ICU 4.8 0234 */ 0235 UStringTrieResult next(const char *s, int32_t length); 0236 0237 /** 0238 * Returns a matching byte sequence's value if called immediately after 0239 * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE. 0240 * getValue() can be called multiple times. 0241 * 0242 * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE! 0243 * @return The value for the byte sequence so far. 0244 * @stable ICU 4.8 0245 */ 0246 inline int32_t getValue() const { 0247 const uint8_t *pos=pos_; 0248 int32_t leadByte=*pos++; 0249 // U_ASSERT(leadByte>=kMinValueLead); 0250 return readValue(pos, leadByte>>1); 0251 } 0252 0253 /** 0254 * Determines whether all byte sequences reachable from the current state 0255 * map to the same value. 0256 * @param uniqueValue Receives the unique value, if this function returns true. 0257 * (output-only) 0258 * @return true if all byte sequences reachable from the current state 0259 * map to the same value. 0260 * @stable ICU 4.8 0261 */ 0262 inline UBool hasUniqueValue(int32_t &uniqueValue) const { 0263 const uint8_t *pos=pos_; 0264 // Skip the rest of a pending linear-match node. 0265 return pos!=nullptr && findUniqueValue(pos+remainingMatchLength_+1, false, uniqueValue); 0266 } 0267 0268 /** 0269 * Finds each byte which continues the byte sequence from the current state. 0270 * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now. 0271 * @param out Each next byte is appended to this object. 0272 * (Only uses the out.Append(s, length) method.) 0273 * @return the number of bytes which continue the byte sequence from here 0274 * @stable ICU 4.8 0275 */ 0276 int32_t getNextBytes(ByteSink &out) const; 0277 0278 /** 0279 * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. 0280 * @stable ICU 4.8 0281 */ 0282 class U_COMMON_API Iterator : public UMemory { 0283 public: 0284 /** 0285 * Iterates from the root of a byte-serialized BytesTrie. 0286 * @param trieBytes The trie bytes. 0287 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 0288 * Otherwise, the iterator returns strings with this maximum length. 0289 * @param errorCode Standard ICU error code. Its input value must 0290 * pass the U_SUCCESS() test, or else the function returns 0291 * immediately. Check for U_FAILURE() on output or use with 0292 * function chaining. (See User Guide for details.) 0293 * @stable ICU 4.8 0294 */ 0295 Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode); 0296 0297 /** 0298 * Iterates from the current state of the specified BytesTrie. 0299 * @param trie The trie whose state will be copied for iteration. 0300 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 0301 * Otherwise, the iterator returns strings with this maximum length. 0302 * @param errorCode Standard ICU error code. Its input value must 0303 * pass the U_SUCCESS() test, or else the function returns 0304 * immediately. Check for U_FAILURE() on output or use with 0305 * function chaining. (See User Guide for details.) 0306 * @stable ICU 4.8 0307 */ 0308 Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode); 0309 0310 /** 0311 * Destructor. 0312 * @stable ICU 4.8 0313 */ 0314 ~Iterator(); 0315 0316 /** 0317 * Resets this iterator to its initial state. 0318 * @return *this 0319 * @stable ICU 4.8 0320 */ 0321 Iterator &reset(); 0322 0323 /** 0324 * @return true if there are more elements. 0325 * @stable ICU 4.8 0326 */ 0327 UBool hasNext() const; 0328 0329 /** 0330 * Finds the next (byte sequence, value) pair if there is one. 0331 * 0332 * If the byte sequence is truncated to the maximum length and does not 0333 * have a real value, then the value is set to -1. 0334 * In this case, this "not a real value" is indistinguishable from 0335 * a real value of -1. 0336 * @param errorCode Standard ICU error code. Its input value must 0337 * pass the U_SUCCESS() test, or else the function returns 0338 * immediately. Check for U_FAILURE() on output or use with 0339 * function chaining. (See User Guide for details.) 0340 * @return true if there is another element. 0341 * @stable ICU 4.8 0342 */ 0343 UBool next(UErrorCode &errorCode); 0344 0345 /** 0346 * @return The NUL-terminated byte sequence for the last successful next(). 0347 * @stable ICU 4.8 0348 */ 0349 StringPiece getString() const; 0350 /** 0351 * @return The value for the last successful next(). 0352 * @stable ICU 4.8 0353 */ 0354 int32_t getValue() const { return value_; } 0355 0356 private: 0357 UBool truncateAndStop(); 0358 0359 const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode); 0360 0361 const uint8_t *bytes_; 0362 const uint8_t *pos_; 0363 const uint8_t *initialPos_; 0364 int32_t remainingMatchLength_; 0365 int32_t initialRemainingMatchLength_; 0366 0367 CharString *str_; 0368 int32_t maxLength_; 0369 int32_t value_; 0370 0371 // The stack stores pairs of integers for backtracking to another 0372 // outbound edge of a branch node. 0373 // The first integer is an offset from bytes_. 0374 // The second integer has the str_->length() from before the node in bits 15..0, 0375 // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) 0376 // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, 0377 // but the code looks more confusing that way.) 0378 UVector32 *stack_; 0379 }; 0380 0381 private: 0382 friend class BytesTrieBuilder; 0383 friend class ::BytesTrieTest; 0384 0385 /** 0386 * Constructs a BytesTrie reader instance. 0387 * Unlike the public constructor which just aliases an array, 0388 * this constructor adopts the builder's array. 0389 * This constructor is only called by the builder. 0390 */ 0391 BytesTrie(void *adoptBytes, const void *trieBytes) 0392 : ownedArray_(static_cast<uint8_t *>(adoptBytes)), 0393 bytes_(static_cast<const uint8_t *>(trieBytes)), 0394 pos_(bytes_), remainingMatchLength_(-1) {} 0395 0396 // No assignment operator. 0397 BytesTrie &operator=(const BytesTrie &other) = delete; 0398 0399 inline void stop() { 0400 pos_=nullptr; 0401 } 0402 0403 // Reads a compact 32-bit integer. 0404 // pos is already after the leadByte, and the lead byte is already shifted right by 1. 0405 static int32_t readValue(const uint8_t *pos, int32_t leadByte); 0406 static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) { 0407 // U_ASSERT(leadByte>=kMinValueLead); 0408 if(leadByte>=(kMinTwoByteValueLead<<1)) { 0409 if(leadByte<(kMinThreeByteValueLead<<1)) { 0410 ++pos; 0411 } else if(leadByte<(kFourByteValueLead<<1)) { 0412 pos+=2; 0413 } else { 0414 pos+=3+((leadByte>>1)&1); 0415 } 0416 } 0417 return pos; 0418 } 0419 static inline const uint8_t *skipValue(const uint8_t *pos) { 0420 int32_t leadByte=*pos++; 0421 return skipValue(pos, leadByte); 0422 } 0423 0424 // Reads a jump delta and jumps. 0425 static const uint8_t *jumpByDelta(const uint8_t *pos); 0426 0427 static inline const uint8_t *skipDelta(const uint8_t *pos) { 0428 int32_t delta=*pos++; 0429 if(delta>=kMinTwoByteDeltaLead) { 0430 if(delta<kMinThreeByteDeltaLead) { 0431 ++pos; 0432 } else if(delta<kFourByteDeltaLead) { 0433 pos+=2; 0434 } else { 0435 pos+=3+(delta&1); 0436 } 0437 } 0438 return pos; 0439 } 0440 0441 static inline UStringTrieResult valueResult(int32_t node) { 0442 return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal)); 0443 } 0444 0445 // Handles a branch node for both next(byte) and next(string). 0446 UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte); 0447 0448 // Requires remainingLength_<0. 0449 UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte); 0450 0451 // Helper functions for hasUniqueValue(). 0452 // Recursively finds a unique value (or whether there is not a unique one) 0453 // from a branch. 0454 static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length, 0455 UBool haveUniqueValue, int32_t &uniqueValue); 0456 // Recursively finds a unique value (or whether there is not a unique one) 0457 // starting from a position on a node lead byte. 0458 static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue); 0459 0460 // Helper functions for getNextBytes(). 0461 // getNextBytes() when pos is on a branch node. 0462 static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out); 0463 static void append(ByteSink &out, int c); 0464 0465 // BytesTrie data structure 0466 // 0467 // The trie consists of a series of byte-serialized nodes for incremental 0468 // string/byte sequence matching. The root node is at the beginning of the trie data. 0469 // 0470 // Types of nodes are distinguished by their node lead byte ranges. 0471 // After each node, except a final-value node, another node follows to 0472 // encode match values or continue matching further bytes. 0473 // 0474 // Node types: 0475 // - Value node: Stores a 32-bit integer in a compact, variable-length format. 0476 // The value is for the string/byte sequence so far. 0477 // One node bit indicates whether the value is final or whether 0478 // matching continues with the next node. 0479 // - Linear-match node: Matches a number of bytes. 0480 // - Branch node: Branches to other nodes according to the current input byte. 0481 // The node byte is the length of the branch (number of bytes to select from) 0482 // minus 1. It is followed by a sub-node: 0483 // - If the length is at most kMaxBranchLinearSubNodeLength, then 0484 // there are length-1 (key, value) pairs and then one more comparison byte. 0485 // If one of the key bytes matches, then the value is either a final value for 0486 // the string/byte sequence so far, or a "jump" delta to the next node. 0487 // If the last byte matches, then matching continues with the next node. 0488 // (Values have the same encoding as value nodes.) 0489 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 0490 // there is one byte and one "jump" delta. 0491 // If the input byte is less than the sub-node byte, then "jump" by delta to 0492 // the next sub-node which will have a length of length/2. 0493 // (The delta has its own compact encoding.) 0494 // Otherwise, skip the "jump" delta to the next sub-node 0495 // which will have a length of length-length/2. 0496 0497 // Node lead byte values. 0498 0499 // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise 0500 // the length is one more than the next byte. 0501 0502 // For a branch sub-node with at most this many entries, we drop down 0503 // to a linear search. 0504 static const int32_t kMaxBranchLinearSubNodeLength=5; 0505 0506 // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. 0507 static const int32_t kMinLinearMatch=0x10; 0508 static const int32_t kMaxLinearMatchLength=0x10; 0509 0510 // 20..ff: Variable-length value node. 0511 // If odd, the value is final. (Otherwise, intermediate value or jump delta.) 0512 // Then shift-right by 1 bit. 0513 // The remaining lead byte value indicates the number of following bytes (0..4) 0514 // and contains the value's top bits. 0515 static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 0516 // It is a final value if bit 0 is set. 0517 static const int32_t kValueIsFinal=1; 0518 0519 // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. 0520 static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10 0521 static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte. 0522 0523 static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 0524 static const int32_t kMaxTwoByteValue=0x1aff; 0525 0526 static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c 0527 static const int32_t kFourByteValueLead=0x7e; 0528 0529 // A little more than Unicode code points. (0x11ffff) 0530 static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; 0531 0532 static const int32_t kFiveByteValueLead=0x7f; 0533 0534 // Compact delta integers. 0535 static const int32_t kMaxOneByteDelta=0xbf; 0536 static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 0537 static const int32_t kMinThreeByteDeltaLead=0xf0; 0538 static const int32_t kFourByteDeltaLead=0xfe; 0539 static const int32_t kFiveByteDeltaLead=0xff; 0540 0541 static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff 0542 static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff 0543 0544 // For getState64(): 0545 // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2 0546 // so we need at least 5 bits for that. 0547 // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength. 0548 static constexpr int32_t kState64RemainingShift = 59; 0549 static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1; 0550 0551 uint8_t *ownedArray_; 0552 0553 // Fixed value referencing the BytesTrie bytes. 0554 const uint8_t *bytes_; 0555 0556 // Iterator variables. 0557 0558 // Pointer to next trie byte to read. nullptr if no more matches. 0559 const uint8_t *pos_; 0560 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 0561 int32_t remainingMatchLength_; 0562 }; 0563 0564 U_NAMESPACE_END 0565 0566 #endif /* U_SHOW_CPLUSPLUS_API */ 0567 0568 #endif // __BYTESTRIE_H__
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |