|
||||
Warning, file /include/unicode/uniset.h was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).
0001 // © 2016 and later: Unicode, Inc. and others. 0002 // License & terms of use: http://www.unicode.org/copyright.html 0003 /* 0004 *************************************************************************** 0005 * Copyright (C) 1999-2016, International Business Machines Corporation 0006 * and others. All Rights Reserved. 0007 *************************************************************************** 0008 * Date Name Description 0009 * 10/20/99 alan Creation. 0010 *************************************************************************** 0011 */ 0012 0013 #ifndef UNICODESET_H 0014 #define UNICODESET_H 0015 0016 #include "unicode/utypes.h" 0017 0018 #if U_SHOW_CPLUSPLUS_API 0019 0020 #include "unicode/ucpmap.h" 0021 #include "unicode/unifilt.h" 0022 #include "unicode/unistr.h" 0023 #include "unicode/uset.h" 0024 0025 /** 0026 * \file 0027 * \brief C++ API: Unicode Set 0028 */ 0029 0030 U_NAMESPACE_BEGIN 0031 0032 // Forward Declarations. 0033 class BMPSet; 0034 class ParsePosition; 0035 class RBBIRuleScanner; 0036 class SymbolTable; 0037 class UnicodeSetStringSpan; 0038 class UVector; 0039 class RuleCharacterIterator; 0040 0041 /** 0042 * A mutable set of Unicode characters and multicharacter strings. Objects of this class 0043 * represent <em>character classes</em> used in regular expressions. 0044 * A character specifies a subset of Unicode code points. Legal 0045 * code points are U+0000 to U+10FFFF, inclusive. 0046 * 0047 * <p>The UnicodeSet class is not designed to be subclassed. 0048 * 0049 * <p><code>UnicodeSet</code> supports two APIs. The first is the 0050 * <em>operand</em> API that allows the caller to modify the value of 0051 * a <code>UnicodeSet</code> object. It conforms to Java 2's 0052 * <code>java.util.Set</code> interface, although 0053 * <code>UnicodeSet</code> does not actually implement that 0054 * interface. 
All methods of <code>Set</code> are supported, with the 0055 * modification that they take a character range or single character 0056 * instead of an <code>Object</code>, and they take a 0057 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The 0058 * operand API may be thought of in terms of boolean logic: a boolean 0059 * OR is implemented by <code>add</code>, a boolean AND is implemented 0060 * by <code>retain</code>, a boolean XOR is implemented by 0061 * <code>complement</code> taking an argument, and a boolean NOT is 0062 * implemented by <code>complement</code> with no argument. In terms 0063 * of traditional set theory function names, <code>add</code> is a 0064 * union, <code>retain</code> is an intersection, <code>remove</code> 0065 * is an asymmetric difference, and <code>complement</code> with no 0066 * argument is a set complement with respect to the superset range 0067 * <code>MIN_VALUE-MAX_VALUE</code> 0068 * 0069 * <p>The second API is the 0070 * <code>applyPattern()</code>/<code>toPattern()</code> API from the 0071 * <code>java.text.Format</code>-derived classes. Unlike the 0072 * methods that add characters, add categories, and control the logic 0073 * of the set, the method <code>applyPattern()</code> sets all 0074 * attributes of a <code>UnicodeSet</code> at once, based on a 0075 * string pattern. 0076 * 0077 * <p><b>Pattern syntax</b></p> 0078 * 0079 * Patterns are accepted by the constructors and the 0080 * <code>applyPattern()</code> methods and returned by the 0081 * <code>toPattern()</code> method. These patterns follow a syntax 0082 * similar to that employed by version 8 regular expression character 0083 * classes. 
Here are some simple examples: 0084 * 0085 * \htmlonly<blockquote>\endhtmlonly 0086 * <table> 0087 * <tr align="top"> 0088 * <td nowrap valign="top" align="left"><code>[]</code></td> 0089 * <td valign="top">No characters</td> 0090 * </tr><tr align="top"> 0091 * <td nowrap valign="top" align="left"><code>[a]</code></td> 0092 * <td valign="top">The character 'a'</td> 0093 * </tr><tr align="top"> 0094 * <td nowrap valign="top" align="left"><code>[ae]</code></td> 0095 * <td valign="top">The characters 'a' and 'e'</td> 0096 * </tr> 0097 * <tr> 0098 * <td nowrap valign="top" align="left"><code>[a-e]</code></td> 0099 * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code 0100 * point order</td> 0101 * </tr> 0102 * <tr> 0103 * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td> 0104 * <td valign="top">The character U+4E01</td> 0105 * </tr> 0106 * <tr> 0107 * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td> 0108 * <td valign="top">The character 'a' and the multicharacter strings "ab" and 0109 * "ac"</td> 0110 * </tr> 0111 * <tr> 0112 * <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td> 0113 * <td valign="top">All characters in the general category Uppercase Letter</td> 0114 * </tr> 0115 * </table> 0116 * \htmlonly</blockquote>\endhtmlonly 0117 * 0118 * Any character may be preceded by a backslash in order to remove any special 0119 * meaning. White space characters, as defined by UCharacter.isWhitespace(), are 0120 * ignored, unless they are escaped. 0121 * 0122 * <p>Property patterns specify a set of characters having a certain 0123 * property as defined by the Unicode standard. Both the POSIX-like 0124 * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. 
For a 0125 * complete list of supported property patterns, see the User's Guide 0126 * for UnicodeSet at 0127 * <a href="https://unicode-org.github.io/icu/userguide/strings/unicodeset"> 0128 * https://unicode-org.github.io/icu/userguide/strings/unicodeset</a>. 0129 * Actual determination of property data is defined by the underlying 0130 * Unicode database as implemented by UCharacter. 0131 * 0132 * <p>Patterns specify individual characters, ranges of characters, and 0133 * Unicode property sets. When elements are concatenated, they 0134 * specify their union. To complement a set, place a '^' immediately 0135 * after the opening '['. Property patterns are inverted by modifying 0136 * their delimiters; "[:^foo:]" and "\\P{foo}". In any other location, 0137 * '^' has no special meaning. 0138 * 0139 * <p>Since ICU 70, "[^...]", "[:^foo:]", "\\P{foo}", and "[:binaryProperty=No:]" 0140 * perform a “code point complement” (all code points minus the original set), 0141 * removing all multicharacter strings, 0142 * equivalent to <code>.complement().removeAllStrings()</code>. 0143 * The complement() API function continues to perform a 0144 * symmetric difference with all code points and thus retains all multicharacter strings. 0145 * 0146 * <p>Ranges are indicated by placing a '-' between two 0147 * characters, as in "a-z". This specifies the range of all 0148 * characters from the left to the right, in Unicode order. If the 0149 * left character is greater than or equal to the 0150 * right character it is a syntax error. If a '-' occurs as the first 0151 * character after the opening '[' or '[^', or if it occurs as the 0152 * last character before the closing ']', then it is taken as a 0153 * literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same 0154 * set of three characters, 'a', 'b', and '-'.
0155 * 0156 * <p>Sets may be intersected using the '&' operator or the asymmetric 0157 * set difference may be taken using the '-' operator, for example, 0158 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters 0159 * with values less than 4096. Operators ('&' and '-') have equal 0160 * precedence and bind left-to-right. Thus 0161 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to 0162 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for 0163 * difference; intersection is commutative. 0164 * 0165 * <table> 0166 * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a' 0167 * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a' 0168 * through 'z' and all letters in between, in Unicode order 0169 * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing 0170 * all characters but 'a' through 'z', 0171 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF 0172 * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code> 0173 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em> 0174 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code> 0175 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em> 0176 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code> 0177 * <td>The asymmetric difference of sets specified by <em>pat1</em> and 0178 * <em>pat2</em> 0179 * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code> 0180 * <td>The set of characters having the specified 0181 * Unicode property; in 0182 * this case, Unicode uppercase letters 0183 * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code> 0184 * <td>The set of characters <em>not</em> having the given 0185 * Unicode property 0186 * </table> 0187 * 0188 * <p><b>Formal syntax</b></p> 0189 * 0190 * \htmlonly<blockquote>\endhtmlonly 0191 * <table> 0192 * <tr align="top"> 0193 * <td nowrap valign="top" align="right"><code>pattern := 
</code></td> 0194 * <td valign="top"><code>('[' '^'? item* ']') | 0195 * property</code></td> 0196 * </tr> 0197 * <tr align="top"> 0198 * <td nowrap valign="top" align="right"><code>item := </code></td> 0199 * <td valign="top"><code>char | (char '-' char) | pattern-expr<br> 0200 * </code></td> 0201 * </tr> 0202 * <tr align="top"> 0203 * <td nowrap valign="top" align="right"><code>pattern-expr := </code></td> 0204 * <td valign="top"><code>pattern | pattern-expr pattern | 0205 * pattern-expr op pattern<br> 0206 * </code></td> 0207 * </tr> 0208 * <tr align="top"> 0209 * <td nowrap valign="top" align="right"><code>op := </code></td> 0210 * <td valign="top"><code>'&' | '-'<br> 0211 * </code></td> 0212 * </tr> 0213 * <tr align="top"> 0214 * <td nowrap valign="top" align="right"><code>special := </code></td> 0215 * <td valign="top"><code>'[' | ']' | '-'<br> 0216 * </code></td> 0217 * </tr> 0218 * <tr align="top"> 0219 * <td nowrap valign="top" align="right"><code>char := </code></td> 0220 * <td valign="top"><em>any character that is not</em><code> special<br> 0221 * | ('\' </code><em>any character</em><code>)<br> 0222 * | ('\\u' hex hex hex hex)<br> 0223 * </code></td> 0224 * </tr> 0225 * <tr align="top"> 0226 * <td nowrap valign="top" align="right"><code>hex := </code></td> 0227 * <td valign="top"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br> 0228 * 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td> 0229 * </tr> 0230 * <tr> 0231 * <td nowrap valign="top" align="right"><code>property := </code></td> 0232 * <td valign="top"><em>a Unicode property set pattern</em></td> 0233 * </tr> 0234 * </table> 0235 * <br> 0236 * <table border="1"> 0237 * <tr> 0238 * <td>Legend: <table> 0239 * <tr> 0240 * <td nowrap valign="top"><code>a := b</code></td> 0241 * <td width="20" valign="top"> </td> 0242 * <td valign="top"><code>a</code> may be replaced by <code>b</code> </td> 0243 * </tr> 0244 * <tr> 0245 * <td nowrap 
valign="top"><code>a?</code></td> 0246 * <td valign="top"></td> 0247 * <td valign="top">zero or one instance of <code>a</code><br> 0248 * </td> 0249 * </tr> 0250 * <tr> 0251 * <td nowrap valign="top"><code>a*</code></td> 0252 * <td valign="top"></td> 0253 * <td valign="top">zero or more instances of <code>a</code><br> 0254 * </td> 0255 * </tr> 0256 * <tr> 0257 * <td nowrap valign="top"><code>a | b</code></td> 0258 * <td valign="top"></td> 0259 * <td valign="top">either <code>a</code> or <code>b</code><br> 0260 * </td> 0261 * </tr> 0262 * <tr> 0263 * <td nowrap valign="top"><code>'a'</code></td> 0264 * <td valign="top"></td> 0265 * <td valign="top">the literal string between the quotes </td> 0266 * </tr> 0267 * </table> 0268 * </td> 0269 * </tr> 0270 * </table> 0271 * \htmlonly</blockquote>\endhtmlonly 0272 * 0273 * <p>Note: 0274 * - Most UnicodeSet methods do not take a UErrorCode parameter because 0275 * there are usually very few opportunities for failure other than a shortage 0276 * of memory, error codes in low-level C++ string methods would be inconvenient, 0277 * and the error code as the last parameter (ICU convention) would prevent 0278 * the use of default parameter values. 0279 * Instead, such methods set the UnicodeSet into a "bogus" state 0280 * (see isBogus()) if an error occurs. 0281 * 0282 * @author Alan Liu 0283 * @stable ICU 2.0 0284 */ 0285 class U_COMMON_API UnicodeSet final : public UnicodeFilter { 0286 private: 0287 /** 0288 * Enough for sets with few ranges. 0289 * For example, White_Space has 10 ranges, list length 21. 0290 */ 0291 static constexpr int32_t INITIAL_CAPACITY = 25; 0292 // fFlags constant 0293 static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. 
not valid) 0294 0295 UChar32* list = stackList; // MUST be terminated with HIGH 0296 int32_t capacity = INITIAL_CAPACITY; // capacity of list 0297 int32_t len = 1; // length of list used; 1 <= len <= capacity 0298 uint8_t fFlags = 0; // Bit flag (see constants above) 0299 0300 BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not nullptr. 0301 UChar32* buffer = nullptr; // internal buffer, may be nullptr 0302 int32_t bufferCapacity = 0; // capacity of buffer 0303 0304 /** 0305 * The pattern representation of this set. This may not be the 0306 * most economical pattern. It is the pattern supplied to 0307 * applyPattern(), with variables substituted and whitespace 0308 * removed. For sets constructed without applyPattern(), or 0309 * modified using the non-pattern API, this string will be empty, 0310 * indicating that toPattern() must generate a pattern 0311 * representation from the inversion list. 0312 */ 0313 char16_t *pat = nullptr; 0314 int32_t patLen = 0; 0315 0316 UVector* strings = nullptr; // maintained in sorted order 0317 UnicodeSetStringSpan *stringSpan = nullptr; 0318 0319 /** 0320 * Initial list array. 0321 * Avoids some heap allocations, and list is never nullptr. 0322 * Increases the object size a bit. 0323 */ 0324 UChar32 stackList[INITIAL_CAPACITY]; 0325 0326 public: 0327 /** 0328 * Determine if this object contains a valid set. 0329 * A bogus set has no value. It is different from an empty set. 0330 * It can be used to indicate that no set value is available. 0331 * 0332 * @return true if the set is bogus/invalid, false otherwise 0333 * @see setToBogus() 0334 * @stable ICU 4.0 0335 */ 0336 inline UBool isBogus(void) const; 0337 0338 /** 0339 * Make this UnicodeSet object invalid. 0340 * The set will test true with isBogus(). 0341 * 0342 * A bogus set has no value. It is different from an empty set. 0343 * It can be used to indicate that no set value is available. 
0344 * 0345 * This utility function is used throughout the UnicodeSet 0346 * implementation to indicate that a UnicodeSet operation failed, 0347 * and may be used in other functions, 0348 * especially but not exclusively when such functions do not 0349 * take a UErrorCode for simplicity. 0350 * 0351 * @see isBogus() 0352 * @stable ICU 4.0 0353 */ 0354 void setToBogus(); 0355 0356 public: 0357 0358 enum { 0359 /** 0360 * Minimum value that can be stored in a UnicodeSet. 0361 * @stable ICU 2.4 0362 */ 0363 MIN_VALUE = 0, 0364 0365 /** 0366 * Maximum value that can be stored in a UnicodeSet. 0367 * @stable ICU 2.4 0368 */ 0369 MAX_VALUE = 0x10ffff 0370 }; 0371 0372 //---------------------------------------------------------------- 0373 // Constructors &c 0374 //---------------------------------------------------------------- 0375 0376 public: 0377 0378 /** 0379 * Constructs an empty set. 0380 * @stable ICU 2.0 0381 */ 0382 UnicodeSet(); 0383 0384 /** 0385 * Constructs a set containing the given range. If <code>end < 0386 * start</code> then an empty set is created. 0387 * 0388 * @param start first character, inclusive, of range 0389 * @param end last character, inclusive, of range 0390 * @stable ICU 2.4 0391 */ 0392 UnicodeSet(UChar32 start, UChar32 end); 0393 0394 #ifndef U_HIDE_INTERNAL_API 0395 /** 0396 * @internal 0397 */ 0398 enum ESerialization { 0399 kSerialized /* result of serialize() */ 0400 }; 0401 0402 /** 0403 * Constructs a set from the output of serialize(). 0404 * 0405 * @param buffer the 16 bit array 0406 * @param bufferLen the original length returned from serialize() 0407 * @param serialization the value 'kSerialized' 0408 * @param status error code 0409 * 0410 * @internal 0411 */ 0412 UnicodeSet(const uint16_t buffer[], int32_t bufferLen, 0413 ESerialization serialization, UErrorCode &status); 0414 #endif /* U_HIDE_INTERNAL_API */ 0415 0416 /** 0417 * Constructs a set from the given pattern. 
See the class 0418 * description for the syntax of the pattern language. 0419 * @param pattern a string specifying what characters are in the set 0420 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 0421 * contains a syntax error. 0422 * @stable ICU 2.0 0423 */ 0424 UnicodeSet(const UnicodeString& pattern, 0425 UErrorCode& status); 0426 0427 #ifndef U_HIDE_INTERNAL_API 0428 /** 0429 * Constructs a set from the given pattern. See the class 0430 * description for the syntax of the pattern language. 0431 * @param pattern a string specifying what characters are in the set 0432 * @param options bitmask for options to apply to the pattern. 0433 * Valid options are USET_IGNORE_SPACE and 0434 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 0435 * These case options are mutually exclusive. 0436 * @param symbols a symbol table mapping variable names to values 0437 * and stand-in characters to UnicodeSets; may be nullptr 0438 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 0439 * contains a syntax error. 0440 * @internal 0441 */ 0442 UnicodeSet(const UnicodeString& pattern, 0443 uint32_t options, 0444 const SymbolTable* symbols, 0445 UErrorCode& status); 0446 #endif /* U_HIDE_INTERNAL_API */ 0447 0448 /** 0449 * Constructs a set from the given pattern. See the class description 0450 * for the syntax of the pattern language. 0451 * @param pattern a string specifying what characters are in the set 0452 * @param pos on input, the position in pattern at which to start parsing. 0453 * On output, the position after the last character parsed. 0454 * @param options bitmask for options to apply to the pattern. 0455 * Valid options are USET_IGNORE_SPACE and 0456 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 0457 * These case options are mutually exclusive. 
0458 * @param symbols a symbol table mapping variable names to values 0459 * and stand-in characters to UnicodeSets; may be nullptr 0460 * @param status input-output error code 0461 * @stable ICU 2.8 0462 */ 0463 UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, 0464 uint32_t options, 0465 const SymbolTable* symbols, 0466 UErrorCode& status); 0467 0468 /** 0469 * Constructs a set that is identical to the given UnicodeSet. 0470 * @stable ICU 2.0 0471 */ 0472 UnicodeSet(const UnicodeSet& o); 0473 0474 /** 0475 * Destructs the set. 0476 * @stable ICU 2.0 0477 */ 0478 virtual ~UnicodeSet(); 0479 0480 /** 0481 * Assigns this object to be a copy of another. 0482 * A frozen set will not be modified. 0483 * @stable ICU 2.0 0484 */ 0485 UnicodeSet& operator=(const UnicodeSet& o); 0486 0487 /** 0488 * Compares the specified object with this set for equality. Returns 0489 * <tt>true</tt> if the two sets 0490 * have the same size, and every member of the specified set is 0491 * contained in this set (or equivalently, every member of this set is 0492 * contained in the specified set). 0493 * 0494 * @param o set to be compared for equality with this set. 0495 * @return <tt>true</tt> if the specified set is equal to this set. 0496 * @stable ICU 2.0 0497 */ 0498 virtual bool operator==(const UnicodeSet& o) const; 0499 0500 /** 0501 * Compares the specified object with this set for equality. Returns 0502 * <tt>true</tt> if the specified set is not equal to this set. 0503 * @stable ICU 2.0 0504 */ 0505 inline bool operator!=(const UnicodeSet& o) const; 0506 0507 /** 0508 * Returns a copy of this object. All UnicodeFunctor objects have 0509 * to support cloning in order to allow classes using 0510 * UnicodeFunctors, such as Transliterator, to implement cloning. 0511 * If this set is frozen, then the clone will be frozen as well. 0512 * Use cloneAsThawed() for a mutable clone of a frozen set. 
0513 * @see cloneAsThawed 0514 * @stable ICU 2.0 0515 */ 0516 virtual UnicodeSet* clone() const override; 0517 0518 /** 0519 * Returns the hash code value for this set. 0520 * 0521 * @return the hash code value for this set. 0522 * @see Object#hashCode() 0523 * @stable ICU 2.0 0524 */ 0525 virtual int32_t hashCode(void) const; 0526 0527 /** 0528 * Get a UnicodeSet pointer from a USet 0529 * 0530 * @param uset a USet (the ICU plain C type for UnicodeSet) 0531 * @return the corresponding UnicodeSet pointer. 0532 * 0533 * @stable ICU 4.2 0534 */ 0535 inline static UnicodeSet *fromUSet(USet *uset); 0536 0537 /** 0538 * Get a UnicodeSet pointer from a const USet 0539 * 0540 * @param uset a const USet (the ICU plain C type for UnicodeSet) 0541 * @return the corresponding UnicodeSet pointer. 0542 * 0543 * @stable ICU 4.2 0544 */ 0545 inline static const UnicodeSet *fromUSet(const USet *uset); 0546 0547 /** 0548 * Produce a USet * pointer for this UnicodeSet. 0549 * USet is the plain C type for UnicodeSet 0550 * 0551 * @return a USet pointer for this UnicodeSet 0552 * @stable ICU 4.2 0553 */ 0554 inline USet *toUSet(); 0555 0556 0557 /** 0558 * Produce a const USet * pointer for this UnicodeSet. 0559 * USet is the plain C type for UnicodeSet 0560 * 0561 * @return a const USet pointer for this UnicodeSet 0562 * @stable ICU 4.2 0563 */ 0564 inline const USet * toUSet() const; 0565 0566 0567 //---------------------------------------------------------------- 0568 // Freezable API 0569 //---------------------------------------------------------------- 0570 0571 /** 0572 * Determines whether the set has been frozen (made immutable) or not. 0573 * See the ICU4J Freezable interface for details. 0574 * @return true/false for whether the set has been frozen 0575 * @see freeze 0576 * @see cloneAsThawed 0577 * @stable ICU 3.8 0578 */ 0579 inline UBool isFrozen() const; 0580 0581 /** 0582 * Freeze the set (make it immutable). 
0583 * Once frozen, it cannot be unfrozen and is therefore thread-safe 0584 * until it is deleted. 0585 * See the ICU4J Freezable interface for details. 0586 * Freezing the set may also make some operations faster, for example 0587 * contains() and span(). 0588 * A frozen set will not be modified. (It remains frozen.) 0589 * @return this set. 0590 * @see isFrozen 0591 * @see cloneAsThawed 0592 * @stable ICU 3.8 0593 */ 0594 UnicodeSet *freeze(); 0595 0596 /** 0597 * Clone the set and make the clone mutable. 0598 * See the ICU4J Freezable interface for details. 0599 * @return the mutable clone 0600 * @see freeze 0601 * @see isFrozen 0602 * @stable ICU 3.8 0603 */ 0604 UnicodeSet *cloneAsThawed() const; 0605 0606 //---------------------------------------------------------------- 0607 // Public API 0608 //---------------------------------------------------------------- 0609 0610 /** 0611 * Make this object represent the range `start - end`. 0612 * If `start > end` then this object is set to an empty range. 0613 * A frozen set will not be modified. 0614 * 0615 * @param start first character in the set, inclusive 0616 * @param end last character in the set, inclusive 0617 * @stable ICU 2.4 0618 */ 0619 UnicodeSet& set(UChar32 start, UChar32 end); 0620 0621 /** 0622 * Return true if the given position, in the given pattern, appears 0623 * to be the start of a UnicodeSet pattern. 0624 * @stable ICU 2.4 0625 */ 0626 static UBool resemblesPattern(const UnicodeString& pattern, 0627 int32_t pos); 0628 0629 /** 0630 * Modifies this set to represent the set specified by the given 0631 * pattern, ignoring Unicode Pattern_White_Space characters. 0632 * See the class description for the syntax of the pattern language. 0633 * A frozen set will not be modified. 0634 * @param pattern a string specifying what characters are in the set 0635 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 0636 * contains a syntax error. 
0637 * <em> Empties the set passed before applying the pattern.</em> 0638 * @return a reference to this 0639 * @stable ICU 2.0 0640 */ 0641 UnicodeSet& applyPattern(const UnicodeString& pattern, 0642 UErrorCode& status); 0643 0644 #ifndef U_HIDE_INTERNAL_API 0645 /** 0646 * Modifies this set to represent the set specified by the given 0647 * pattern, optionally ignoring Unicode Pattern_White_Space characters. 0648 * See the class description for the syntax of the pattern language. 0649 * A frozen set will not be modified. 0650 * @param pattern a string specifying what characters are in the set 0651 * @param options bitmask for options to apply to the pattern. 0652 * Valid options are USET_IGNORE_SPACE and 0653 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 0654 * These case options are mutually exclusive. 0655 * @param symbols a symbol table mapping variable names to 0656 * values and stand-ins to UnicodeSets; may be nullptr 0657 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 0658 * contains a syntax error. 0659 *<em> Empties the set passed before applying the pattern.</em> 0660 * @return a reference to this 0661 * @internal 0662 */ 0663 UnicodeSet& applyPattern(const UnicodeString& pattern, 0664 uint32_t options, 0665 const SymbolTable* symbols, 0666 UErrorCode& status); 0667 #endif /* U_HIDE_INTERNAL_API */ 0668 0669 /** 0670 * Parses the given pattern, starting at the given position. The 0671 * character at pattern.charAt(pos.getIndex()) must be '[', or the 0672 * parse fails. Parsing continues until the corresponding closing 0673 * ']'. If a syntax error is encountered between the opening and 0674 * closing bracket, the parse fails. Upon return from a successful 0675 * parse, the ParsePosition is updated to point to the character 0676 * following the closing ']', and a reference to 0677 * this set is returned. 
This method calls 0678 * itself recursively to parse embedded subpatterns. 0679 *<em> Empties the set passed before applying the pattern.</em> 0680 * A frozen set will not be modified. 0681 * 0682 * @param pattern the string containing the pattern to be parsed. 0683 * The portion of the string from pos.getIndex(), which must be a 0684 * '[', to the corresponding closing ']', is parsed. 0685 * @param pos upon entry, the position at which to begin parsing. 0686 * The character at pattern.charAt(pos.getIndex()) must be a '['. 0687 * Upon return from a successful parse, pos.getIndex() is either 0688 * the character after the closing ']' of the parsed pattern, or 0689 * pattern.length() if the closing ']' is the last character of 0690 * the pattern string. 0691 * @param options bitmask for options to apply to the pattern. 0692 * Valid options are USET_IGNORE_SPACE and 0693 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 0694 * These case options are mutually exclusive. 0695 * @param symbols a symbol table mapping variable names to 0696 * values and stand-ins to UnicodeSets; may be nullptr 0697 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern 0698 * contains a syntax error. 0699 * @return a reference to this 0700 * @stable ICU 2.8 0701 */ 0702 UnicodeSet& applyPattern(const UnicodeString& pattern, 0703 ParsePosition& pos, 0704 uint32_t options, 0705 const SymbolTable* symbols, 0706 UErrorCode& status); 0707 0708 /** 0709 * Returns a string representation of this set. If the result of 0710 * calling this function is passed to a UnicodeSet constructor, it 0711 * will produce another set that is equal to this one. 0712 * A frozen set will not be modified. 0713 * @param result the string to receive the rules. Previous 0714 * contents will be deleted. 0715 * @param escapeUnprintable if true then convert unprintable 0716 * characters to their hex escape representations, \\uxxxx or 0717 * \\Uxxxxxxxx. 
Unprintable characters are those other than 0718 * U+000A, U+0020..U+007E. 0719 * @stable ICU 2.0 0720 */ 0721 virtual UnicodeString& toPattern(UnicodeString& result, 0722 UBool escapeUnprintable = false) const override; 0723 0724 /** 0725 * Modifies this set to contain those code points which have the given value 0726 * for the given binary or enumerated property, as returned by 0727 * u_getIntPropertyValue. Prior contents of this set are lost. 0728 * A frozen set will not be modified. 0729 * 0730 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 0731 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 0732 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. 0733 * 0734 * @param value a value in the range u_getIntPropertyMinValue(prop).. 0735 * u_getIntPropertyMaxValue(prop), with one exception. If prop is 0736 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but 0737 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped 0738 * categories such as [:L:] to be represented. 0739 * 0740 * @param ec error code input/output parameter 0741 * 0742 * @return a reference to this set 0743 * 0744 * @stable ICU 2.4 0745 */ 0746 UnicodeSet& applyIntPropertyValue(UProperty prop, 0747 int32_t value, 0748 UErrorCode& ec); 0749 0750 /** 0751 * Modifies this set to contain those code points which have the 0752 * given value for the given property. Prior contents of this 0753 * set are lost. 0754 * A frozen set will not be modified. 0755 * 0756 * @param prop a property alias, either short or long. The name is matched 0757 * loosely. See PropertyAliases.txt for names and a description of loose 0758 * matching. If the value string is empty, then this string is interpreted 0759 * as either a General_Category value alias, a Script value alias, a binary 0760 * property alias, or a special ID. 
Special IDs are matched loosely and 0761 * correspond to the following sets: 0762 * 0763 * "ANY" = [\\u0000-\\U0010FFFF], 0764 * "ASCII" = [\\u0000-\\u007F], 0765 * "Assigned" = [:^Cn:]. 0766 * 0767 * @param value a value alias, either short or long. The name is matched 0768 * loosely. See PropertyValueAliases.txt for names and a description of 0769 * loose matching. In addition to aliases listed, numeric values and 0770 * canonical combining classes may be expressed numerically, e.g., ("nv", 0771 * "0.5") or ("ccc", "220"). The value string may also be empty. 0772 * 0773 * @param ec error code input/output parameter 0774 * 0775 * @return a reference to this set 0776 * 0777 * @stable ICU 2.4 0778 */ 0779 UnicodeSet& applyPropertyAlias(const UnicodeString& prop, 0780 const UnicodeString& value, 0781 UErrorCode& ec); 0782 0783 /** 0784 * Returns the number of elements in this set (its cardinality). 0785 * Note that the elements of a set may include both individual 0786 * codepoints and strings. 0787 * 0788 * This is slower than getRangeCount() because 0789 * it counts the code points of all ranges. 0790 * 0791 * @return the number of elements in this set (its cardinality). 0792 * @stable ICU 2.0 0793 * @see getRangeCount 0794 */ 0795 virtual int32_t size(void) const; 0796 0797 /** 0798 * Returns <tt>true</tt> if this set contains no elements. 0799 * 0800 * @return <tt>true</tt> if this set contains no elements. 0801 * @stable ICU 2.0 0802 */ 0803 virtual UBool isEmpty(void) const; 0804 0805 /** 0806 * @return true if this set contains multi-character strings or the empty string. 0807 * @stable ICU 70 0808 */ 0809 UBool hasStrings() const; 0810 0811 /** 0812 * Returns true if this set contains the given character. 0813 * This function works faster with a frozen set. 
0814 * @param c character to be checked for containment 0815 * @return true if the test condition is met 0816 * @stable ICU 2.0 0817 */ 0818 virtual UBool contains(UChar32 c) const override; 0819 0820 /** 0821 * Returns true if this set contains every character 0822 * of the given range. 0823 * @param start first character, inclusive, of the range 0824 * @param end last character, inclusive, of the range 0825 * @return true if the test condition is met 0826 * @stable ICU 2.0 0827 */ 0828 virtual UBool contains(UChar32 start, UChar32 end) const; 0829 0830 /** 0831 * Returns <tt>true</tt> if this set contains the given 0832 * multicharacter string. 0833 * @param s string to be checked for containment 0834 * @return <tt>true</tt> if this set contains the specified string 0835 * @stable ICU 2.4 0836 */ 0837 UBool contains(const UnicodeString& s) const; 0838 0839 /** 0840 * Returns true if this set contains all the characters and strings 0841 * of the given set. 0842 * @param c set to be checked for containment 0843 * @return true if the test condition is met 0844 * @stable ICU 2.4 0845 */ 0846 virtual UBool containsAll(const UnicodeSet& c) const; 0847 0848 /** 0849 * Returns true if this set contains all the characters 0850 * of the given string. 0851 * @param s string containing characters to be checked for containment 0852 * @return true if the test condition is met 0853 * @stable ICU 2.4 0854 */ 0855 UBool containsAll(const UnicodeString& s) const; 0856 0857 /** 0858 * Returns true if this set contains none of the characters 0859 * of the given range. 0860 * @param start first character, inclusive, of the range 0861 * @param end last character, inclusive, of the range 0862 * @return true if the test condition is met 0863 * @stable ICU 2.4 0864 */ 0865 UBool containsNone(UChar32 start, UChar32 end) const; 0866 0867 /** 0868 * Returns true if this set contains none of the characters and strings 0869 * of the given set. 
0870 * @param c set to be checked for containment 0871 * @return true if the test condition is met 0872 * @stable ICU 2.4 0873 */ 0874 UBool containsNone(const UnicodeSet& c) const; 0875 0876 /** 0877 * Returns true if this set contains none of the characters 0878 * of the given string. 0879 * @param s string containing characters to be checked for containment 0880 * @return true if the test condition is met 0881 * @stable ICU 2.4 0882 */ 0883 UBool containsNone(const UnicodeString& s) const; 0884 0885 /** 0886 * Returns true if this set contains one or more of the characters 0887 * in the given range. 0888 * @param start first character, inclusive, of the range 0889 * @param end last character, inclusive, of the range 0890 * @return true if the condition is met 0891 * @stable ICU 2.4 0892 */ 0893 inline UBool containsSome(UChar32 start, UChar32 end) const; 0894 0895 /** 0896 * Returns true if this set contains one or more of the characters 0897 * and strings of the given set. 0898 * @param s The set to be checked for containment 0899 * @return true if the condition is met 0900 * @stable ICU 2.4 0901 */ 0902 inline UBool containsSome(const UnicodeSet& s) const; 0903 0904 /** 0905 * Returns true if this set contains one or more of the characters 0906 * of the given string. 0907 * @param s string containing characters to be checked for containment 0908 * @return true if the condition is met 0909 * @stable ICU 2.4 0910 */ 0911 inline UBool containsSome(const UnicodeString& s) const; 0912 0913 /** 0914 * Returns the length of the initial substring of the input string which 0915 * consists only of characters and strings that are contained in this set 0916 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 0917 * or only of characters and strings that are not contained 0918 * in this set (USET_SPAN_NOT_CONTAINED). 0919 * See USetSpanCondition for details. 0920 * Similar to the strspn() C library function. 
0921 * Unpaired surrogates are treated according to contains() of their surrogate code points. 0922 * This function works faster with a frozen set and with a non-negative string length argument. 0923 * @param s start of the string 0924 * @param length of the string; can be -1 for NUL-terminated 0925 * @param spanCondition specifies the containment condition 0926 * @return the length of the initial substring according to the spanCondition; 0927 * 0 if the start of the string does not fit the spanCondition 0928 * @stable ICU 3.8 0929 * @see USetSpanCondition 0930 */ 0931 int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const; 0932 0933 /** 0934 * Returns the end of the substring of the input string according to the USetSpanCondition. 0935 * Same as <code>start+span(s.getBuffer()+start, s.length()-start, spanCondition)</code> 0936 * after pinning start to 0<=start<=s.length(). 0937 * @param s the string 0938 * @param start the start index in the string for the span operation 0939 * @param spanCondition specifies the containment condition 0940 * @return the exclusive end of the substring according to the spanCondition; 0941 * the substring s.tempSubStringBetween(start, end) fulfills the spanCondition 0942 * @stable ICU 4.4 0943 * @see USetSpanCondition 0944 */ 0945 inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const; 0946 0947 /** 0948 * Returns the start of the trailing substring of the input string which 0949 * consists only of characters and strings that are contained in this set 0950 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 0951 * or only of characters and strings that are not contained 0952 * in this set (USET_SPAN_NOT_CONTAINED). 0953 * See USetSpanCondition for details. 0954 * Unpaired surrogates are treated according to contains() of their surrogate code points. 0955 * This function works faster with a frozen set and with a non-negative string length argument. 
0956 * @param s start of the string 0957 * @param length of the string; can be -1 for NUL-terminated 0958 * @param spanCondition specifies the containment condition 0959 * @return the start of the trailing substring according to the spanCondition; 0960 * the string length if the end of the string does not fit the spanCondition 0961 * @stable ICU 3.8 0962 * @see USetSpanCondition 0963 */ 0964 int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const; 0965 0966 /** 0967 * Returns the start of the substring of the input string according to the USetSpanCondition. 0968 * Same as <code>spanBack(s.getBuffer(), limit, spanCondition)</code> 0969 * after pinning limit to 0<=limit<=s.length(). 0970 * @param s the string 0971 * @param limit the exclusive-end index in the string for the span operation 0972 * (use s.length() or INT32_MAX for spanning back from the end of the string) 0973 * @param spanCondition specifies the containment condition 0974 * @return the start of the substring according to the spanCondition; 0975 * the substring s.tempSubStringBetween(start, limit) fulfills the spanCondition 0976 * @stable ICU 4.4 0977 * @see USetSpanCondition 0978 */ 0979 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const; 0980 0981 /** 0982 * Returns the length of the initial substring of the input string which 0983 * consists only of characters and strings that are contained in this set 0984 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 0985 * or only of characters and strings that are not contained 0986 * in this set (USET_SPAN_NOT_CONTAINED). 0987 * See USetSpanCondition for details. 0988 * Similar to the strspn() C library function. 0989 * Malformed byte sequences are treated according to contains(0xfffd). 0990 * This function works faster with a frozen set and with a non-negative string length argument. 
0991 * @param s start of the string (UTF-8) 0992 * @param length of the string; can be -1 for NUL-terminated 0993 * @param spanCondition specifies the containment condition 0994 * @return the length of the initial substring according to the spanCondition; 0995 * 0 if the start of the string does not fit the spanCondition 0996 * @stable ICU 3.8 0997 * @see USetSpanCondition 0998 */ 0999 int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const; 1000 1001 /** 1002 * Returns the start of the trailing substring of the input string which 1003 * consists only of characters and strings that are contained in this set 1004 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1005 * or only of characters and strings that are not contained 1006 * in this set (USET_SPAN_NOT_CONTAINED). 1007 * See USetSpanCondition for details. 1008 * Malformed byte sequences are treated according to contains(0xfffd). 1009 * This function works faster with a frozen set and with a non-negative string length argument. 1010 * @param s start of the string (UTF-8) 1011 * @param length of the string; can be -1 for NUL-terminated 1012 * @param spanCondition specifies the containment condition 1013 * @return the start of the trailing substring according to the spanCondition; 1014 * the string length if the end of the string does not fit the spanCondition 1015 * @stable ICU 3.8 1016 * @see USetSpanCondition 1017 */ 1018 int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const; 1019 1020 /** 1021 * Implement UnicodeMatcher::matches() 1022 * @stable ICU 2.4 1023 */ 1024 virtual UMatchDegree matches(const Replaceable& text, 1025 int32_t& offset, 1026 int32_t limit, 1027 UBool incremental) override; 1028 1029 private: 1030 /** 1031 * Returns the longest match for s in text at the given position. 1032 * If limit > start then match forward from start+1 to limit 1033 * matching all characters except s.charAt(0). 
If limit < start, 1034 * go backward starting from start-1 matching all characters 1035 * except s.charAt(s.length()-1). This method assumes that the 1036 * first character, text.charAt(start), matches s, so it does not 1037 * check it. 1038 * @param text the text to match 1039 * @param start the first character to match. In the forward 1040 * direction, text.charAt(start) is matched against s.charAt(0). 1041 * In the reverse direction, it is matched against 1042 * s.charAt(s.length()-1). 1043 * @param limit the limit offset for matching, either last+1 in 1044 * the forward direction, or last-1 in the reverse direction, 1045 * where last is the index of the last character to match. 1046 * @param s 1047 * @return If part of s matches up to the limit, return |limit - 1048 * start|. If all of s matches before reaching the limit, return 1049 * s.length(). If there is a mismatch between s and text, return 1050 * 0 1051 */ 1052 static int32_t matchRest(const Replaceable& text, 1053 int32_t start, int32_t limit, 1054 const UnicodeString& s); 1055 1056 /** 1057 * Returns the smallest value i such that c < list[i]. Caller 1058 * must ensure that c is a legal value or this method will enter 1059 * an infinite loop. This method performs a binary search. 1060 * @param c a character in the range MIN_VALUE..MAX_VALUE 1061 * inclusive 1062 * @return the smallest integer i in the range 0..len-1, 1063 * inclusive, such that c < list[i] 1064 */ 1065 int32_t findCodePoint(UChar32 c) const; 1066 1067 public: 1068 1069 /** 1070 * Implementation of UnicodeMatcher API. Union the set of all 1071 * characters that may be matched by this object into the given 1072 * set. 1073 * @param toUnionTo the set into which to union the source characters 1074 * @stable ICU 2.4 1075 */ 1076 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override; 1077 1078 /** 1079 * Returns the index of the given character within this set, where 1080 * the set is ordered by ascending code point. 
If the character 1081 * is not in this set, return -1. The inverse of this method is 1082 * <code>charAt()</code>. 1083 * @return an index from 0..size()-1, or -1 1084 * @stable ICU 2.4 1085 */ 1086 int32_t indexOf(UChar32 c) const; 1087 1088 /** 1089 * Returns the character at the given index within this set, where 1090 * the set is ordered by ascending code point. If the index is 1091 * out of range for characters, returns (UChar32)-1. 1092 * The inverse of this method is <code>indexOf()</code>. 1093 * 1094 * For iteration, this is slower than UnicodeSetIterator or 1095 * getRangeCount()/getRangeStart()/getRangeEnd(), 1096 * because for each call it skips linearly over <code>index</code> 1097 * characters in the ranges. 1098 * 1099 * @param index an index from 0..size()-1 1100 * @return the character at the given index, or (UChar32)-1. 1101 * @stable ICU 2.4 1102 */ 1103 UChar32 charAt(int32_t index) const; 1104 1105 /** 1106 * Adds the specified range to this set if it is not already 1107 * present. If this set already contains the specified range, 1108 * the call leaves this set unchanged. If <code>start > end</code> 1109 * then an empty range is added, leaving the set unchanged. 1110 * This is equivalent to a boolean logic OR, or a set UNION. 1111 * A frozen set will not be modified. 1112 * 1113 * @param start first character, inclusive, of range to be added 1114 * to this set. 1115 * @param end last character, inclusive, of range to be added 1116 * to this set. 1117 * @stable ICU 2.0 1118 */ 1119 virtual UnicodeSet& add(UChar32 start, UChar32 end); 1120 1121 /** 1122 * Adds the specified character to this set if it is not already 1123 * present. If this set already contains the specified character, 1124 * the call leaves this set unchanged. 1125 * A frozen set will not be modified. 
1126 * 1127 * @param c the character (code point) 1128 * @return this object, for chaining 1129 * @stable ICU 2.0 1130 */ 1131 UnicodeSet& add(UChar32 c); 1132 1133 /** 1134 * Adds the specified multicharacter to this set if it is not already 1135 * present. If this set already contains the multicharacter, 1136 * the call leaves this set unchanged. 1137 * Thus "ch" => {"ch"} 1138 * A frozen set will not be modified. 1139 * 1140 * @param s the source string 1141 * @return this object, for chaining 1142 * @stable ICU 2.4 1143 */ 1144 UnicodeSet& add(const UnicodeString& s); 1145 1146 private: 1147 /** 1148 * @return a code point IF the string consists of a single one. 1149 * otherwise returns -1. 1150 * @param s string to test 1151 */ 1152 static int32_t getSingleCP(const UnicodeString& s); 1153 1154 void _add(const UnicodeString& s); 1155 1156 public: 1157 /** 1158 * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"} 1159 * If this set already contains any particular character, it has no effect on that character. 1160 * A frozen set will not be modified. 1161 * @param s the source string 1162 * @return this object, for chaining 1163 * @stable ICU 2.4 1164 */ 1165 UnicodeSet& addAll(const UnicodeString& s); 1166 1167 /** 1168 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1169 * A frozen set will not be modified. 1170 * @param s the source string 1171 * @return this object, for chaining 1172 * @stable ICU 2.4 1173 */ 1174 UnicodeSet& retainAll(const UnicodeString& s); 1175 1176 /** 1177 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1178 * A frozen set will not be modified. 1179 * @param s the source string 1180 * @return this object, for chaining 1181 * @stable ICU 2.4 1182 */ 1183 UnicodeSet& complementAll(const UnicodeString& s); 1184 1185 /** 1186 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1187 * A frozen set will not be modified. 
1188 * @param s the source string 1189 * @return this object, for chaining 1190 * @stable ICU 2.4 1191 */ 1192 UnicodeSet& removeAll(const UnicodeString& s); 1193 1194 /** 1195 * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1196 * 1197 * @param s the source string 1198 * @return a newly created set containing the given string. 1199 * The caller owns the return object and is responsible for deleting it. 1200 * @stable ICU 2.4 1201 */ 1202 static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s); 1203 1204 1205 /** 1206 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1207 * @param s the source string 1208 * @return a newly created set containing the given characters 1209 * The caller owns the return object and is responsible for deleting it. 1210 * @stable ICU 2.4 1211 */ 1212 static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s); 1213 1214 /** 1215 * Retain only the elements in this set that are contained in the 1216 * specified range. If <code>start > end</code> then an empty range is 1217 * retained, leaving the set empty. This is equivalent to 1218 * a boolean logic AND, or a set INTERSECTION. 1219 * A frozen set will not be modified. 1220 * 1221 * @param start first character, inclusive, of range 1222 * @param end last character, inclusive, of range 1223 * @stable ICU 2.0 1224 */ 1225 virtual UnicodeSet& retain(UChar32 start, UChar32 end); 1226 1227 1228 /** 1229 * Retain the specified character from this set if it is present. 1230 * A frozen set will not be modified. 1231 * 1232 * @param c the character (code point) 1233 * @return this object, for chaining 1234 * @stable ICU 2.0 1235 */ 1236 UnicodeSet& retain(UChar32 c); 1237 1238 /** 1239 * Retains only the specified string from this set if it is present. 1240 * Upon return this set will be empty if it did not contain s, or 1241 * will only contain s if it did contain s. 1242 * A frozen set will not be modified. 
1243 * 1244 * @param s the source string 1245 * @return this object, for chaining 1246 * @stable ICU 69 1247 */ 1248 UnicodeSet& retain(const UnicodeString &s); 1249 1250 /** 1251 * Removes the specified range from this set if it is present. 1252 * The set will not contain the specified range once the call 1253 * returns. If <code>start > end</code> then an empty range is 1254 * removed, leaving the set unchanged. 1255 * A frozen set will not be modified. 1256 * 1257 * @param start first character, inclusive, of range to be removed 1258 * from this set. 1259 * @param end last character, inclusive, of range to be removed 1260 * from this set. 1261 * @stable ICU 2.0 1262 */ 1263 virtual UnicodeSet& remove(UChar32 start, UChar32 end); 1264 1265 /** 1266 * Removes the specified character from this set if it is present. 1267 * The set will not contain the specified character once the call 1268 * returns. 1269 * A frozen set will not be modified. 1270 * 1271 * @param c the character (code point) 1272 * @return this object, for chaining 1273 * @stable ICU 2.0 1274 */ 1275 UnicodeSet& remove(UChar32 c); 1276 1277 /** 1278 * Removes the specified string from this set if it is present. 1279 * The set will not contain the specified string once the call 1280 * returns. 1281 * A frozen set will not be modified. 1282 * @param s the source string 1283 * @return this object, for chaining 1284 * @stable ICU 2.4 1285 */ 1286 UnicodeSet& remove(const UnicodeString& s); 1287 1288 /** 1289 * This is equivalent to 1290 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1291 * 1292 * <strong>Note:</strong> This performs a symmetric difference with all code points 1293 * <em>and thus retains all multicharacter strings</em>. 1294 * In order to achieve a “code point complement” (all code points minus this set), 1295 * the easiest is to <code>.complement().removeAllStrings()</code>. 1296 * 1297 * A frozen set will not be modified. 
1298 * @stable ICU 2.0 1299 */ 1300 virtual UnicodeSet& complement(); 1301 1302 /** 1303 * Complements the specified range in this set. Any character in 1304 * the range will be removed if it is in this set, or will be 1305 * added if it is not in this set. If <code>start > end</code> 1306 * then an empty range is complemented, leaving the set unchanged. 1307 * This is equivalent to a boolean logic XOR. 1308 * A frozen set will not be modified. 1309 * 1310 * @param start first character, inclusive, of range 1311 * @param end last character, inclusive, of range 1312 * @stable ICU 2.0 1313 */ 1314 virtual UnicodeSet& complement(UChar32 start, UChar32 end); 1315 1316 /** 1317 * Complements the specified character in this set. The character 1318 * will be removed if it is in this set, or will be added if it is 1319 * not in this set. 1320 * A frozen set will not be modified. 1321 * 1322 * @param c the character (code point) 1323 * @return this object, for chaining 1324 * @stable ICU 2.0 1325 */ 1326 UnicodeSet& complement(UChar32 c); 1327 1328 /** 1329 * Complement the specified string in this set. 1330 * The string will be removed if it is in this set, or will be added if it is not in this set. 1331 * A frozen set will not be modified. 1332 * 1333 * @param s the string to complement 1334 * @return this object, for chaining 1335 * @stable ICU 2.4 1336 */ 1337 UnicodeSet& complement(const UnicodeString& s); 1338 1339 /** 1340 * Adds all of the elements in the specified set to this set if 1341 * they're not already present. This operation effectively 1342 * modifies this set so that its value is the <i>union</i> of the two 1343 * sets. The behavior of this operation is unspecified if the specified 1344 * collection is modified while the operation is in progress. 1345 * A frozen set will not be modified. 1346 * 1347 * @param c set whose elements are to be added to this set. 
1348 * @see #add(UChar32, UChar32) 1349 * @stable ICU 2.0 1350 */ 1351 virtual UnicodeSet& addAll(const UnicodeSet& c); 1352 1353 /** 1354 * Retains only the elements in this set that are contained in the 1355 * specified set. In other words, removes from this set all of 1356 * its elements that are not contained in the specified set. This 1357 * operation effectively modifies this set so that its value is 1358 * the <i>intersection</i> of the two sets. 1359 * A frozen set will not be modified. 1360 * 1361 * @param c set that defines which elements this set will retain. 1362 * @stable ICU 2.0 1363 */ 1364 virtual UnicodeSet& retainAll(const UnicodeSet& c); 1365 1366 /** 1367 * Removes from this set all of its elements that are contained in the 1368 * specified set. This operation effectively modifies this 1369 * set so that its value is the <i>asymmetric set difference</i> of 1370 * the two sets. 1371 * A frozen set will not be modified. 1372 * 1373 * @param c set that defines which elements will be removed from 1374 * this set. 1375 * @stable ICU 2.0 1376 */ 1377 virtual UnicodeSet& removeAll(const UnicodeSet& c); 1378 1379 /** 1380 * Complements in this set all elements contained in the specified 1381 * set. Any character in the other set will be removed if it is 1382 * in this set, or will be added if it is not in this set. 1383 * A frozen set will not be modified. 1384 * 1385 * @param c set that defines which elements will be xor'ed from 1386 * this set. 1387 * @stable ICU 2.4 1388 */ 1389 virtual UnicodeSet& complementAll(const UnicodeSet& c); 1390 1391 /** 1392 * Removes all of the elements from this set. This set will be 1393 * empty after this call returns. 1394 * A frozen set will not be modified. 1395 * @stable ICU 2.0 1396 */ 1397 virtual UnicodeSet& clear(void); 1398 1399 /** 1400 * Close this set over the given attribute. For the attribute 1401 * USET_CASE_INSENSITIVE, the result is to modify this set so that: 1402 * 1403 * 1. 
For each character or string 'a' in this set, all strings or 1404 * characters 'b' such that foldCase(a) == foldCase(b) are added 1405 * to this set. 1406 * 1407 * 2. For each string 'e' in the resulting set, if e != 1408 * foldCase(e), 'e' will be removed. 1409 * 1410 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] 1411 * 1412 * (Here foldCase(x) refers to the operation u_strFoldCase, and a 1413 * == b denotes that the contents are the same, not pointer 1414 * comparison.) 1415 * 1416 * A frozen set will not be modified. 1417 * 1418 * @param attribute bitmask for attributes to close over. 1419 * Valid options: 1420 * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE. 1421 * These case options are mutually exclusive. 1422 * Unrelated options bits are ignored. 1423 * @return a reference to this set. 1424 * @stable ICU 4.2 1425 */ 1426 UnicodeSet& closeOver(int32_t attribute); 1427 1428 /** 1429 * Remove all strings from this set. 1430 * 1431 * @return a reference to this set. 1432 * @stable ICU 4.2 1433 */ 1434 virtual UnicodeSet &removeAllStrings(); 1435 1436 /** 1437 * Iteration method that returns the number of ranges contained in 1438 * this set. 1439 * @see #getRangeStart 1440 * @see #getRangeEnd 1441 * @stable ICU 2.4 1442 */ 1443 virtual int32_t getRangeCount(void) const; 1444 1445 /** 1446 * Iteration method that returns the first character in the 1447 * specified range of this set. 1448 * @see #getRangeCount 1449 * @see #getRangeEnd 1450 * @stable ICU 2.4 1451 */ 1452 virtual UChar32 getRangeStart(int32_t index) const; 1453 1454 /** 1455 * Iteration method that returns the last character in the 1456 * specified range of this set. 1457 * @see #getRangeCount 1458 * @see #getRangeStart 1459 * @stable ICU 2.4 1460 */ 1461 virtual UChar32 getRangeEnd(int32_t index) const; 1462 1463 /** 1464 * Serializes this set into an array of 16-bit integers. 
Serialization 1465 * (currently) only records the characters in the set; multicharacter 1466 * strings are ignored. 1467 * 1468 * The array has following format (each line is one 16-bit 1469 * integer): 1470 * 1471 * length = (n+2*m) | (m!=0?0x8000:0) 1472 * bmpLength = n; present if m!=0 1473 * bmp[0] 1474 * bmp[1] 1475 * ... 1476 * bmp[n-1] 1477 * supp-high[0] 1478 * supp-low[0] 1479 * supp-high[1] 1480 * supp-low[1] 1481 * ... 1482 * supp-high[m-1] 1483 * supp-low[m-1] 1484 * 1485 * The array starts with a header. After the header are n bmp 1486 * code points, then m supplementary code points. Either n or m 1487 * or both may be zero. n+2*m is always <= 0x7FFF. 1488 * 1489 * If there are no supplementary characters (if m==0) then the 1490 * header is one 16-bit integer, 'length', with value n. 1491 * 1492 * If there are supplementary characters (if m!=0) then the header 1493 * is two 16-bit integers. The first, 'length', has value 1494 * (n+2*m)|0x8000. The second, 'bmpLength', has value n. 1495 * 1496 * After the header the code points are stored in ascending order. 1497 * Supplementary code points are stored as most significant 16 1498 * bits followed by least significant 16 bits. 1499 * 1500 * @param dest pointer to buffer of destCapacity 16-bit integers. 1501 * May be nullptr only if destCapacity is zero. 1502 * @param destCapacity size of dest, or zero. Must not be negative. 1503 * @param ec error code. Will be set to U_INDEX_OUTOFBOUNDS_ERROR 1504 * if n+2*m > 0x7FFF. Will be set to U_BUFFER_OVERFLOW_ERROR if 1505 * n+2*m+(m!=0?2:1) > destCapacity. 1506 * @return the total length of the serialized format, including 1507 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other 1508 * than U_BUFFER_OVERFLOW_ERROR. 
1509 * @stable ICU 2.4 1510 */ 1511 int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const; 1512 1513 /** 1514 * Reallocate this object's internal structures to take up the least 1515 * possible space, without changing this object's value. 1516 * A frozen set will not be modified. 1517 * @stable ICU 2.4 1518 */ 1519 virtual UnicodeSet& compact(); 1520 1521 /** 1522 * Return the class ID for this class. This is useful only for 1523 * comparing to a return value from getDynamicClassID(). For example: 1524 * <pre> 1525 * . Base* polymorphic_pointer = createPolymorphicObject(); 1526 * . if (polymorphic_pointer->getDynamicClassID() == 1527 * . Derived::getStaticClassID()) ... 1528 * </pre> 1529 * @return The class ID for all objects of this class. 1530 * @stable ICU 2.0 1531 */ 1532 static UClassID U_EXPORT2 getStaticClassID(void); 1533 1534 /** 1535 * Implement UnicodeFunctor API. 1536 * 1537 * @return The class ID for this object. All objects of a given 1538 * class have the same class ID. Objects of other classes have 1539 * different class IDs. 1540 * @stable ICU 2.4 1541 */ 1542 virtual UClassID getDynamicClassID(void) const override; 1543 1544 private: 1545 1546 // Private API for the USet API 1547 1548 friend class USetAccess; 1549 1550 const UnicodeString* getString(int32_t index) const; 1551 1552 //---------------------------------------------------------------- 1553 // RuleBasedTransliterator support 1554 //---------------------------------------------------------------- 1555 1556 private: 1557 1558 /** 1559 * Returns <tt>true</tt> if this set contains any character whose low byte 1560 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for 1561 * indexing. 
1562 */ 1563 virtual UBool matchesIndexValue(uint8_t v) const override; 1564 1565 private: 1566 friend class RBBIRuleScanner; 1567 1568 //---------------------------------------------------------------- 1569 // Implementation: Clone as thawed (see ICU4J Freezable) 1570 //---------------------------------------------------------------- 1571 1572 UnicodeSet(const UnicodeSet& o, UBool /* asThawed */); 1573 UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed); 1574 1575 //---------------------------------------------------------------- 1576 // Implementation: Pattern parsing 1577 //---------------------------------------------------------------- 1578 1579 void applyPatternIgnoreSpace(const UnicodeString& pattern, 1580 ParsePosition& pos, 1581 const SymbolTable* symbols, 1582 UErrorCode& status); 1583 1584 void applyPattern(RuleCharacterIterator& chars, 1585 const SymbolTable* symbols, 1586 UnicodeString& rebuiltPat, 1587 uint32_t options, 1588 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), 1589 int32_t depth, 1590 UErrorCode& ec); 1591 1592 void closeOverCaseInsensitive(bool simple); 1593 void closeOverAddCaseMappings(); 1594 1595 //---------------------------------------------------------------- 1596 // Implementation: Utility methods 1597 //---------------------------------------------------------------- 1598 1599 static int32_t nextCapacity(int32_t minCapacity); 1600 1601 bool ensureCapacity(int32_t newLen); 1602 1603 bool ensureBufferCapacity(int32_t newLen); 1604 1605 void swapBuffers(void); 1606 1607 UBool allocateStrings(UErrorCode &status); 1608 int32_t stringsSize() const; 1609 UBool stringsContains(const UnicodeString &s) const; 1610 1611 UnicodeString& _toPattern(UnicodeString& result, 1612 UBool escapeUnprintable) const; 1613 1614 UnicodeString& _generatePattern(UnicodeString& result, 1615 UBool escapeUnprintable) const; 1616 1617 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable); 1618 1619 static 
void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable); 1620 1621 static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end, 1622 UBool escapeUnprintable); 1623 1624 //---------------------------------------------------------------- 1625 // Implementation: Fundamental operators 1626 //---------------------------------------------------------------- 1627 1628 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity); 1629 1630 void add(const UChar32* other, int32_t otherLen, int8_t polarity); 1631 1632 void retain(const UChar32* other, int32_t otherLen, int8_t polarity); 1633 1634 /** 1635 * Return true if the given position, in the given pattern, appears 1636 * to be the start of a property set pattern [:foo:], \\p{foo}, or 1637 * \\P{foo}, or \\N{name}. 1638 */ 1639 static UBool resemblesPropertyPattern(const UnicodeString& pattern, 1640 int32_t pos); 1641 1642 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars, 1643 int32_t iterOpts); 1644 1645 /** 1646 * Parse the given property pattern at the given parse position 1647 * and set this UnicodeSet to the result. 1648 * 1649 * The original design document is out of date, but still useful. 1650 * Ignore the property and value names: 1651 * https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/unicodeset_properties.html 1652 * 1653 * Recognized syntax: 1654 * 1655 * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]" 1656 * \\p{foo} \\P{foo} - white space not allowed within "\\p" or "\\P" 1657 * \\N{name} - white space not allowed within "\\N" 1658 * 1659 * Other than the above restrictions, Unicode Pattern_White_Space characters are ignored. 1660 * Case is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading 1661 * and trailing space is deleted, and internal runs of whitespace 1662 * are collapsed to a single space. 
1663 * 1664 * We support binary properties, enumerated properties, and the 1665 * following non-enumerated properties: 1666 * 1667 * Numeric_Value 1668 * Name 1669 * Unicode_1_Name 1670 * 1671 * @param pattern the pattern string 1672 * @param ppos on entry, the position at which to begin parsing. 1673 * This should be one of the locations marked '^': 1674 * 1675 * [:blah:] \\p{blah} \\P{blah} \\N{name} 1676 * ^ % ^ % ^ % ^ % 1677 * 1678 * On return, the position after the last character parsed, that is, 1679 * the locations marked '%'. If the parse fails, ppos is returned 1680 * unchanged. 1681 * @param ec status 1682 * @return a reference to this. 1683 */ 1684 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern, 1685 ParsePosition& ppos, 1686 UErrorCode &ec); 1687 1688 void applyPropertyPattern(RuleCharacterIterator& chars, 1689 UnicodeString& rebuiltPat, 1690 UErrorCode& ec); 1691 1692 /** 1693 * A filter that returns true if the given code point should be 1694 * included in the UnicodeSet being constructed. 1695 */ 1696 typedef UBool (*Filter)(UChar32 codePoint, void* context); 1697 1698 /** 1699 * Given a filter, set this UnicodeSet to the code points 1700 * contained by that filter. The filter MUST be 1701 * property-conformant. That is, if it returns value v for one 1702 * code point, then it must return v for all affiliated code 1703 * points, as defined by the inclusions list. See 1704 * getInclusions(). 1705 * src is a UPropertySource value. 1706 */ 1707 void applyFilter(Filter filter, 1708 void* context, 1709 const UnicodeSet* inclusions, 1710 UErrorCode &status); 1711 1712 /** 1713 * Set the new pattern to cache. 1714 */ 1715 void setPattern(const UnicodeString& newPat) { 1716 setPattern(newPat.getBuffer(), newPat.length()); 1717 } 1718 void setPattern(const char16_t *newPat, int32_t newPatLen); 1719 /** 1720 * Release existing cached pattern. 
1721 */ 1722 void releasePattern(); 1723 1724 friend class UnicodeSetIterator; 1725 }; 1726 1727 1728 1729 inline bool UnicodeSet::operator!=(const UnicodeSet& o) const { 1730 return !operator==(o); 1731 } 1732 1733 inline UBool UnicodeSet::isFrozen() const { 1734 return (UBool)(bmpSet!=nullptr || stringSpan!=nullptr); 1735 } 1736 1737 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const { 1738 return !containsNone(start, end); 1739 } 1740 1741 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const { 1742 return !containsNone(s); 1743 } 1744 1745 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const { 1746 return !containsNone(s); 1747 } 1748 1749 inline UBool UnicodeSet::isBogus() const { 1750 return (UBool)(fFlags & kIsBogus); 1751 } 1752 1753 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) { 1754 return reinterpret_cast<UnicodeSet *>(uset); 1755 } 1756 1757 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) { 1758 return reinterpret_cast<const UnicodeSet *>(uset); 1759 } 1760 1761 inline USet *UnicodeSet::toUSet() { 1762 return reinterpret_cast<USet *>(this); 1763 } 1764 1765 inline const USet *UnicodeSet::toUSet() const { 1766 return reinterpret_cast<const USet *>(this); 1767 } 1768 1769 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const { 1770 int32_t sLength=s.length(); 1771 if(start<0) { 1772 start=0; 1773 } else if(start>sLength) { 1774 start=sLength; 1775 } 1776 return start+span(s.getBuffer()+start, sLength-start, spanCondition); 1777 } 1778 1779 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const { 1780 int32_t sLength=s.length(); 1781 if(limit<0) { 1782 limit=0; 1783 } else if(limit>sLength) { 1784 limit=sLength; 1785 } 1786 return spanBack(s.getBuffer(), limit, spanCondition); 1787 } 1788 1789 U_NAMESPACE_END 1790 1791 #endif /* U_SHOW_CPLUSPLUS_API */ 1792 1793 #endif
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |