include/unicode/uset.h

0001 // © 2016 and later: Unicode, Inc. and others.
0002 // License & terms of use: http://www.unicode.org/copyright.html
0003 /*
0004 *******************************************************************************
0005 *
0006 *   Copyright (C) 2002-2014, International Business Machines
0007 *   Corporation and others.  All Rights Reserved.
0008 *
0009 *******************************************************************************
0010 *   file name:  uset.h
0011 *   encoding:   UTF-8
0012 *   tab size:   8 (not used)
0013 *   indentation:4
0014 *
0015 *   created on: 2002mar07
0016 *   created by: Markus W. Scherer
0017 *
0018 *   C version of UnicodeSet.
0019 */
0020
0021
0022 /**
0023  * \file
0024  * \brief C API: Unicode Set
0025  *
0026  * <p>This is a C wrapper around the C++ UnicodeSet class.</p>
0027  */
0028
0029 #ifndef __USET_H__
0030 #define __USET_H__
0031
0032 #include "unicode/utypes.h"
0033 #include "unicode/uchar.h"
0034
0035 #if U_SHOW_CPLUSPLUS_API
0036 #include <string_view>
0037 #include "unicode/char16ptr.h"
0038 #include "unicode/localpointer.h"
0039 #include "unicode/unistr.h"
0040 #endif   // U_SHOW_CPLUSPLUS_API
0041
0042 #ifndef USET_DEFINED
0043
0044 #ifndef U_IN_DOXYGEN
0045 #define USET_DEFINED
0046 #endif
0047 /**
0048  * USet is the C API type corresponding to C++ class UnicodeSet.
0049  * Use the uset_* API to manipulate.  Create with
0050  * uset_open*, and destroy with uset_close.
0051  * @stable ICU 2.4
0052  */
0053 typedef struct USet USet;
0054 #endif
0055
0056 /**
0057  * Bitmask values to be passed to uset_openPatternOptions() or
0058  * uset_applyPattern() taking an option parameter.
0059  *
0060  * Use at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
0061  * These case options are mutually exclusive.
0062  *
0063  * Undefined options bits are ignored, and reserved for future use.
0064  *
0065  * @stable ICU 2.4
0066  */
0067 enum {
0068     /**
0069      * Ignore white space within patterns unless quoted or escaped.
0070      * @stable ICU 2.4
0071      */
0072     USET_IGNORE_SPACE = 1,
0073
0074     /**
0075      * Enable case insensitive matching.  E.g., "[ab]" with this flag
0076      * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
0077      * match all except 'a', 'A', 'b', and 'B'. This performs a full
0078      * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
0079      *
0080      * The resulting set is a superset of the input for the code points but
0081      * not for the strings.
0082      * It performs a case mapping closure of the code points and adds
0083      * full case folding strings for the code points, and reduces strings of
0084      * the original set to their full case folding equivalents.
0085      *
0086      * This is designed for case-insensitive matches, for example
0087      * in regular expressions. The full code point case closure allows checking of
0088      * an input character directly against the closure set.
0089      * Strings are matched by comparing the case-folded form from the closure
0090      * set with an incremental case folding of the string in question.
0091      *
0092      * The closure set will also contain single code points if the original
0093      * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
0094      * This is not necessary (that is, redundant) for the above matching method
0095      * but results in the same closure sets regardless of whether the original
0096      * set contained the code point or a string.
0097      *
0098      * @stable ICU 2.4
0099      */
0100     USET_CASE_INSENSITIVE = 2,
0101
0102     /**
0103      * Adds all case mappings for each element in the set.
0104      * This adds the full lower-, title-, and uppercase mappings as well as the full case folding
0105      * of each existing element in the set.
0106      *
0107      * Unlike the “case insensitive” options, this does not perform a closure.
0108      * For example, it does not add 'ſ' (U+017F long s) for 's',
0109      * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
0110      *
0111      * @stable ICU 3.2
0112      */
0113     USET_ADD_CASE_MAPPINGS = 4,
0114
0115     /**
0116      * Enable case insensitive matching.
0117      * Same as USET_CASE_INSENSITIVE but using only Simple_Case_Folding (scf) mappings,
0118      * which map each code point to one code point,
0119      * not full Case_Folding (cf) mappings, which map some code points to multiple code points.
0120      *
0121      * This is designed for case-insensitive matches, for example in certain
0122      * regular expression implementations where only Simple_Case_Folding mappings are used,
0123      * such as in ECMAScript (JavaScript) regular expressions.
0124      *
0125      * @stable ICU 73
0126      */
0127     USET_SIMPLE_CASE_INSENSITIVE = 6
0128 };
0129
0130 /**
0131  * Argument values for whether span() and similar functions continue while
0132  * the current character is contained vs. not contained in the set.
0133  *
0134  * The functionality is straightforward for sets with only single code points,
0135  * without strings (which is the common case):
0136  * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same.
0137  * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED.
0138  * - span() and spanBack() partition any string the same way when
0139  *   alternating between span(USET_SPAN_NOT_CONTAINED) and
0140  *   span(either "contained" condition).
0141  * - Using a complemented (inverted) set and the opposite span conditions
0142  *   yields the same results.
0143  *
0144  * When a set contains multi-code point strings, then these statements may not
0145  * be true, depending on the strings in the set (for example, whether they
0146  * overlap with each other) and the string that is processed.
0147  * For a set with strings:
0148  * - The complement of the set contains the opposite set of code points,
0149  *   but the same set of strings.
0150  *   Therefore, complementing both the set and the span conditions
0151  *   may yield different results.
0152  * - When starting spans at different positions in a string
0153  *   (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
0154  *   because a set string may start before the later position.
0155  * - span(USET_SPAN_SIMPLE) may be shorter than
0156  *   span(USET_SPAN_CONTAINED) because it will not recursively try
0157  *   all possible paths.
0158  *   For example, with a set which contains the three strings "xy", "xya" and "ax",
0159  *   span("xyax", USET_SPAN_CONTAINED) will return 4 but
0160  *   span("xyax", USET_SPAN_SIMPLE) will return 3.
0161  *   span(USET_SPAN_SIMPLE) will never be longer than
0162  *   span(USET_SPAN_CONTAINED).
0163  * - With either "contained" condition, span() and spanBack() may partition
0164  *   a string in different ways.
0165  *   For example, with a set which contains the two strings "ab" and "ba",
0166  *   and when processing the string "aba",
0167  *   span() will yield contained/not-contained boundaries of { 0, 2, 3 }
0168  *   while spanBack() will yield boundaries of { 0, 1, 3 }.
0169  *
0170  * Note: If it is important to get the same boundaries whether iterating forward
0171  * or backward through a string, then either only span() should be used and
0172  * the boundaries cached for backward operation, or an ICU BreakIterator
0173  * could be used.
0174  *
0175  * Note: Unpaired surrogates are treated like surrogate code points.
0176  * Similarly, set strings match only on code point boundaries,
0177  * never in the middle of a surrogate pair.
0178  * Illegal UTF-8 sequences are treated like U+FFFD.
0179  * When processing UTF-8 strings, malformed set strings
0180  * (strings with unpaired surrogates which cannot be converted to UTF-8)
0181  * are ignored.
0182  *
0183  * @stable ICU 3.8
0184  */
0185 typedef enum USetSpanCondition {
0186     /**
0187      * Continues a span() while there is no set element at the current position.
0188      * Increments by one code point at a time.
0189      * Stops before the first set element (character or string).
0190      * (For code points only, this is like while contains(current)==false).
0191      *
0192      * When span() returns, the substring between where it started and the position
0193      * it returned consists only of characters that are not in the set,
0194      * and none of its strings overlap with the span.
0195      *
0196      * @stable ICU 3.8
0197      */
0198     USET_SPAN_NOT_CONTAINED = 0,
0199     /**
0200      * Spans the longest substring that is a concatenation of set elements (characters or strings).
0201      * (For characters only, this is like while contains(current)==true).
0202      *
0203      * When span() returns, the substring between where it started and the position
0204      * it returned consists only of set elements (characters or strings) that are in the set.
0205      *
0206      * If a set contains strings, then the span will be the longest substring for which there
0207      * exists at least one non-overlapping concatenation of set elements (characters or strings).
0208      * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
0209      * (Java/ICU/Perl regex stops at the first match of an OR.)
0210      *
0211      * @stable ICU 3.8
0212      */
0213     USET_SPAN_CONTAINED = 1,
0214     /**
0215      * Continues a span() while there is a set element at the current position.
0216      * Increments by the longest matching element at each position.
0217      * (For characters only, this is like while contains(current)==true).
0218      *
0219      * When span() returns, the substring between where it started and the position
0220      * it returned consists only of set elements (characters or strings) that are in the set.
0221      *
0222      * If a set only contains single characters, then this is the same
0223      * as USET_SPAN_CONTAINED.
0224      *
0225      * If a set contains strings, then the span will be the longest substring
0226      * with a match at each position with the longest single set element (character or string).
0227      *
0228      * Use this span condition together with other longest-match algorithms,
0229      * such as ICU converters (ucnv_getUnicodeSet()).
0230      *
0231      * @stable ICU 3.8
0232      */
0233     USET_SPAN_SIMPLE = 2,
0234 #ifndef U_HIDE_DEPRECATED_API
0235     /**
0236      * One more than the last span condition.
0237      * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
0238      */
0239     USET_SPAN_CONDITION_COUNT
0240 #endif  // U_HIDE_DEPRECATED_API
0241 } USetSpanCondition;
0242
0243 enum {
0244     /**
0245      * Capacity of USerializedSet::staticArray.
0246      * Enough for any single-code point set.
0247      * Also provides padding for nice sizeof(USerializedSet).
0248      * @stable ICU 2.4
0249      */
0250     USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8
0251 };
0252
0253 /**
0254  * A serialized form of a Unicode set.  Limited manipulations are
0255  * possible directly on a serialized set.  See below.
0256  * @stable ICU 2.4
0257  */
0258 typedef struct USerializedSet {
0259     /**
0260      * The serialized Unicode Set.
0261      * @stable ICU 2.4
0262      */
0263     const uint16_t *array;
0264     /**
0265      * The length of the array that contains BMP characters.
0266      * @stable ICU 2.4
0267      */
0268     int32_t bmpLength;
0269     /**
0270      * The total length of the array.
0271      * @stable ICU 2.4
0272      */
0273     int32_t length;
0274     /**
0275      * A small buffer for the array to reduce memory allocations.
0276      * @stable ICU 2.4
0277      */
0278     uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY];
0279 } USerializedSet;
0280
0281 /*********************************************************************
0282  * USet API
0283  *********************************************************************/
0284
0285 /**
0286  * Create an empty USet object.
0287  * Equivalent to uset_open(1, 0).
0288  * @return a newly created USet.  The caller must call uset_close() on
0289  * it when done.
0290  * @stable ICU 4.2
0291  */
0292 U_CAPI USet* U_EXPORT2
0293 uset_openEmpty(void);
0294
0295 /**
0296  * Creates a USet object that contains the range of characters
0297  * start..end, inclusive.  If <code>start > end</code>
0298  * then an empty set is created (same as using uset_openEmpty()).
0299  * @param start first character of the range, inclusive
0300  * @param end last character of the range, inclusive
0301  * @return a newly created USet.  The caller must call uset_close() on
0302  * it when done.
0303  * @stable ICU 2.4
0304  */
0305 U_CAPI USet* U_EXPORT2
0306 uset_open(UChar32 start, UChar32 end);
0307
0308 /**
0309  * Creates a set from the given pattern.  See the UnicodeSet class
0310  * description for the syntax of the pattern language.
0311  * @param pattern a string specifying what characters are in the set
0312  * @param patternLength the length of the pattern, or -1 if null
0313  * terminated
0314  * @param ec the error code
0315  * @stable ICU 2.4
0316  */
0317 U_CAPI USet* U_EXPORT2
0318 uset_openPattern(const UChar* pattern, int32_t patternLength,
0319                  UErrorCode* ec);
0320
0321 /**
0322  * Creates a set from the given pattern.  See the UnicodeSet class
0323  * description for the syntax of the pattern language.
0324  * @param pattern a string specifying what characters are in the set
0325  * @param patternLength the length of the pattern, or -1 if null
0326  * terminated
0327  * @param options bitmask for options to apply to the pattern.
0328  * Valid options are USET_IGNORE_SPACE and
0329  * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
0330  * These case options are mutually exclusive.
0331  * @param ec the error code
0332  * @stable ICU 2.4
0333  */
0334 U_CAPI USet* U_EXPORT2
0335 uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
0336                  uint32_t options,
0337                  UErrorCode* ec);
0338
0339 /**
0340  * Disposes of the storage used by a USet object.  This function should
0341  * be called exactly once for each object returned by a uset_open*() function.
0342  * @param set the object to dispose of
0343  * @stable ICU 2.4
0344  */
0345 U_CAPI void U_EXPORT2
0346 uset_close(USet* set);
0347
0348 #if U_SHOW_CPLUSPLUS_API
0349
0350 U_NAMESPACE_BEGIN
0351
0352 /**
0353  * \class LocalUSetPointer
0354  * "Smart pointer" class, closes a USet via uset_close().
0355  * For most methods see the LocalPointerBase base class.
0356  *
0357  * @see LocalPointerBase
0358  * @see LocalPointer
0359  * @stable ICU 4.4
0360  */
0361 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close);
0362
0363 U_NAMESPACE_END
0364
0365 #endif
0366
0367 /**
0368  * Returns a copy of this object.
0369  * If this set is frozen, then the clone will be frozen as well.
0370  * Use uset_cloneAsThawed() for a mutable clone of a frozen set.
0371  * @param set the original set
0372  * @return the newly allocated copy of the set
0373  * @see uset_cloneAsThawed
0374  * @stable ICU 3.8
0375  */
0376 U_CAPI USet * U_EXPORT2
0377 uset_clone(const USet *set);
0378
0379 /**
0380  * Determines whether the set has been frozen (made immutable) or not.
0381  * See the ICU4J Freezable interface for details.
0382  * @param set the set
0383  * @return true/false for whether the set has been frozen
0384  * @see uset_freeze
0385  * @see uset_cloneAsThawed
0386  * @stable ICU 3.8
0387  */
0388 U_CAPI UBool U_EXPORT2
0389 uset_isFrozen(const USet *set);
0390
0391 /**
0392  * Freeze the set (make it immutable).
0393  * Once frozen, it cannot be unfrozen and is therefore thread-safe
0394  * until it is deleted.
0395  * See the ICU4J Freezable interface for details.
0396  * Freezing the set may also make some operations faster, for example
0397  * uset_contains() and uset_span().
0398  * A frozen set will not be modified. (It remains frozen.)
0399  * @param set the set to freeze; it is frozen in place
0400  * (this function is declared void, so nothing is returned)
0401  * @see uset_isFrozen
0402  * @see uset_cloneAsThawed
0403  * @stable ICU 3.8
0404  */
0405 U_CAPI void U_EXPORT2
0406 uset_freeze(USet *set);
0407
0408 /**
0409  * Clone the set and make the clone mutable.
0410  * See the ICU4J Freezable interface for details.
0411  * @param set the set
0412  * @return the mutable clone
0413  * @see uset_freeze
0414  * @see uset_isFrozen
0415  * @see uset_clone
0416  * @stable ICU 3.8
0417  */
0418 U_CAPI USet * U_EXPORT2
0419 uset_cloneAsThawed(const USet *set);
0420
0421 /**
0422  * Causes the USet object to represent the range <code>start - end</code>.
0423  * If <code>start > end</code> then this USet is set to an empty range.
0424  * A frozen set will not be modified.
0425  * @param set the object to set to the given range
0426  * @param start first character in the set, inclusive
0427  * @param end last character in the set, inclusive
0428  * @stable ICU 3.2
0429  */
0430 U_CAPI void U_EXPORT2
0431 uset_set(USet* set,
0432          UChar32 start, UChar32 end);
0433
0434 /**
0435  * Modifies the set to represent the set specified by the given
0436  * pattern. See the UnicodeSet class description for the syntax of
0437  * the pattern language. See also the User Guide chapter about UnicodeSet.
0438  * <em>Empties the set passed before applying the pattern.</em>
0439  * A frozen set will not be modified.
0440  * @param set               The set to which the pattern is to be applied.
0441  * @param pattern           A pointer to UChar string specifying what characters are in the set.
0442  *                          The character at pattern[0] must be a '['.
0443  * @param patternLength     The length of the UChar string. -1 if NUL-terminated.
0444  * @param options           A bitmask for options to apply to the pattern.
0445  *                          Valid options are USET_IGNORE_SPACE and
0446  *                          at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS,
0447  *                          USET_SIMPLE_CASE_INSENSITIVE.
0448  *                          These case options are mutually exclusive.
0449  * @param status            Returns an error if the pattern cannot be parsed.
0450  * @return                  Upon successful parse, the value is
0451  *                          the index of the character after the closing ']'
0452  *                          of the parsed pattern.
0453  *                          If the status code indicates failure, then the return value
0454  *                          is the index of the error in the source.
0455  *
0456  * @stable ICU 2.8
0457  */
0458 U_CAPI int32_t U_EXPORT2
0459 uset_applyPattern(USet *set,
0460                   const UChar *pattern, int32_t patternLength,
0461                   uint32_t options,
0462                   UErrorCode *status);
0463
0464 /**
0465  * Modifies the set to contain those code points which have the given value
0466  * for the given binary or enumerated property, as returned by
0467  * u_getIntPropertyValue.  Prior contents of this set are lost.
0468  * A frozen set will not be modified.
0469  *
0470  * @param set the object to contain the code points defined by the property
0471  *
0472  * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
0473  * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
0474  * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
0475  *
0476  * @param value a value in the range u_getIntPropertyMinValue(prop)..
0477  * u_getIntPropertyMaxValue(prop), with one exception.  If prop is
0478  * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
0479  * rather a mask value produced by U_GET_GC_MASK().  This allows grouped
0480  * categories such as [:L:] to be represented.
0481  *
0482  * @param ec error code input/output parameter
0483  *
0484  * @stable ICU 3.2
0485  */
0486 U_CAPI void U_EXPORT2
0487 uset_applyIntPropertyValue(USet* set,
0488                            UProperty prop, int32_t value, UErrorCode* ec);
0489
0490 /**
0491  * Modifies the set to contain those code points which have the
0492  * given value for the given property.  Prior contents of this
0493  * set are lost.
0494  * A frozen set will not be modified.
0495  *
0496  * @param set the object to contain the code points defined by the given
0497  * property and value alias
0498  *
0499  * @param prop a string specifying a property alias, either short or long.
0500  * The name is matched loosely.  See PropertyAliases.txt for names and a
0501  * description of loose matching.  If the value string is empty, then this
0502  * string is interpreted as either a General_Category value alias, a Script
0503  * value alias, a binary property alias, or a special ID.  Special IDs are
0504  * matched loosely and correspond to the following sets:
0505  *
0506  * "ANY" = [\\u0000-\\U0010FFFF],
0507  * "ASCII" = [\\u0000-\\u007F],
0508  * "Assigned" = [:^Cn:].
0509  *
0510  * @param propLength the length of the prop, or -1 if NUL-terminated
0511  *
0512  * @param value a string specifying a value alias, either short or long.
0513  * The name is matched loosely.  See PropertyValueAliases.txt for names
0514  * and a description of loose matching.  In addition to aliases listed,
0515  * numeric values and canonical combining classes may be expressed
0516  * numerically, e.g., ("nv", "0.5") or ("ccc", "220").  The value string
0517  * may also be empty.
0518  *
0519  * @param valueLength the length of the value, or -1 if NUL-terminated
0520  *
0521  * @param ec error code input/output parameter
0522  *
0523  * @stable ICU 3.2
0524  */
0525 U_CAPI void U_EXPORT2
0526 uset_applyPropertyAlias(USet* set,
0527                         const UChar *prop, int32_t propLength,
0528                         const UChar *value, int32_t valueLength,
0529                         UErrorCode* ec);
0530
0531 /**
0532  * Return true if the given position, in the given pattern, appears
0533  * to be the start of a UnicodeSet pattern.
0534  *
0535  * @param pattern a string specifying the pattern
0536  * @param patternLength the length of the pattern, or -1 if NUL-terminated
0537  * @param pos the given position
0538  * @stable ICU 3.2
0539  */
0540 U_CAPI UBool U_EXPORT2
0541 uset_resemblesPattern(const UChar *pattern, int32_t patternLength,
0542                       int32_t pos);
0543
0544 /**
0545  * Returns a string representation of this set.  If the result of
0546  * calling this function is passed to a uset_openPattern(), it
0547  * will produce another set that is equal to this one.
0548  * @param set the set
0549  * @param result the string to receive the rules, may be NULL
0550  * @param resultCapacity the capacity of result, may be 0 if result is NULL
0551  * @param escapeUnprintable if true then convert unprintable
0552  * characters to their hex escape representations, \\uxxxx or
0553  * \\Uxxxxxxxx.  Unprintable characters are those other than
0554  * U+000A, U+0020..U+007E.
0555  * @param ec error code.
0556  * @return length of string, possibly larger than resultCapacity
0557  * @stable ICU 2.4
0558  */
0559 U_CAPI int32_t U_EXPORT2
0560 uset_toPattern(const USet* set,
0561                UChar* result, int32_t resultCapacity,
0562                UBool escapeUnprintable,
0563                UErrorCode* ec);
0564
0565 /**
0566  * Adds the given character to the given USet.  After this call,
0567  * uset_contains(set, c) will return true.
0568  * A frozen set will not be modified.
0569  * @param set the object to which to add the character
0570  * @param c the character to add
0571  * @stable ICU 2.4
0572  */
0573 U_CAPI void U_EXPORT2
0574 uset_add(USet* set, UChar32 c);
0575
0576 /**
0577  * Adds all of the elements in the specified set to this set if
0578  * they're not already present.  This operation effectively
0579  * modifies this set so that its value is the <i>union</i> of the two
0580  * sets.  The behavior of this operation is unspecified if the specified
0581  * collection is modified while the operation is in progress.
0582  * A frozen set will not be modified.
0583  *
0584  * @param set the object to which to add the set
0585  * @param additionalSet the source set whose elements are to be added to this set.
0586  * @stable ICU 2.6
0587  */
0588 U_CAPI void U_EXPORT2
0589 uset_addAll(USet* set, const USet *additionalSet);
0590
0591 /**
0592  * Adds the given range of characters to the given USet.  After this call,
0593  * uset_contains(set, c) will return true for every c in start..end.
0594  * A frozen set will not be modified.
0595  * @param set the object to which to add the range
0596  * @param start the first character of the range to add, inclusive
0597  * @param end the last character of the range to add, inclusive
0598  * @stable ICU 2.2
0599  */
0600 U_CAPI void U_EXPORT2
0601 uset_addRange(USet* set, UChar32 start, UChar32 end);
0602
0603 /**
0604  * Adds the given string to the given USet.  After this call,
0605  * uset_containsString(set, str, strLen) will return true.
0606  * A frozen set will not be modified.
0607  * @param set the object to which to add the string
0608  * @param str the string to add
0609  * @param strLen the length of the string or -1 if null terminated.
0610  * @stable ICU 2.4
0611  */
0612 U_CAPI void U_EXPORT2
0613 uset_addString(USet* set, const UChar* str, int32_t strLen);
0614
0615 /**
0616  * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}.
0617  * If this set already contains any particular character, it has no effect on that character.
0618  * A frozen set will not be modified.
0619  * @param set the object to which to add the characters
0620  * @param str the source string
0621  * @param strLen the length of the string or -1 if null terminated.
0622  * @stable ICU 3.4
0623  */
0624 U_CAPI void U_EXPORT2
0625 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen);
0626
0627 /**
0628  * Removes the given character from the given USet.  After this call,
0629  * uset_contains(set, c) will return false.
0630  * A frozen set will not be modified.
0631  * @param set the object from which to remove the character
0632  * @param c the character to remove
0633  * @stable ICU 2.4
0634  */
0635 U_CAPI void U_EXPORT2
0636 uset_remove(USet* set, UChar32 c);
0637
0638 /**
0639  * Removes the given range of characters from the given USet.  After this call,
0640  * uset_contains(set, c) will return false for every c in start..end.
0641  * A frozen set will not be modified.
0642  * @param set the object from which to remove the range
0643  * @param start the first character of the range to remove, inclusive
0644  * @param end the last character of the range to remove, inclusive
0645  * @stable ICU 2.2
0646  */
0647 U_CAPI void U_EXPORT2
0648 uset_removeRange(USet* set, UChar32 start, UChar32 end);
0649
0650 /**
0651  * Removes the given string from the given USet.  After this call,
0652  * uset_containsString(set, str, strLen) will return false.
0653  * A frozen set will not be modified.
0654  * @param set the object from which to remove the string
0655  * @param str the string to remove
0656  * @param strLen the length of the string or -1 if null terminated.
0657  * @stable ICU 2.4
0658  */
0659 U_CAPI void U_EXPORT2
0660 uset_removeString(USet* set, const UChar* str, int32_t strLen);
0661
0662 /**
0663  * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"}
0664  * A frozen set will not be modified.
0665  *
0666  * @param set the object to be modified
0667  * @param str the string
0668  * @param length the length of the string, or -1 if NUL-terminated
0669  * @stable ICU 69
0670  */
0671 U_CAPI void U_EXPORT2
0672 uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length);
0673
0674 /**
0675  * Removes from this set all of its elements that are contained in the
0676  * specified set.  This operation effectively modifies this
0677  * set so that its value is the <i>asymmetric set difference</i> of
0678  * the two sets.
0679  * A frozen set will not be modified.
0680  * @param set the object from which the elements are to be removed
0681  * @param removeSet the object that defines which elements will be
0682  * removed from this set
0683  * @stable ICU 3.2
0684  */
0685 U_CAPI void U_EXPORT2
0686 uset_removeAll(USet* set, const USet* removeSet);
0687
0688 /**
0689  * Retain only the elements in this set that are contained in the
0690  * specified range.  If <code>start > end</code> then an empty range is
0691  * retained, leaving the set empty.  This is equivalent to
0692  * a boolean logic AND, or a set INTERSECTION.
0693  * A frozen set will not be modified.
0694  *
0695  * @param set the object for which to retain only the specified range
0696  * @param start first character, inclusive, of range
0697  * @param end last character, inclusive, of range
0698  * @stable ICU 3.2
0699  */
0700 U_CAPI void U_EXPORT2
0701 uset_retain(USet* set, UChar32 start, UChar32 end);
0702
0703 /**
0704  * Retains only the specified string from this set if it is present.
0705  * Upon return this set will be empty if it did not contain str, or
0706  * will only contain str if it did contain str.
0707  * A frozen set will not be modified.
0708  *
0709  * @param set the object to be modified
0710  * @param str the string
0711  * @param length the length of the string, or -1 if NUL-terminated
0712  * @stable ICU 69
0713  */
0714 U_CAPI void U_EXPORT2
0715 uset_retainString(USet *set, const UChar *str, int32_t length);
0716
0717 /**
0718  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
0719  * A frozen set will not be modified.
0720  *
0721  * @param set the object to be modified
0722  * @param str the string
0723  * @param length the length of the string, or -1 if NUL-terminated
0724  * @stable ICU 69
0725  */
0726 U_CAPI void U_EXPORT2
0727 uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length);
0728
0729 /**
0730  * Retains only the elements in this set that are contained in the
0731  * specified set.  In other words, removes from this set all of
0732  * its elements that are not contained in the specified set.  This
0733  * operation effectively modifies this set so that its value is
0734  * the <i>intersection</i> of the two sets.
0735  * A frozen set will not be modified.
0736  *
0737  * @param set the object on which to perform the retain
0738  * @param retain set that defines which elements this set will retain
0739  * @stable ICU 3.2
0740  */
0741 U_CAPI void U_EXPORT2
0742 uset_retainAll(USet* set, const USet* retain);
0743
0744 /**
0745  * Reallocate this object's internal structures to take up the least
0746  * possible space, without changing this object's value.
0747  * A frozen set will not be modified.
0748  *
0749  * @param set the object on which to perform the compact
0750  * @stable ICU 3.2
0751  */
0752 U_CAPI void U_EXPORT2
0753 uset_compact(USet* set);
0754
0755 /**
0756  * This is equivalent to
0757  * <code>uset_complementRange(set, 0, 0x10FFFF)</code>.
0758  *
0759  * <strong>Note:</strong> This performs a symmetric difference with all code points
0760  * <em>and thus retains all multicharacter strings</em>.
0761  * In order to achieve a “code point complement” (all code points minus this set),
0762  * the easiest is to <code>uset_complement(set); uset_removeAllStrings(set);</code>.
0763  *
0764  * A frozen set will not be modified.
0765  * @param set the set
0766  * @stable ICU 2.4
0767  */
0768 U_CAPI void U_EXPORT2
0769 uset_complement(USet* set);
0770
0771 /**
0772  * Complements the specified range in this set.  Any character in
0773  * the range will be removed if it is in this set, or will be
0774  * added if it is not in this set.  If <code>start > end</code>
0775  * then an empty range is complemented, leaving the set unchanged.
0776  * This is equivalent to a boolean logic XOR.
0777  * A frozen set will not be modified.
0778  *
0779  * @param set the object to be modified
0780  * @param start first character, inclusive, of range
0781  * @param end last character, inclusive, of range
0782  * @stable ICU 69
0783  */
0784 U_CAPI void U_EXPORT2
0785 uset_complementRange(USet *set, UChar32 start, UChar32 end);
0786
0787 /**
0788  * Complements the specified string in this set.
0789  * The string will be removed if it is in this set, or will be added if it is not in this set.
0790  * A frozen set will not be modified.
0791  *
0792  * @param set the object to be modified
0793  * @param str the string
0794  * @param length the length of the string, or -1 if NUL-terminated
0795  * @stable ICU 69
0796  */
0797 U_CAPI void U_EXPORT2
0798 uset_complementString(USet *set, const UChar *str, int32_t length);
0799
0800 /**
0801  * Complements EACH of the characters in this string. Note: "ch" == {"c", "h"}
0802  * A frozen set will not be modified.
0803  *
0804  * @param set the object to be modified
0805  * @param str the string
0806  * @param length the length of the string, or -1 if NUL-terminated
0807  * @stable ICU 69
0808  */
0809 U_CAPI void U_EXPORT2
0810 uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length);
0811
0812 /**
0813  * Complements in this set all elements contained in the specified
0814  * set.  Any character in the other set will be removed if it is
0815  * in this set, or will be added if it is not in this set.
0816  * A frozen set will not be modified.
0817  *
0818  * @param set the object on which to perform the complement
0819  * @param complement set that defines which elements will be xor'ed
0820  * from this set.
0821  * @stable ICU 3.2
0822  */
0823 U_CAPI void U_EXPORT2
0824 uset_complementAll(USet* set, const USet* complement);
0825
0826 /**
0827  * Removes all of the elements from this set.  This set will be
0828  * empty after this call returns.
0829  * A frozen set will not be modified.
0830  * @param set the set
0831  * @stable ICU 2.4
0832  */
0833 U_CAPI void U_EXPORT2
0834 uset_clear(USet* set);
0835
0836 /**
0837  * Close this set over the given attribute.  For the attribute
0838  * USET_CASE_INSENSITIVE, the result is to modify this set so that:
0839  *
0840  * 1. For each character or string 'a' in this set, all strings or
0841  * characters 'b' such that foldCase(a) == foldCase(b) are added
0842  * to this set.
0843  *
0844  * 2. For each string 'e' in the resulting set, if e !=
0845  * foldCase(e), 'e' will be removed.
0846  *
0847  * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
0848  *
0849  * (Here foldCase(x) refers to the operation u_strFoldCase, and a
0850  * == b denotes that the contents are the same, not pointer
0851  * comparison.)
0852  *
0853  * A frozen set will not be modified.
0854  *
0855  * @param set the set
0856  *
0857  * @param attributes bitmask for attributes to close over.
0858  * Valid options:
0859  * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
0860  * These case options are mutually exclusive.
0861  * Unrelated options bits are ignored.
0862  * @stable ICU 4.2
0863  */
0864 U_CAPI void U_EXPORT2
0865 uset_closeOver(USet* set, int32_t attributes);
0866
0867 /**
0868  * Remove all strings from this set.
0869  *
0870  * @param set the set
0871  * @stable ICU 4.2
0872  */
0873 U_CAPI void U_EXPORT2
0874 uset_removeAllStrings(USet* set);
0875
0876 /**
0877  * Returns true if the given USet contains no characters and no
0878  * strings.
0879  * @param set the set
0880  * @return true if set is empty
0881  * @stable ICU 2.4
0882  */
0883 U_CAPI UBool U_EXPORT2
0884 uset_isEmpty(const USet* set);
0885
0886 /**
0887  * @param set the set
0888  * @return true if this set contains multi-character strings or the empty string.
0889  * @stable ICU 70
0890  */
0891 U_CAPI UBool U_EXPORT2
0892 uset_hasStrings(const USet *set);
0893
0894 /**
0895  * Returns true if the given USet contains the given character.
0896  * This function works faster with a frozen set.
0897  * @param set the set
0898  * @param c The codepoint to check for within the set
0899  * @return true if set contains c
0900  * @stable ICU 2.4
0901  */
0902 U_CAPI UBool U_EXPORT2
0903 uset_contains(const USet* set, UChar32 c);
0904
0905 /**
0906  * Returns true if the given USet contains all characters c
0907  * where start <= c && c <= end.
0908  * @param set the set
0909  * @param start the first character of the range to test, inclusive
0910  * @param end the last character of the range to test, inclusive
0911  * @return true if set contains the range
0912  * @stable ICU 2.2
0913  */
0914 U_CAPI UBool U_EXPORT2
0915 uset_containsRange(const USet* set, UChar32 start, UChar32 end);
0916
0917 /**
0918  * Returns true if the given USet contains the given string.
0919  * @param set the set
0920  * @param str the string
0921  * @param strLen the length of the string, or -1 if NUL-terminated.
0922  * @return true if set contains str
0923  * @stable ICU 2.4
0924  */
0925 U_CAPI UBool U_EXPORT2
0926 uset_containsString(const USet* set, const UChar* str, int32_t strLen);
0927
0928 /**
0929  * Returns the index of the given character within this set, where
0930  * the set is ordered by ascending code point.  If the character
0931  * is not in this set, return -1.  The inverse of this function is
0932  * <code>uset_charAt()</code>.
0933  * @param set the set
0934  * @param c the character to obtain the index for
0935  * @return an index from 0..size()-1, or -1
0936  * @stable ICU 3.2
0937  */
0938 U_CAPI int32_t U_EXPORT2
0939 uset_indexOf(const USet* set, UChar32 c);
0940
0941 /**
0942  * Returns the character at the given index within this set, where
0943  * the set is ordered by ascending code point.  If the index is
0944  * out of range for characters, returns (UChar32)-1.
0945  * The inverse of this function is <code>uset_indexOf()</code>.
0946  *
0947  * For iteration, this is slower than uset_getRangeCount()/uset_getItemCount()
0948  * with uset_getItem(), because for each call it skips linearly over <code>index</code>
0949  * characters in the ranges.
0950  *
0951  * @param set the set
0952  * @param charIndex an index from 0..size()-1 to obtain the char for
0953  * @return the character at the given index, or (UChar32)-1.
0954  * @stable ICU 3.2
0955  */
0956 U_CAPI UChar32 U_EXPORT2
0957 uset_charAt(const USet* set, int32_t charIndex);
0958
0959 /**
0960  * Returns the number of characters and strings contained in this set.
0961  * The last uset_getStringCount() == (uset_getItemCount() - uset_getRangeCount()) items are strings.
0962  *
0963  * This is slower than uset_getRangeCount() and uset_getItemCount() because
0964  * it counts the code points of all ranges.
0965  *
0966  * @param set the set
0967  * @return a non-negative integer counting the characters and strings
0968  * contained in set
0969  * @stable ICU 2.4
0970  * @see uset_getRangeCount
0971  * @see uset_getStringCount
0972  * @see uset_getItemCount
0973  */
0974 U_CAPI int32_t U_EXPORT2
0975 uset_size(const USet* set);
0976
0977 /**
0978  * @param set the set
0979  * @return the number of ranges in this set.
0980  * @stable ICU 70
0981  * @see uset_getItemCount
0982  * @see uset_getItem
0983  * @see uset_getStringCount
0984  * @see uset_size
0985  */
0986 U_CAPI int32_t U_EXPORT2
0987 uset_getRangeCount(const USet *set);
0988
0989 #ifndef U_HIDE_DRAFT_API
0990
0991 /**
0992  * @param set the set
0993  * @return the number of strings in this set.
0994  * @draft ICU 76
0995  * @see uset_getRangeCount
0996  * @see uset_getItemCount
0997  * @see uset_size
0998  */
0999 U_CAPI int32_t U_EXPORT2
1000 uset_getStringCount(const USet *set);
1001
1002 /**
1003  * Returns the index-th string (empty or multi-character) in the set.
1004  * The string may not be NUL-terminated.
1005  * The output length must be used, and the caller must not read more than that many UChars.
1006  *
1007  * @param set the set
1008  * @param index the string index, 0 .. uset_getStringCount() - 1
1009  * @param pLength the output string length; must not be NULL
1010  * @return the pointer to the string; NULL if the index is out of range or pLength is NULL
1011  * @draft ICU 76
1012  * @see uset_getStringCount
1013  */
1014 U_CAPI const UChar* U_EXPORT2
1015 uset_getString(const USet *set, int32_t index, int32_t *pLength);
1016
1017 #endif  // U_HIDE_DRAFT_API
1018
1019 /**
1020  * Returns the number of items in this set.  An item is either a range
1021  * of characters or a single multicharacter string.
1022  * @param set the set
1023  * @return a non-negative integer counting the character ranges
1024  * and/or strings contained in set
1025  * @stable ICU 2.4
1026  * @see uset_getRangeCount
1027  * @see uset_getStringCount
1028  */
1029 U_CAPI int32_t U_EXPORT2
1030 uset_getItemCount(const USet* set);
1031
1032 /**
1033  * Returns an item of this set.  An item is either a range of
1034  * characters or a single multicharacter string (which can be the empty string).
1035  *
1036  * If <code>itemIndex</code> is less than uset_getRangeCount(), then this function returns 0,
1037  * and the range is <code>*start</code>..<code>*end</code>.
1038  *
1039  * If <code>itemIndex</code> is at least uset_getRangeCount() and less than uset_getItemCount(), then
1040  * this function copies the string into <code>str[strCapacity]</code> and
1041  * returns the length of the string (0 for the empty string).
1042  * See uset_getString() for a function that does not copy the string contents.
1043  *
1044  * If <code>itemIndex</code> is out of range, then this function returns -1.
1045  *
1046  * Note that 0 is returned for each range as well as for the empty string.
1047  *
1048  * @param set the set
1049  * @param itemIndex a non-negative integer in the range 0..uset_getItemCount(set)-1
1050  * @param start pointer to variable to receive first character in range, inclusive;
1051  *              can be NULL for a string item
1052  * @param end pointer to variable to receive last character in range, inclusive;
1053  *            can be NULL for a string item
1054  * @param str buffer to receive the string, may be NULL
1055  * @param strCapacity capacity of str, or 0 if str is NULL
1056  * @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range
1057  * @return the length of the string (0 or >= 2), or 0 if the item is a range,
1058  *         or -1 if the itemIndex is out of range
1059  * @stable ICU 2.4
1060  * @see uset_getString
1061  */
1062 U_CAPI int32_t U_EXPORT2
1063 uset_getItem(const USet* set, int32_t itemIndex,
1064              UChar32* start, UChar32* end,
1065              UChar* str, int32_t strCapacity,
1066              UErrorCode* ec);
1067
1068 /**
1069  * Returns true if set1 contains all the characters and strings
1070  * of set2. It answers the question, 'Is set1 a superset of set2?'
1071  * @param set1 the candidate superset
1072  * @param set2 the set whose characters and strings must all be in set1
1073  * @return true if the test condition is met
1074  * @stable ICU 3.2
1075  */
1076 U_CAPI UBool U_EXPORT2
1077 uset_containsAll(const USet* set1, const USet* set2);
1078
1079 /**
1080  * Returns true if this set contains all the characters
1081  * of the given string. Unlike uset_containsString, this does not check
1082  * containment of grapheme clusters.
1083  * @param set set of characters to be checked for containment
1084  * @param str string containing codepoints to be checked for containment
1085  * @param strLen the length of the string, or -1 if NUL-terminated.
1086  * @return true if the test condition is met
1087  * @stable ICU 3.4
1088  */
1089 U_CAPI UBool U_EXPORT2
1090 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen);
1091
1092 /**
1093  * Returns true if set1 contains none of the characters and strings
1094  * of set2. It answers the question, 'Are set1 and set2 disjoint?'
1095  * @param set1 set to be checked for containment
1096  * @param set2 set to be checked for containment
1097  * @return true if the test condition is met
1098  * @stable ICU 3.2
1099  */
1100 U_CAPI UBool U_EXPORT2
1101 uset_containsNone(const USet* set1, const USet* set2);
1102
1103 /**
1104  * Returns true if set1 contains some of the characters and strings
1105  * of set2. It answers the question, 'Do set1 and set2 have an intersection?'
1106  * @param set1 set to be checked for containment
1107  * @param set2 set to be checked for containment
1108  * @return true if the test condition is met
1109  * @stable ICU 3.2
1110  */
1111 U_CAPI UBool U_EXPORT2
1112 uset_containsSome(const USet* set1, const USet* set2);
1113
1114 /**
1115  * Returns the length of the initial substring of the input string which
1116  * consists only of characters and strings that are contained in this set
1117  * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1118  * or only of characters and strings that are not contained
1119  * in this set (USET_SPAN_NOT_CONTAINED).
1120  * See USetSpanCondition for details.
1121  * Similar to the strspn() C library function.
1122  * Unpaired surrogates are treated according to contains() of their surrogate code points.
1123  * This function works faster with a frozen set and with a non-negative string length argument.
1124  * @param set the set
1125  * @param s start of the string
1126  * @param length of the string; can be -1 for NUL-terminated
1127  * @param spanCondition specifies the containment condition
1128  * @return the length of the initial substring according to the spanCondition;
1129  *         0 if the start of the string does not fit the spanCondition
1130  * @stable ICU 3.8
1131  * @see USetSpanCondition
1132  */
1133 U_CAPI int32_t U_EXPORT2
1134 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
1135
1136 /**
1137  * Returns the start of the trailing substring of the input string which
1138  * consists only of characters and strings that are contained in this set
1139  * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1140  * or only of characters and strings that are not contained
1141  * in this set (USET_SPAN_NOT_CONTAINED).
1142  * See USetSpanCondition for details.
1143  * Unpaired surrogates are treated according to contains() of their surrogate code points.
1144  * This function works faster with a frozen set and with a non-negative string length argument.
1145  * @param set the set
1146  * @param s start of the string
1147  * @param length of the string; can be -1 for NUL-terminated
1148  * @param spanCondition specifies the containment condition
1149  * @return the start of the trailing substring according to the spanCondition;
1150  *         the string length if the end of the string does not fit the spanCondition
1151  * @stable ICU 3.8
1152  * @see USetSpanCondition
1153  */
1154 U_CAPI int32_t U_EXPORT2
1155 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
1156
1157 /**
1158  * Returns the length of the initial substring of the input string which
1159  * consists only of characters and strings that are contained in this set
1160  * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1161  * or only of characters and strings that are not contained
1162  * in this set (USET_SPAN_NOT_CONTAINED).
1163  * See USetSpanCondition for details.
1164  * Similar to the strspn() C library function.
1165  * Malformed byte sequences are treated according to contains(0xfffd).
1166  * This function works faster with a frozen set and with a non-negative string length argument.
1167  * @param set the set
1168  * @param s start of the string (UTF-8)
1169  * @param length of the string; can be -1 for NUL-terminated
1170  * @param spanCondition specifies the containment condition
1171  * @return the length of the initial substring according to the spanCondition;
1172  *         0 if the start of the string does not fit the spanCondition
1173  * @stable ICU 3.8
1174  * @see USetSpanCondition
1175  */
1176 U_CAPI int32_t U_EXPORT2
1177 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
1178
1179 /**
1180  * Returns the start of the trailing substring of the input string which
1181  * consists only of characters and strings that are contained in this set
1182  * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1183  * or only of characters and strings that are not contained
1184  * in this set (USET_SPAN_NOT_CONTAINED).
1185  * See USetSpanCondition for details.
1186  * Malformed byte sequences are treated according to contains(0xfffd).
1187  * This function works faster with a frozen set and with a non-negative string length argument.
1188  * @param set the set
1189  * @param s start of the string (UTF-8)
1190  * @param length of the string; can be -1 for NUL-terminated
1191  * @param spanCondition specifies the containment condition
1192  * @return the start of the trailing substring according to the spanCondition;
1193  *         the string length if the end of the string does not fit the spanCondition
1194  * @stable ICU 3.8
1195  * @see USetSpanCondition
1196  */
1197 U_CAPI int32_t U_EXPORT2
1198 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
1199
1200 /**
1201  * Returns true if set1 contains all of the characters and strings
1202  * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?'
1203  * @param set1 set to be checked for containment
1204  * @param set2 set to be checked for containment
1205  * @return true if the test condition is met
1206  * @stable ICU 3.2
1207  */
1208 U_CAPI UBool U_EXPORT2
1209 uset_equals(const USet* set1, const USet* set2);
1210
1211 /*********************************************************************
1212  * Serialized set API
1213  *********************************************************************/
1214
1215 /**
1216  * Serializes this set into an array of 16-bit integers.  Serialization
1217  * (currently) only records the characters in the set; multicharacter
1218  * strings are ignored.
1219  *
1220  * The array
1221  * has the following format (each line is one 16-bit integer):
1222  *
1223  *  length     = (n+2*m) | (m!=0?0x8000:0)
1224  *  bmpLength  = n; present if m!=0
1225  *  bmp[0]
1226  *  bmp[1]
1227  *  ...
1228  *  bmp[n-1]
1229  *  supp-high[0]
1230  *  supp-low[0]
1231  *  supp-high[1]
1232  *  supp-low[1]
1233  *  ...
1234  *  supp-high[m-1]
1235  *  supp-low[m-1]
1236  *
1237  * The array starts with a header.  After the header are n bmp
1238  * code points, then m supplementary code points.  Either n or m
1239  * or both may be zero.  n+2*m is always <= 0x7FFF.
1240  *
1241  * If there are no supplementary characters (if m==0) then the
1242  * header is one 16-bit integer, 'length', with value n.
1243  *
1244  * If there are supplementary characters (if m!=0) then the header
1245  * is two 16-bit integers.  The first, 'length', has value
1246  * (n+2*m)|0x8000.  The second, 'bmpLength', has value n.
1247  *
1248  * After the header the code points are stored in ascending order.
1249  * Supplementary code points are stored as most significant 16
1250  * bits followed by least significant 16 bits.
1251  *
1252  * @param set the set
1253  * @param dest pointer to buffer of destCapacity 16-bit integers.
1254  * May be NULL only if destCapacity is zero.
1255  * @param destCapacity size of dest, or zero.  Must not be negative.
1256  * @param pErrorCode pointer to the error code.  Will be set to
1257  * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF.  Will be set to
1258  * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity.
1259  * @return the total length of the serialized format, including
1260  * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
1261  * than U_BUFFER_OVERFLOW_ERROR.
1262  * @stable ICU 2.4
1263  */
1264 U_CAPI int32_t U_EXPORT2
1265 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode);
1266
1267 /**
1268  * Given a serialized array, fill in the given serialized set object.
1269  * @param fillSet pointer to result
1270  * @param src pointer to start of array
1271  * @param srcLength length of array
1272  * @return true if the given array is valid, otherwise false
1273  * @stable ICU 2.4
1274  */
1275 U_CAPI UBool U_EXPORT2
1276 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength);
1277
1278 /**
1279  * Set the USerializedSet to contain the given character (and nothing
1280  * else).
1281  * @param fillSet pointer to result
1282  * @param c The codepoint to set
1283  * @stable ICU 2.4
1284  */
1285 U_CAPI void U_EXPORT2
1286 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c);
1287
1288 /**
1289  * Returns true if the given USerializedSet contains the given
1290  * character.
1291  * @param set the serialized set
1292  * @param c The codepoint to check for within the set
1293  * @return true if set contains c
1294  * @stable ICU 2.4
1295  */
1296 U_CAPI UBool U_EXPORT2
1297 uset_serializedContains(const USerializedSet* set, UChar32 c);
1298
1299 /**
1300  * Returns the number of disjoint ranges of characters contained in
1301  * the given serialized set.  Ignores any strings contained in the
1302  * set.
1303  * @param set the serialized set
1304  * @return a non-negative integer counting the character ranges
1305  * contained in set
1306  * @stable ICU 2.4
1307  */
1308 U_CAPI int32_t U_EXPORT2
1309 uset_getSerializedRangeCount(const USerializedSet* set);
1310
1311 /**
1312  * Returns a range of characters contained in the given serialized
1313  * set.
1314  * @param set the serialized set
1315  * @param rangeIndex a non-negative integer in the range 0..
1316  * uset_getSerializedRangeCount(set)-1
1317  * @param pStart pointer to variable to receive first character
1318  * in range, inclusive
1319  * @param pEnd pointer to variable to receive last character in range,
1320  * inclusive
1321  * @return true if rangeIndex is valid, otherwise false
1322  * @stable ICU 2.4
1323  */
1324 U_CAPI UBool U_EXPORT2
1325 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
1326                         UChar32* pStart, UChar32* pEnd);
1327
1328 #if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
1329 #ifndef U_HIDE_DRAFT_API
1330
1331 namespace U_HEADER_ONLY_NAMESPACE {
1332
1333 // Note: Not U_COMMON_API, and not a subclass of UMemory, because this is a header-only class,
1334 // not intended to be used via export from the ICU DLL.
1335
1336 /**
1337  * Iterator returned by USetCodePoints.
1338  * @draft ICU 76
1339  */
1340 class USetCodePointIterator {
1341 public:
1342     /** @draft ICU 76 */
1343     USetCodePointIterator(const USetCodePointIterator &other) = default;
1344
1345     /** @draft ICU 76 */
1346     bool operator==(const USetCodePointIterator &other) const {
1347         // No need to compare rangeCount & end given private constructor
1348         // and assuming we don't compare iterators across the set being modified.
1349         // And comparing rangeIndex is redundant with comparing c.
1350         // We might even skip comparing uset.
1351         // Unless we want operator==() to be "correct" for more than iteration.
1352         return uset == other.uset && c == other.c;
1353     }
1354
1355     /** @draft ICU 76 */
1356     bool operator!=(const USetCodePointIterator &other) const { return !operator==(other); }
1357
1358     /** @draft ICU 76 */
1359     UChar32 operator*() const { return c; }
1360
1361     /**
1362      * Pre-increment.
1363      * @draft ICU 76
1364      */
1365     USetCodePointIterator &operator++() {
1366         if (c < end) {
1367             ++c;
1368         } else if (rangeIndex < rangeCount) {
1369             UErrorCode errorCode = U_ZERO_ERROR;
1370             int32_t result = uset_getItem(uset, rangeIndex, &c, &end, nullptr, 0, &errorCode);
1371             if (U_SUCCESS(errorCode) && result == 0) {
1372                 ++rangeIndex;
1373             } else {
1374                 c = end = U_SENTINEL;
1375             }
1376         } else {
1377             c = end = U_SENTINEL;
1378         }
1379         return *this;
1380     }
1381
1382     /**
1383      * Post-increment.
1384      * @draft ICU 76
1385      */
1386     USetCodePointIterator operator++(int) {
1387         USetCodePointIterator result(*this);
1388         operator++();
1389         return result;
1390     }
1391
1392 private:
1393     friend class USetCodePoints;
1394
1395     USetCodePointIterator(const USet *uset, int32_t rangeIndex, int32_t rangeCount)
1396             : uset(uset), rangeIndex(rangeIndex), rangeCount(rangeCount),
1397                 c(U_SENTINEL), end(U_SENTINEL) {
1398         // Fetch the first range.
1399         operator++();
1400     }
1401
1402     const USet *uset;
1403     int32_t rangeIndex;
1404     int32_t rangeCount;
1405     UChar32 c, end;
1406 };
1407
1408 /**
1409  * C++ "range" for iterating over the code points of a USet.
1410  *
1411  * \code
1412  * using U_HEADER_NESTED_NAMESPACE::USetCodePoints;
1413  * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, &errorCode));
1414  * for (UChar32 c : USetCodePoints(uset.getAlias())) {
1415  *     printf("uset.codePoint U+%04lx\n", (long)c);
1416  * }
1417  * \endcode
1418  *
1419  * C++ UnicodeSet has member functions for iteration, including codePoints().
1420  *
1421  * @draft ICU 76
1422  * @see USetRanges
1423  * @see USetStrings
1424  * @see USetElements
1425  */
1426 class USetCodePoints {
1427 public:
1428     /**
1429      * Constructs a C++ "range" object over the code points of the USet.
1430      * @draft ICU 76
1431      */
1432     USetCodePoints(const USet *uset) : uset(uset), rangeCount(uset_getRangeCount(uset)) {}
1433
1434     /** @draft ICU 76 */
1435     USetCodePoints(const USetCodePoints &other) = default;
1436
1437     /** @draft ICU 76 */
1438     USetCodePointIterator begin() const {
1439         return USetCodePointIterator(uset, 0, rangeCount);
1440     }
1441
1442     /** @draft ICU 76 */
1443     USetCodePointIterator end() const {
1444         return USetCodePointIterator(uset, rangeCount, rangeCount);
1445     }
1446
1447 private:
1448     const USet *uset;
1449     int32_t rangeCount;
1450 };
1451
/**
 * One contiguous range of code points from a USet/UnicodeSet.
 * This is what dereferencing a USetRangeIterator (from USetRanges) yields.
 * rangeStart and rangeEnd both belong to the range.
 * (end() returns an iterator corresponding to rangeEnd+1.)
 * @draft ICU 76
 */
struct CodePointRange {
    /** @draft ICU 76 */
    struct iterator {
        /** @draft ICU 76 */
        iterator(UChar32 c) : c(c) {}

        /** @draft ICU 76 */
        bool operator==(const iterator &other) const { return c == other.c; }
        /** @draft ICU 76 */
        bool operator!=(const iterator &other) const { return c != other.c; }

        /** @draft ICU 76 */
        UChar32 operator*() const { return c; }

        /**
         * Pre-increment.
         * @draft ICU 76
         */
        iterator &operator++() {
            c += 1;
            return *this;
        }

        /**
         * Post-increment.
         * @draft ICU 76
         */
        iterator operator++(int) {
            iterator previous(c);
            ++c;
            return previous;
        }

        /**
         * The current code point in the range.
         * @draft ICU 76
         */
        UChar32 c;
    };

    /** @draft ICU 76 */
    CodePointRange(UChar32 start, UChar32 end) : rangeStart(start), rangeEnd(end) {}
    /** @draft ICU 76 */
    CodePointRange(const CodePointRange &other) = default;
    /** @draft ICU 76 */
    size_t size() const {
        // Both ends are inclusive, hence the +1 before subtracting the start.
        const UChar32 count = (rangeEnd + 1) - rangeStart;
        return count;
    }
    /** @draft ICU 76 */
    iterator begin() const { return iterator(rangeStart); }
    /** @draft ICU 76 */
    iterator end() const { return iterator(rangeEnd + 1); }

    /**
     * Start of a USet/UnicodeSet range of code points.
     * @draft ICU 76
     */
    UChar32 rangeStart;
    /**
     * Inclusive end of a USet/UnicodeSet range of code points.
     * @draft ICU 76
     */
    UChar32 rangeEnd;
};
1519
1520 /**
1521  * Iterator returned by USetRanges.
1522  * @draft ICU 76
1523  */
1524 class USetRangeIterator {
1525 public:
1526     /** @draft ICU 76 */
1527     USetRangeIterator(const USetRangeIterator &other) = default;
1528
1529     /** @draft ICU 76 */
1530     bool operator==(const USetRangeIterator &other) const {
1531         // No need to compare rangeCount given private constructor
1532         // and assuming we don't compare iterators across the set being modified.
1533         // We might even skip comparing uset.
1534         // Unless we want operator==() to be "correct" for more than iteration.
1535         return uset == other.uset && rangeIndex == other.rangeIndex;
1536     }
1537
1538     /** @draft ICU 76 */
1539     bool operator!=(const USetRangeIterator &other) const { return !operator==(other); }
1540
1541     /** @draft ICU 76 */
1542     CodePointRange operator*() const {
1543         if (rangeIndex < rangeCount) {
1544             UChar32 start, end;
1545             UErrorCode errorCode = U_ZERO_ERROR;
1546             int32_t result = uset_getItem(uset, rangeIndex, &start, &end, nullptr, 0, &errorCode);
1547             if (U_SUCCESS(errorCode) && result == 0) {
1548                 return CodePointRange(start, end);
1549             }
1550         }
1551         return CodePointRange(U_SENTINEL, U_SENTINEL);
1552     }
1553
1554     /**
1555      * Pre-increment.
1556      * @draft ICU 76
1557      */
1558     USetRangeIterator &operator++() {
1559         ++rangeIndex;
1560         return *this;
1561     }
1562
1563     /**
1564      * Post-increment.
1565      * @draft ICU 76
1566      */
1567     USetRangeIterator operator++(int) {
1568         USetRangeIterator result(*this);
1569         ++rangeIndex;
1570         return result;
1571     }
1572
1573 private:
1574     friend class USetRanges;
1575
1576     USetRangeIterator(const USet *uset, int32_t rangeIndex, int32_t rangeCount)
1577             : uset(uset), rangeIndex(rangeIndex), rangeCount(rangeCount) {}
1578
1579     const USet *uset;
1580     int32_t rangeIndex;
1581     int32_t rangeCount;
1582 };
1583
1584 /**
1585  * C++ "range" for iterating over the code point ranges of a USet.
1586  *
1587  * \code
1588  * using U_HEADER_NESTED_NAMESPACE::USetRanges;
1589  * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, &errorCode));
1590  * for (auto [start, end] : USetRanges(uset.getAlias())) {
1591  *     printf("uset.range U+%04lx..U+%04lx\n", (long)start, (long)end);
1592  * }
1593  * for (auto range : USetRanges(uset.getAlias())) {
1594  *     for (UChar32 c : range) {
1595  *         printf("uset.range.c U+%04lx\n", (long)c);
1596  *     }
1597  * }
1598  * \endcode
1599  *
1600  * C++ UnicodeSet has member functions for iteration, including ranges().
1601  *
1602  * @draft ICU 76
1603  * @see USetCodePoints
1604  * @see USetStrings
1605  * @see USetElements
1606  */
1607 class USetRanges {
1608 public:
1609     /**
1610      * Constructs a C++ "range" object over the code point ranges of the USet.
1611      * @draft ICU 76
1612      */
1613     USetRanges(const USet *uset) : uset(uset), rangeCount(uset_getRangeCount(uset)) {}
1614
1615     /** @draft ICU 76 */
1616     USetRanges(const USetRanges &other) = default;
1617
1618     /** @draft ICU 76 */
1619     USetRangeIterator begin() const {
1620         return USetRangeIterator(uset, 0, rangeCount);
1621     }
1622
1623     /** @draft ICU 76 */
1624     USetRangeIterator end() const {
1625         return USetRangeIterator(uset, rangeCount, rangeCount);
1626     }
1627
1628 private:
1629     const USet *uset;
1630     int32_t rangeCount;
1631 };
1632
1633 /**
1634  * Iterator returned by USetStrings.
1635  * @draft ICU 76
1636  */
1637 class USetStringIterator {
1638 public:
1639     /** @draft ICU 76 */
1640     USetStringIterator(const USetStringIterator &other) = default;
1641
1642     /** @draft ICU 76 */
1643     bool operator==(const USetStringIterator &other) const {
1644         // No need to compare count given private constructor
1645         // and assuming we don't compare iterators across the set being modified.
1646         // We might even skip comparing uset.
1647         // Unless we want operator==() to be "correct" for more than iteration.
1648         return uset == other.uset && index == other.index;
1649     }
1650
1651     /** @draft ICU 76 */
1652     bool operator!=(const USetStringIterator &other) const { return !operator==(other); }
1653
1654     /** @draft ICU 76 */
1655     std::u16string_view operator*() const {
1656         if (index < count) {
1657             int32_t length;
1658             const UChar *uchars = uset_getString(uset, index, &length);
1659             // assert uchars != nullptr;
1660             return {ConstChar16Ptr(uchars), static_cast<uint32_t>(length)};
1661         }
1662         return {};
1663     }
1664
1665     /**
1666      * Pre-increment.
1667      * @draft ICU 76
1668      */
1669     USetStringIterator &operator++() {
1670         ++index;
1671         return *this;
1672     }
1673
1674     /**
1675      * Post-increment.
1676      * @draft ICU 76
1677      */
1678     USetStringIterator operator++(int) {
1679         USetStringIterator result(*this);
1680         ++index;
1681         return result;
1682     }
1683
1684 private:
1685     friend class USetStrings;
1686
1687     USetStringIterator(const USet *uset, int32_t index, int32_t count)
1688             : uset(uset), index(index), count(count) {}
1689
1690     const USet *uset;
1691     int32_t index;
1692     int32_t count;
1693 };
1694
1695 /**
1696  * C++ "range" for iterating over the empty and multi-character strings of a USet.
1697  *
1698  * \code
1699  * using U_HEADER_NESTED_NAMESPACE::USetStrings;
1700  * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, &errorCode));
1701  * for (auto s : USetStrings(uset.getAlias())) {
1702  *     UnicodeString us(s);
1703  *     std::string u8;
1704  *     printf("uset.string length %ld \"%s\"\n", (long)s.length(), us.toUTF8String(u8).c_str());
1705  * }
1706  * \endcode
1707  *
1708  * C++ UnicodeSet has member functions for iteration, including strings().
1709  *
1710  * @draft ICU 76
1711  * @see USetCodePoints
1712  * @see USetRanges
1713  * @see USetElements
1714  */
1715 class USetStrings {
1716 public:
1717     /**
1718      * Constructs a C++ "range" object over the strings of the USet.
1719      * @draft ICU 76
1720      */
1721     USetStrings(const USet *uset) : uset(uset), count(uset_getStringCount(uset)) {}
1722
1723     /** @draft ICU 76 */
1724     USetStrings(const USetStrings &other) = default;
1725
1726     /** @draft ICU 76 */
1727     USetStringIterator begin() const {
1728         return USetStringIterator(uset, 0, count);
1729     }
1730
1731     /** @draft ICU 76 */
1732     USetStringIterator end() const {
1733         return USetStringIterator(uset, count, count);
1734     }
1735
1736 private:
1737     const USet *uset;
1738     int32_t count;
1739 };
1740
1741 /**
1742  * Iterator returned by USetElements.
1743  * @draft ICU 76
1744  */
1745 class USetElementIterator {
1746 public:
1747     /** @draft ICU 76 */
1748     USetElementIterator(const USetElementIterator &other) = default;
1749
1750     /** @draft ICU 76 */
1751     bool operator==(const USetElementIterator &other) const {
1752         // No need to compare rangeCount & end given private constructor
1753         // and assuming we don't compare iterators across the set being modified.
1754         // We might even skip comparing uset.
1755         // Unless we want operator==() to be "correct" for more than iteration.
1756         return uset == other.uset && c == other.c && index == other.index;
1757     }
1758
1759     /** @draft ICU 76 */
1760     bool operator!=(const USetElementIterator &other) const { return !operator==(other); }
1761
1762     /** @draft ICU 76 */
1763     UnicodeString operator*() const {
1764         if (c >= 0) {
1765             return UnicodeString(c);
1766         } else if (index < totalCount) {
1767             int32_t length;
1768             const UChar *uchars = uset_getString(uset, index - rangeCount, &length);
1769             // assert uchars != nullptr;
1770             return UnicodeString(uchars, length);
1771         } else {
1772             return UnicodeString();
1773         }
1774     }
1775
1776     /**
1777      * Pre-increment.
1778      * @draft ICU 76
1779      */
1780     USetElementIterator &operator++() {
1781         if (c < end) {
1782             ++c;
1783         } else if (index < rangeCount) {
1784             UErrorCode errorCode = U_ZERO_ERROR;
1785             int32_t result = uset_getItem(uset, index, &c, &end, nullptr, 0, &errorCode);
1786             if (U_SUCCESS(errorCode) && result == 0) {
1787                 ++index;
1788             } else {
1789                 c = end = U_SENTINEL;
1790             }
1791         } else if (c >= 0) {
1792             // assert index == rangeCount;
1793             // Switch from the last range to the first string.
1794             c = end = U_SENTINEL;
1795         } else {
1796             ++index;
1797         }
1798         return *this;
1799     }
1800
1801     /**
1802      * Post-increment.
1803      * @draft ICU 76
1804      */
1805     USetElementIterator operator++(int) {
1806         USetElementIterator result(*this);
1807         operator++();
1808         return result;
1809     }
1810
1811 private:
1812     friend class USetElements;
1813
1814     USetElementIterator(const USet *uset, int32_t index, int32_t rangeCount, int32_t totalCount)
1815             : uset(uset), index(index), rangeCount(rangeCount), totalCount(totalCount),
1816                 c(U_SENTINEL), end(U_SENTINEL) {
1817         if (index < rangeCount) {
1818             // Fetch the first range.
1819             operator++();
1820         }
1821         // Otherwise don't move beyond the (index - rangeCount)-th string.
1822     }
1823
1824     const USet *uset;
1825     int32_t index;
1826     /** Number of UnicodeSet/USet code point ranges. */
1827     int32_t rangeCount;
1828     /**
1829      * Number of code point ranges plus number of strings.
1830      * index starts from 0, counts ranges while less than rangeCount,
1831      * then counts strings while at least rangeCount and less than totalCount.
1832      *
1833      * Note that totalCount is the same as uset_getItemCount(), but usually
1834      * smaller than the number of elements returned by this iterator
1835      * because we return each code point of each range.
1836      */
1837     int32_t totalCount;
1838     UChar32 c, end;
1839 };
1840
1841 /**
1842  * A C++ "range" for iterating over all of the elements of a USet.
1843  * Convenient all-in-one iteration, but creates a UnicodeString for each
1844  * code point or string.
1845  *
1846  * Code points are returned first, then empty and multi-character strings.
1847  *
1848  * \code
1849  * using U_HEADER_NESTED_NAMESPACE::USetElements;
1850  * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, &errorCode));
1851  * for (auto el : USetElements(uset.getAlias())) {
1852  *     std::string u8;
1853  *     printf("uset.string length %ld \"%s\"\n", (long)el.length(), el.toUTF8String(u8).c_str());
1854  * }
1855  * \endcode
1856  *
1857  * C++ UnicodeSet has member functions for iteration, including begin() and end().
1858  *
1859  * begin() and end() return all-elements iterators (USetElementIterator).
1860  * @draft ICU 76
1861  * @see USetCodePoints
1862  * @see USetRanges
1863  * @see USetStrings
1864  */
1865 class USetElements {
1866 public:
1867     /**
1868      * Constructs a C++ "range" object over all of the elements of the USet.
1869      * @draft ICU 76
1870      */
1871     USetElements(const USet *uset)
1872         : uset(uset), rangeCount(uset_getRangeCount(uset)),
1873             stringCount(uset_getStringCount(uset)) {}
1874
1875     /** @draft ICU 76 */
1876     USetElements(const USetElements &other) = default;
1877
1878     /** @draft ICU 76 */
1879     USetElementIterator begin() const {
1880         return USetElementIterator(uset, 0, rangeCount, rangeCount + stringCount);
1881     }
1882
1883     /** @draft ICU 76 */
1884     USetElementIterator end() const {
1885         return USetElementIterator(uset, rangeCount + stringCount, rangeCount, rangeCount + stringCount);
1886     }
1887
1888 private:
1889     const USet *uset;
1890     int32_t rangeCount, stringCount;
1891 };
1892
1893 }  // namespace U_HEADER_ONLY_NAMESPACE
1894
1895 #endif  // U_HIDE_DRAFT_API
1896 #endif  // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
1897
1898 #endif  // __USET_H__