|
|
|||
File indexing completed on 2026-01-04 10:00:23
0001 // © 2016 and later: Unicode, Inc. and others. 0002 // License & terms of use: http://www.unicode.org/copyright.html 0003 /* 0004 ******************************************************************************* 0005 * 0006 * Copyright (C) 2009-2013, International Business Machines 0007 * Corporation and others. All Rights Reserved. 0008 * 0009 ******************************************************************************* 0010 * file name: normalizer2.h 0011 * encoding: UTF-8 0012 * tab size: 8 (not used) 0013 * indentation:4 0014 * 0015 * created on: 2009nov22 0016 * created by: Markus W. Scherer 0017 */ 0018 0019 #ifndef __NORMALIZER2_H__ 0020 #define __NORMALIZER2_H__ 0021 0022 /** 0023 * \file 0024 * \brief C++ API: New API for Unicode Normalization. 0025 */ 0026 0027 #include "unicode/utypes.h" 0028 0029 #if U_SHOW_CPLUSPLUS_API 0030 0031 #if !UCONFIG_NO_NORMALIZATION 0032 0033 #include "unicode/stringpiece.h" 0034 #include "unicode/uniset.h" 0035 #include "unicode/unistr.h" 0036 #include "unicode/unorm2.h" 0037 0038 U_NAMESPACE_BEGIN 0039 0040 class ByteSink; 0041 0042 /** 0043 * Unicode normalization functionality for standard Unicode normalization or 0044 * for using custom mapping tables. 0045 * All instances of this class are unmodifiable/immutable. 0046 * Instances returned by getInstance() are singletons that must not be deleted by the caller. 0047 * The Normalizer2 class is not intended for public subclassing. 0048 * 0049 * The primary functions are to produce a normalized string and to detect whether 0050 * a string is already normalized. 0051 * The most commonly used normalization forms are those defined in 0052 * http://www.unicode.org/unicode/reports/tr15/ 0053 * However, this API supports additional normalization forms for specialized purposes. 0054 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) 0055 * and can be used in implementations of UTS #46. 0056 * 0057 * Not only are the standard compose and decompose modes supplied, 0058 * but additional modes are provided as documented in the Mode enum. 0059 * 0060 * Some of the functions in this class identify normalization boundaries. 0061 * At a normalization boundary, the portions of the string 0062 * before it and starting from it do not interact and can be handled independently. 0063 * 0064 * The spanQuickCheckYes() stops at a normalization boundary. 0065 * When the goal is a normalized string, then the text before the boundary 0066 * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). 0067 * 0068 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether 0069 * a character is guaranteed to be at a normalization boundary, 0070 * regardless of context. 0071 * This is used for moving from one normalization boundary to the next 0072 * or preceding boundary, and for performing iterative normalization. 0073 * 0074 * Iterative normalization is useful when only a small portion of a 0075 * longer string needs to be processed. 0076 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator 0077 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() 0078 * (to process only the substring for which sort key bytes are computed). 0079 * 0080 * The set of normalization boundaries returned by these functions may not be 0081 * complete: There may be more boundaries that could be returned. 0082 * Different functions may return different boundaries. 0083 * @stable ICU 4.4 0084 */ 0085 class U_COMMON_API Normalizer2 : public UObject { 0086 public: 0087 /** 0088 * Destructor. 0089 * @stable ICU 4.4 0090 */ 0091 ~Normalizer2(); 0092 0093 /** 0094 * Returns a Normalizer2 instance for Unicode NFC normalization. 0095 * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode). 0096 * Returns an unmodifiable singleton instance. Do not delete it. 0097 * @param errorCode Standard ICU error code. Its input value must 0098 * pass the U_SUCCESS() test, or else the function returns 0099 * immediately. Check for U_FAILURE() on output or use with 0100 * function chaining. (See User Guide for details.) 0101 * @return the requested Normalizer2, if successful 0102 * @stable ICU 49 0103 */ 0104 static const Normalizer2 * 0105 getNFCInstance(UErrorCode &errorCode); 0106 0107 /** 0108 * Returns a Normalizer2 instance for Unicode NFD normalization. 0109 * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode). 0110 * Returns an unmodifiable singleton instance. Do not delete it. 0111 * @param errorCode Standard ICU error code. Its input value must 0112 * pass the U_SUCCESS() test, or else the function returns 0113 * immediately. Check for U_FAILURE() on output or use with 0114 * function chaining. (See User Guide for details.) 0115 * @return the requested Normalizer2, if successful 0116 * @stable ICU 49 0117 */ 0118 static const Normalizer2 * 0119 getNFDInstance(UErrorCode &errorCode); 0120 0121 /** 0122 * Returns a Normalizer2 instance for Unicode NFKC normalization. 0123 * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode). 0124 * Returns an unmodifiable singleton instance. Do not delete it. 0125 * @param errorCode Standard ICU error code. Its input value must 0126 * pass the U_SUCCESS() test, or else the function returns 0127 * immediately. Check for U_FAILURE() on output or use with 0128 * function chaining. (See User Guide for details.) 0129 * @return the requested Normalizer2, if successful 0130 * @stable ICU 49 0131 */ 0132 static const Normalizer2 * 0133 getNFKCInstance(UErrorCode &errorCode); 0134 0135 /** 0136 * Returns a Normalizer2 instance for Unicode NFKD normalization. 0137 * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode). 0138 * Returns an unmodifiable singleton instance. Do not delete it. 0139 * @param errorCode Standard ICU error code. Its input value must 0140 * pass the U_SUCCESS() test, or else the function returns 0141 * immediately. Check for U_FAILURE() on output or use with 0142 * function chaining. (See User Guide for details.) 0143 * @return the requested Normalizer2, if successful 0144 * @stable ICU 49 0145 */ 0146 static const Normalizer2 * 0147 getNFKDInstance(UErrorCode &errorCode); 0148 0149 /** 0150 * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization 0151 * which is equivalent to applying the NFKC_Casefold mappings and then NFC. 0152 * See https://www.unicode.org/reports/tr44/#NFKC_Casefold 0153 * 0154 * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode). 0155 * Returns an unmodifiable singleton instance. Do not delete it. 0156 * @param errorCode Standard ICU error code. Its input value must 0157 * pass the U_SUCCESS() test, or else the function returns 0158 * immediately. Check for U_FAILURE() on output or use with 0159 * function chaining. (See User Guide for details.) 0160 * @return the requested Normalizer2, if successful 0161 * @stable ICU 49 0162 */ 0163 static const Normalizer2 * 0164 getNFKCCasefoldInstance(UErrorCode &errorCode); 0165 0166 /** 0167 * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization 0168 * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC. 0169 * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold 0170 * 0171 * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode). 0172 * Returns an unmodifiable singleton instance. Do not delete it. 0173 * @param errorCode Standard ICU error code. Its input value must 0174 * pass the U_SUCCESS() test, or else the function returns 0175 * immediately. Check for U_FAILURE() on output or use with 0176 * function chaining. (See User Guide for details.) 0177 * @return the requested Normalizer2, if successful 0178 * @stable ICU 74 0179 */ 0180 static const Normalizer2 * 0181 getNFKCSimpleCasefoldInstance(UErrorCode &errorCode); 0182 0183 /** 0184 * Returns a Normalizer2 instance which uses the specified data file 0185 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) 0186 * and which composes or decomposes text according to the specified mode. 0187 * Returns an unmodifiable singleton instance. Do not delete it. 0188 * 0189 * Use packageName=nullptr for data files that are part of ICU's own data. 0190 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. 0191 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. 0192 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. 0193 * 0194 * @param packageName nullptr for ICU built-in data, otherwise application data package name 0195 * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file 0196 * @param mode normalization mode (compose or decompose etc.) 0197 * @param errorCode Standard ICU error code. Its input value must 0198 * pass the U_SUCCESS() test, or else the function returns 0199 * immediately. Check for U_FAILURE() on output or use with 0200 * function chaining. (See User Guide for details.) 0201 * @return the requested Normalizer2, if successful 0202 * @stable ICU 4.4 0203 */ 0204 static const Normalizer2 * 0205 getInstance(const char *packageName, 0206 const char *name, 0207 UNormalization2Mode mode, 0208 UErrorCode &errorCode); 0209 0210 /** 0211 * Returns the normalized form of the source string. 0212 * @param src source string 0213 * @param errorCode Standard ICU error code. Its input value must 0214 * pass the U_SUCCESS() test, or else the function returns 0215 * immediately. Check for U_FAILURE() on output or use with 0216 * function chaining. (See User Guide for details.) 0217 * @return normalized src 0218 * @stable ICU 4.4 0219 */ 0220 UnicodeString 0221 normalize(const UnicodeString &src, UErrorCode &errorCode) const { 0222 UnicodeString result; 0223 normalize(src, result, errorCode); 0224 return result; 0225 } 0226 /** 0227 * Writes the normalized form of the source string to the destination string 0228 * (replacing its contents) and returns the destination string. 0229 * The source and destination strings must be different objects. 0230 * @param src source string 0231 * @param dest destination string; its contents is replaced with normalized src 0232 * @param errorCode Standard ICU error code. Its input value must 0233 * pass the U_SUCCESS() test, or else the function returns 0234 * immediately. Check for U_FAILURE() on output or use with 0235 * function chaining. (See User Guide for details.) 0236 * @return dest 0237 * @stable ICU 4.4 0238 */ 0239 virtual UnicodeString & 0240 normalize(const UnicodeString &src, 0241 UnicodeString &dest, 0242 UErrorCode &errorCode) const = 0; 0243 0244 /** 0245 * Normalizes a UTF-8 string and optionally records how source substrings 0246 * relate to changed and unchanged result substrings. 0247 * 0248 * Implemented completely for all built-in modes except for FCD. 0249 * The base class implementation converts to & from UTF-16 and does not support edits. 0250 * 0251 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 0252 * @param src Source UTF-8 string. 0253 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 0254 * sink.Flush() is called at the end. 0255 * @param edits Records edits for index mapping, working with styled text, 0256 * and getting only changes (if any). 0257 * The Edits contents is undefined if any error occurs. 0258 * This function calls edits->reset() first unless 0259 * options includes U_EDITS_NO_RESET. edits can be nullptr. 0260 * @param errorCode Standard ICU error code. Its input value must 0261 * pass the U_SUCCESS() test, or else the function returns 0262 * immediately. Check for U_FAILURE() on output or use with 0263 * function chaining. (See User Guide for details.) 0264 * @stable ICU 60 0265 */ 0266 virtual void 0267 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 0268 Edits *edits, UErrorCode &errorCode) const; 0269 0270 /** 0271 * Appends the normalized form of the second string to the first string 0272 * (merging them at the boundary) and returns the first string. 0273 * The result is normalized if the first string was normalized. 0274 * The first and second strings must be different objects. 0275 * @param first string, should be normalized 0276 * @param second string, will be normalized 0277 * @param errorCode Standard ICU error code. Its input value must 0278 * pass the U_SUCCESS() test, or else the function returns 0279 * immediately. Check for U_FAILURE() on output or use with 0280 * function chaining. (See User Guide for details.) 0281 * @return first 0282 * @stable ICU 4.4 0283 */ 0284 virtual UnicodeString & 0285 normalizeSecondAndAppend(UnicodeString &first, 0286 const UnicodeString &second, 0287 UErrorCode &errorCode) const = 0; 0288 /** 0289 * Appends the second string to the first string 0290 * (merging them at the boundary) and returns the first string. 0291 * The result is normalized if both the strings were normalized. 0292 * The first and second strings must be different objects. 0293 * @param first string, should be normalized 0294 * @param second string, should be normalized 0295 * @param errorCode Standard ICU error code. Its input value must 0296 * pass the U_SUCCESS() test, or else the function returns 0297 * immediately. Check for U_FAILURE() on output or use with 0298 * function chaining. (See User Guide for details.) 0299 * @return first 0300 * @stable ICU 4.4 0301 */ 0302 virtual UnicodeString & 0303 append(UnicodeString &first, 0304 const UnicodeString &second, 0305 UErrorCode &errorCode) const = 0; 0306 0307 /** 0308 * Gets the decomposition mapping of c. 0309 * Roughly equivalent to normalizing the String form of c 0310 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function 0311 * returns false and does not write a string 0312 * if c does not have a decomposition mapping in this instance's data. 0313 * This function is independent of the mode of the Normalizer2. 0314 * @param c code point 0315 * @param decomposition String object which will be set to c's 0316 * decomposition mapping, if there is one. 0317 * @return true if c has a decomposition, otherwise false 0318 * @stable ICU 4.6 0319 */ 0320 virtual UBool 0321 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; 0322 0323 /** 0324 * Gets the raw decomposition mapping of c. 0325 * 0326 * This is similar to the getDecomposition() method but returns the 0327 * raw decomposition mapping as specified in UnicodeData.txt or 0328 * (for custom data) in the mapping files processed by the gennorm2 tool. 0329 * By contrast, getDecomposition() returns the processed, 0330 * recursively-decomposed version of this mapping. 0331 * 0332 * When used on a standard NFKC Normalizer2 instance, 0333 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. 0334 * 0335 * When used on a standard NFC Normalizer2 instance, 0336 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); 0337 * in this case, the result contains either one or two code points (=1..4 char16_ts). 0338 * 0339 * This function is independent of the mode of the Normalizer2. 0340 * The default implementation returns false. 0341 * @param c code point 0342 * @param decomposition String object which will be set to c's 0343 * raw decomposition mapping, if there is one. 0344 * @return true if c has a decomposition, otherwise false 0345 * @stable ICU 49 0346 */ 0347 virtual UBool 0348 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; 0349 0350 /** 0351 * Performs pairwise composition of a & b and returns the composite if there is one. 0352 * 0353 * Returns a composite code point c only if c has a two-way mapping to a+b. 0354 * In standard Unicode normalization, this means that 0355 * c has a canonical decomposition to a+b 0356 * and c does not have the Full_Composition_Exclusion property. 0357 * 0358 * This function is independent of the mode of the Normalizer2. 0359 * The default implementation returns a negative value. 0360 * @param a A (normalization starter) code point. 0361 * @param b Another code point. 0362 * @return The non-negative composite code point if there is one; otherwise a negative value. 0363 * @stable ICU 49 0364 */ 0365 virtual UChar32 0366 composePair(UChar32 a, UChar32 b) const; 0367 0368 /** 0369 * Gets the combining class of c. 0370 * The default implementation returns 0 0371 * but all standard implementations return the Unicode Canonical_Combining_Class value. 0372 * @param c code point 0373 * @return c's combining class 0374 * @stable ICU 49 0375 */ 0376 virtual uint8_t 0377 getCombiningClass(UChar32 c) const; 0378 0379 /** 0380 * Tests if the string is normalized. 0381 * Internally, in cases where the quickCheck() method would return "maybe" 0382 * (which is only possible for the two COMPOSE modes) this method 0383 * resolves to "yes" or "no" to provide a definitive result, 0384 * at the cost of doing more work in those cases. 0385 * @param s input string 0386 * @param errorCode Standard ICU error code. Its input value must 0387 * pass the U_SUCCESS() test, or else the function returns 0388 * immediately. Check for U_FAILURE() on output or use with 0389 * function chaining. (See User Guide for details.) 0390 * @return true if s is normalized 0391 * @stable ICU 4.4 0392 */ 0393 virtual UBool 0394 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; 0395 /** 0396 * Tests if the UTF-8 string is normalized. 0397 * Internally, in cases where the quickCheck() method would return "maybe" 0398 * (which is only possible for the two COMPOSE modes) this method 0399 * resolves to "yes" or "no" to provide a definitive result, 0400 * at the cost of doing more work in those cases. 0401 * 0402 * This works for all normalization modes. 0403 * It is optimized for UTF-8 for all built-in modes except for FCD. 0404 * The base class implementation converts to UTF-16 and calls isNormalized(). 0405 * 0406 * @param s UTF-8 input string 0407 * @param errorCode Standard ICU error code. Its input value must 0408 * pass the U_SUCCESS() test, or else the function returns 0409 * immediately. Check for U_FAILURE() on output or use with 0410 * function chaining. (See User Guide for details.) 0411 * @return true if s is normalized 0412 * @stable ICU 60 0413 */ 0414 virtual UBool 0415 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const; 0416 0417 0418 /** 0419 * Tests if the string is normalized. 0420 * For the two COMPOSE modes, the result could be "maybe" in cases that 0421 * would take a little more work to resolve definitively. 0422 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster 0423 * combination of quick check + normalization, to avoid 0424 * re-checking the "yes" prefix. 0425 * @param s input string 0426 * @param errorCode Standard ICU error code. Its input value must 0427 * pass the U_SUCCESS() test, or else the function returns 0428 * immediately. Check for U_FAILURE() on output or use with 0429 * function chaining. (See User Guide for details.) 0430 * @return UNormalizationCheckResult 0431 * @stable ICU 4.4 0432 */ 0433 virtual UNormalizationCheckResult 0434 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; 0435 0436 /** 0437 * Returns the end of the normalized substring of the input string. 0438 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> 0439 * the substring <code>UnicodeString(s, 0, end)</code> 0440 * will pass the quick check with a "yes" result. 0441 * 0442 * The returned end index is usually one or more characters before the 0443 * "no" or "maybe" character: The end index is at a normalization boundary. 0444 * (See the class documentation for more about normalization boundaries.) 0445 * 0446 * When the goal is a normalized string and most input strings are expected 0447 * to be normalized already, then call this method, 0448 * and if it returns a prefix shorter than the input string, 0449 * copy that prefix and use normalizeSecondAndAppend() for the remainder. 0450 * @param s input string 0451 * @param errorCode Standard ICU error code. Its input value must 0452 * pass the U_SUCCESS() test, or else the function returns 0453 * immediately. Check for U_FAILURE() on output or use with 0454 * function chaining. (See User Guide for details.) 0455 * @return "yes" span end index 0456 * @stable ICU 4.4 0457 */ 0458 virtual int32_t 0459 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; 0460 0461 /** 0462 * Tests if the character always has a normalization boundary before it, 0463 * regardless of context. 0464 * If true, then the character does not normalization-interact with 0465 * preceding characters. 0466 * In other words, a string containing this character can be normalized 0467 * by processing portions before this character and starting from this 0468 * character independently. 0469 * This is used for iterative normalization. See the class documentation for details. 0470 * @param c character to test 0471 * @return true if c has a normalization boundary before it 0472 * @stable ICU 4.4 0473 */ 0474 virtual UBool hasBoundaryBefore(UChar32 c) const = 0; 0475 0476 /** 0477 * Tests if the character always has a normalization boundary after it, 0478 * regardless of context. 0479 * If true, then the character does not normalization-interact with 0480 * following characters. 0481 * In other words, a string containing this character can be normalized 0482 * by processing portions up to this character and after this 0483 * character independently. 0484 * This is used for iterative normalization. See the class documentation for details. 0485 * Note that this operation may be significantly slower than hasBoundaryBefore(). 0486 * @param c character to test 0487 * @return true if c has a normalization boundary after it 0488 * @stable ICU 4.4 0489 */ 0490 virtual UBool hasBoundaryAfter(UChar32 c) const = 0; 0491 0492 /** 0493 * Tests if the character is normalization-inert. 0494 * If true, then the character does not change, nor normalization-interact with 0495 * preceding or following characters. 0496 * In other words, a string containing this character can be normalized 0497 * by processing portions before this character and after this 0498 * character independently. 0499 * This is used for iterative normalization. See the class documentation for details. 0500 * Note that this operation may be significantly slower than hasBoundaryBefore(). 0501 * @param c character to test 0502 * @return true if c is normalization-inert 0503 * @stable ICU 4.4 0504 */ 0505 virtual UBool isInert(UChar32 c) const = 0; 0506 }; 0507 0508 /** 0509 * Normalization filtered by a UnicodeSet. 0510 * Normalizes portions of the text contained in the filter set and leaves 0511 * portions not contained in the filter set unchanged. 0512 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). 0513 * Not-in-the-filter text is treated as "is normalized" and "quick check yes". 0514 * This class implements all of (and only) the Normalizer2 API. 0515 * An instance of this class is unmodifiable/immutable but is constructed and 0516 * must be destructed by the owner. 0517 * @stable ICU 4.4 0518 */ 0519 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { 0520 public: 0521 /** 0522 * Constructs a filtered normalizer wrapping any Normalizer2 instance 0523 * and a filter set. 0524 * Both are aliased and must not be modified or deleted while this object 0525 * is used. 0526 * The filter set should be frozen; otherwise the performance will suffer greatly. 0527 * @param n2 wrapped Normalizer2 instance 0528 * @param filterSet UnicodeSet which determines the characters to be normalized 0529 * @stable ICU 4.4 0530 */ 0531 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : 0532 norm2(n2), set(filterSet) {} 0533 0534 /** 0535 * Destructor. 0536 * @stable ICU 4.4 0537 */ 0538 ~FilteredNormalizer2(); 0539 0540 /** 0541 * Writes the normalized form of the source string to the destination string 0542 * (replacing its contents) and returns the destination string. 0543 * The source and destination strings must be different objects. 0544 * @param src source string 0545 * @param dest destination string; its contents is replaced with normalized src 0546 * @param errorCode Standard ICU error code. Its input value must 0547 * pass the U_SUCCESS() test, or else the function returns 0548 * immediately. Check for U_FAILURE() on output or use with 0549 * function chaining. (See User Guide for details.) 0550 * @return dest 0551 * @stable ICU 4.4 0552 */ 0553 virtual UnicodeString & 0554 normalize(const UnicodeString &src, 0555 UnicodeString &dest, 0556 UErrorCode &errorCode) const override; 0557 0558 /** 0559 * Normalizes a UTF-8 string and optionally records how source substrings 0560 * relate to changed and unchanged result substrings. 0561 * 0562 * Implemented completely for most built-in modes except for FCD. 0563 * The base class implementation converts to & from UTF-16 and does not support edits. 0564 * 0565 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 0566 * @param src Source UTF-8 string. 0567 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 0568 * sink.Flush() is called at the end. 0569 * @param edits Records edits for index mapping, working with styled text, 0570 * and getting only changes (if any). 0571 * The Edits contents is undefined if any error occurs. 0572 * This function calls edits->reset() first unless 0573 * options includes U_EDITS_NO_RESET. edits can be nullptr. 0574 * @param errorCode Standard ICU error code. Its input value must 0575 * pass the U_SUCCESS() test, or else the function returns 0576 * immediately. Check for U_FAILURE() on output or use with 0577 * function chaining. (See User Guide for details.) 0578 * @stable ICU 60 0579 */ 0580 virtual void 0581 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 0582 Edits *edits, UErrorCode &errorCode) const override; 0583 0584 /** 0585 * Appends the normalized form of the second string to the first string 0586 * (merging them at the boundary) and returns the first string. 0587 * The result is normalized if the first string was normalized. 0588 * The first and second strings must be different objects. 0589 * @param first string, should be normalized 0590 * @param second string, will be normalized 0591 * @param errorCode Standard ICU error code. Its input value must 0592 * pass the U_SUCCESS() test, or else the function returns 0593 * immediately. Check for U_FAILURE() on output or use with 0594 * function chaining. (See User Guide for details.) 0595 * @return first 0596 * @stable ICU 4.4 0597 */ 0598 virtual UnicodeString & 0599 normalizeSecondAndAppend(UnicodeString &first, 0600 const UnicodeString &second, 0601 UErrorCode &errorCode) const override; 0602 /** 0603 * Appends the second string to the first string 0604 * (merging them at the boundary) and returns the first string. 0605 * The result is normalized if both the strings were normalized. 0606 * The first and second strings must be different objects. 0607 * @param first string, should be normalized 0608 * @param second string, should be normalized 0609 * @param errorCode Standard ICU error code. Its input value must 0610 * pass the U_SUCCESS() test, or else the function returns 0611 * immediately. Check for U_FAILURE() on output or use with 0612 * function chaining. (See User Guide for details.) 0613 * @return first 0614 * @stable ICU 4.4 0615 */ 0616 virtual UnicodeString & 0617 append(UnicodeString &first, 0618 const UnicodeString &second, 0619 UErrorCode &errorCode) const override; 0620 0621 /** 0622 * Gets the decomposition mapping of c. 0623 * For details see the base class documentation. 0624 * 0625 * This function is independent of the mode of the Normalizer2. 0626 * @param c code point 0627 * @param decomposition String object which will be set to c's 0628 * decomposition mapping, if there is one. 0629 * @return true if c has a decomposition, otherwise false 0630 * @stable ICU 4.6 0631 */ 0632 virtual UBool 0633 getDecomposition(UChar32 c, UnicodeString &decomposition) const override; 0634 0635 /** 0636 * Gets the raw decomposition mapping of c. 0637 * For details see the base class documentation. 0638 * 0639 * This function is independent of the mode of the Normalizer2. 0640 * @param c code point 0641 * @param decomposition String object which will be set to c's 0642 * raw decomposition mapping, if there is one. 0643 * @return true if c has a decomposition, otherwise false 0644 * @stable ICU 49 0645 */ 0646 virtual UBool 0647 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override; 0648 0649 /** 0650 * Performs pairwise composition of a & b and returns the composite if there is one. 0651 * For details see the base class documentation. 0652 * 0653 * This function is independent of the mode of the Normalizer2. 0654 * @param a A (normalization starter) code point. 0655 * @param b Another code point. 0656 * @return The non-negative composite code point if there is one; otherwise a negative value. 0657 * @stable ICU 49 0658 */ 0659 virtual UChar32 0660 composePair(UChar32 a, UChar32 b) const override; 0661 0662 /** 0663 * Gets the combining class of c. 0664 * The default implementation returns 0 0665 * but all standard implementations return the Unicode Canonical_Combining_Class value. 0666 * @param c code point 0667 * @return c's combining class 0668 * @stable ICU 49 0669 */ 0670 virtual uint8_t 0671 getCombiningClass(UChar32 c) const override; 0672 0673 /** 0674 * Tests if the string is normalized. 0675 * For details see the Normalizer2 base class documentation. 0676 * @param s input string 0677 * @param errorCode Standard ICU error code. Its input value must 0678 * pass the U_SUCCESS() test, or else the function returns 0679 * immediately. Check for U_FAILURE() on output or use with 0680 * function chaining. (See User Guide for details.) 0681 * @return true if s is normalized 0682 * @stable ICU 4.4 0683 */ 0684 virtual UBool 0685 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override; 0686 /** 0687 * Tests if the UTF-8 string is normalized. 0688 * Internally, in cases where the quickCheck() method would return "maybe" 0689 * (which is only possible for the two COMPOSE modes) this method 0690 * resolves to "yes" or "no" to provide a definitive result, 0691 * at the cost of doing more work in those cases. 0692 * 0693 * This works for all normalization modes. 0694 * It is optimized for UTF-8 for all built-in modes except for FCD. 0695 * The base class implementation converts to UTF-16 and calls isNormalized(). 0696 * 0697 * @param s UTF-8 input string 0698 * @param errorCode Standard ICU error code. Its input value must 0699 * pass the U_SUCCESS() test, or else the function returns 0700 * immediately. Check for U_FAILURE() on output or use with 0701 * function chaining. (See User Guide for details.) 0702 * @return true if s is normalized 0703 * @stable ICU 60 0704 */ 0705 virtual UBool 0706 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override; 0707 /** 0708 * Tests if the string is normalized. 0709 * For details see the Normalizer2 base class documentation. 0710 * @param s input string 0711 * @param errorCode Standard ICU error code. Its input value must 0712 * pass the U_SUCCESS() test, or else the function returns 0713 * immediately. Check for U_FAILURE() on output or use with 0714 * function chaining. (See User Guide for details.) 0715 * @return UNormalizationCheckResult 0716 * @stable ICU 4.4 0717 */ 0718 virtual UNormalizationCheckResult 0719 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override; 0720 /** 0721 * Returns the end of the normalized substring of the input string. 0722 * For details see the Normalizer2 base class documentation. 0723 * @param s input string 0724 * @param errorCode Standard ICU error code. Its input value must 0725 * pass the U_SUCCESS() test, or else the function returns 0726 * immediately. Check for U_FAILURE() on output or use with 0727 * function chaining. (See User Guide for details.) 0728 * @return "yes" span end index 0729 * @stable ICU 4.4 0730 */ 0731 virtual int32_t 0732 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override; 0733 0734 /** 0735 * Tests if the character always has a normalization boundary before it, 0736 * regardless of context. 0737 * For details see the Normalizer2 base class documentation. 0738 * @param c character to test 0739 * @return true if c has a normalization boundary before it 0740 * @stable ICU 4.4 0741 */ 0742 virtual UBool hasBoundaryBefore(UChar32 c) const override; 0743 0744 /** 0745 * Tests if the character always has a normalization boundary after it, 0746 * regardless of context. 0747 * For details see the Normalizer2 base class documentation. 0748 * @param c character to test 0749 * @return true if c has a normalization boundary after it 0750 * @stable ICU 4.4 0751 */ 0752 virtual UBool hasBoundaryAfter(UChar32 c) const override; 0753 0754 /** 0755 * Tests if the character is normalization-inert. 0756 * For details see the Normalizer2 base class documentation. 0757 * @param c character to test 0758 * @return true if c is normalization-inert 0759 * @stable ICU 4.4 0760 */ 0761 virtual UBool isInert(UChar32 c) const override; 0762 private: 0763 UnicodeString & 0764 normalize(const UnicodeString &src, 0765 UnicodeString &dest, 0766 USetSpanCondition spanCondition, 0767 UErrorCode &errorCode) const; 0768 0769 void 0770 normalizeUTF8(uint32_t options, const char *src, int32_t length, 0771 ByteSink &sink, Edits *edits, 0772 USetSpanCondition spanCondition, 0773 UErrorCode &errorCode) const; 0774 0775 UnicodeString & 0776 normalizeSecondAndAppend(UnicodeString &first, 0777 const UnicodeString &second, 0778 UBool doNormalize, 0779 UErrorCode &errorCode) const; 0780 0781 const Normalizer2 &norm2; 0782 const UnicodeSet &set; 0783 }; 0784 0785 U_NAMESPACE_END 0786 0787 #endif // !UCONFIG_NO_NORMALIZATION 0788 0789 #endif /* U_SHOW_CPLUSPLUS_API */ 0790 0791 #endif // __NORMALIZER2_H__
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|