|
||||
File indexing completed on 2025-01-18 10:13:06
0001 // © 2016 and later: Unicode, Inc. and others. 0002 // License & terms of use: http://www.unicode.org/copyright.html 0003 /* 0004 ******************************************************************************* 0005 * 0006 * Copyright (C) 2009-2013, International Business Machines 0007 * Corporation and others. All Rights Reserved. 0008 * 0009 ******************************************************************************* 0010 * file name: normalizer2.h 0011 * encoding: UTF-8 0012 * tab size: 8 (not used) 0013 * indentation:4 0014 * 0015 * created on: 2009nov22 0016 * created by: Markus W. Scherer 0017 */ 0018 0019 #ifndef __NORMALIZER2_H__ 0020 #define __NORMALIZER2_H__ 0021 0022 /** 0023 * \file 0024 * \brief C++ API: New API for Unicode Normalization. 0025 */ 0026 0027 #include "unicode/utypes.h" 0028 0029 #if U_SHOW_CPLUSPLUS_API 0030 0031 #if !UCONFIG_NO_NORMALIZATION 0032 0033 #include "unicode/stringpiece.h" 0034 #include "unicode/uniset.h" 0035 #include "unicode/unistr.h" 0036 #include "unicode/unorm2.h" 0037 0038 U_NAMESPACE_BEGIN 0039 0040 class ByteSink; 0041 0042 /** 0043 * Unicode normalization functionality for standard Unicode normalization or 0044 * for using custom mapping tables. 0045 * All instances of this class are unmodifiable/immutable. 0046 * Instances returned by getInstance() are singletons that must not be deleted by the caller. 0047 * The Normalizer2 class is not intended for public subclassing. 0048 * 0049 * The primary functions are to produce a normalized string and to detect whether 0050 * a string is already normalized. 0051 * The most commonly used normalization forms are those defined in 0052 * http://www.unicode.org/unicode/reports/tr15/ 0053 * However, this API supports additional normalization forms for specialized purposes. 0054 * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE) 0055 * and can be used in implementations of UTS #46. 0056 * 0057 * Not only are the standard compose and decompose modes supplied, 0058 * but additional modes are provided as documented in the Mode enum. 0059 * 0060 * Some of the functions in this class identify normalization boundaries. 0061 * At a normalization boundary, the portions of the string 0062 * before it and starting from it do not interact and can be handled independently. 0063 * 0064 * The spanQuickCheckYes() stops at a normalization boundary. 0065 * When the goal is a normalized string, then the text before the boundary 0066 * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). 0067 * 0068 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether 0069 * a character is guaranteed to be at a normalization boundary, 0070 * regardless of context. 0071 * This is used for moving from one normalization boundary to the next 0072 * or preceding boundary, and for performing iterative normalization. 0073 * 0074 * Iterative normalization is useful when only a small portion of a 0075 * longer string needs to be processed. 0076 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator 0077 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() 0078 * (to process only the substring for which sort key bytes are computed). 0079 * 0080 * The set of normalization boundaries returned by these functions may not be 0081 * complete: There may be more boundaries that could be returned. 0082 * Different functions may return different boundaries. 0083 * @stable ICU 4.4 0084 */ 0085 class U_COMMON_API Normalizer2 : public UObject { 0086 public: 0087 /** 0088 * Destructor. 0089 * @stable ICU 4.4 0090 */ 0091 ~Normalizer2(); 0092 0093 /** 0094 * Returns a Normalizer2 instance for Unicode NFC normalization. 0095 * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode). 0096 * Returns an unmodifiable singleton instance. Do not delete it. 0097 * @param errorCode Standard ICU error code. Its input value must 0098 * pass the U_SUCCESS() test, or else the function returns 0099 * immediately. Check for U_FAILURE() on output or use with 0100 * function chaining. (See User Guide for details.) 0101 * @return the requested Normalizer2, if successful 0102 * @stable ICU 49 0103 */ 0104 static const Normalizer2 * 0105 getNFCInstance(UErrorCode &errorCode); 0106 0107 /** 0108 * Returns a Normalizer2 instance for Unicode NFD normalization. 0109 * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode). 0110 * Returns an unmodifiable singleton instance. Do not delete it. 0111 * @param errorCode Standard ICU error code. Its input value must 0112 * pass the U_SUCCESS() test, or else the function returns 0113 * immediately. Check for U_FAILURE() on output or use with 0114 * function chaining. (See User Guide for details.) 0115 * @return the requested Normalizer2, if successful 0116 * @stable ICU 49 0117 */ 0118 static const Normalizer2 * 0119 getNFDInstance(UErrorCode &errorCode); 0120 0121 /** 0122 * Returns a Normalizer2 instance for Unicode NFKC normalization. 0123 * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode). 0124 * Returns an unmodifiable singleton instance. Do not delete it. 0125 * @param errorCode Standard ICU error code. Its input value must 0126 * pass the U_SUCCESS() test, or else the function returns 0127 * immediately. Check for U_FAILURE() on output or use with 0128 * function chaining. (See User Guide for details.) 0129 * @return the requested Normalizer2, if successful 0130 * @stable ICU 49 0131 */ 0132 static const Normalizer2 * 0133 getNFKCInstance(UErrorCode &errorCode); 0134 0135 /** 0136 * Returns a Normalizer2 instance for Unicode NFKD normalization. 0137 * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode). 0138 * Returns an unmodifiable singleton instance. Do not delete it. 0139 * @param errorCode Standard ICU error code. Its input value must 0140 * pass the U_SUCCESS() test, or else the function returns 0141 * immediately. Check for U_FAILURE() on output or use with 0142 * function chaining. (See User Guide for details.) 0143 * @return the requested Normalizer2, if successful 0144 * @stable ICU 49 0145 */ 0146 static const Normalizer2 * 0147 getNFKDInstance(UErrorCode &errorCode); 0148 0149 /** 0150 * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization 0151 * which is equivalent to applying the NFKC_Casefold mappings and then NFC. 0152 * See https://www.unicode.org/reports/tr44/#NFKC_Casefold 0153 * 0154 * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode). 0155 * Returns an unmodifiable singleton instance. Do not delete it. 0156 * @param errorCode Standard ICU error code. Its input value must 0157 * pass the U_SUCCESS() test, or else the function returns 0158 * immediately. Check for U_FAILURE() on output or use with 0159 * function chaining. (See User Guide for details.) 0160 * @return the requested Normalizer2, if successful 0161 * @stable ICU 49 0162 */ 0163 static const Normalizer2 * 0164 getNFKCCasefoldInstance(UErrorCode &errorCode); 0165 0166 #ifndef U_HIDE_DRAFT_API 0167 /** 0168 * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization 0169 * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC. 0170 * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold 0171 * 0172 * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode). 0173 * Returns an unmodifiable singleton instance. Do not delete it. 0174 * @param errorCode Standard ICU error code. Its input value must 0175 * pass the U_SUCCESS() test, or else the function returns 0176 * immediately. Check for U_FAILURE() on output or use with 0177 * function chaining. (See User Guide for details.) 0178 * @return the requested Normalizer2, if successful 0179 * @draft ICU 74 0180 */ 0181 static const Normalizer2 * 0182 getNFKCSimpleCasefoldInstance(UErrorCode &errorCode); 0183 #endif // U_HIDE_DRAFT_API 0184 0185 /** 0186 * Returns a Normalizer2 instance which uses the specified data file 0187 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) 0188 * and which composes or decomposes text according to the specified mode. 0189 * Returns an unmodifiable singleton instance. Do not delete it. 0190 * 0191 * Use packageName=nullptr for data files that are part of ICU's own data. 0192 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. 0193 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. 0194 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. 0195 * 0196 * @param packageName nullptr for ICU built-in data, otherwise application data package name 0197 * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file 0198 * @param mode normalization mode (compose or decompose etc.) 0199 * @param errorCode Standard ICU error code. Its input value must 0200 * pass the U_SUCCESS() test, or else the function returns 0201 * immediately. Check for U_FAILURE() on output or use with 0202 * function chaining. (See User Guide for details.) 0203 * @return the requested Normalizer2, if successful 0204 * @stable ICU 4.4 0205 */ 0206 static const Normalizer2 * 0207 getInstance(const char *packageName, 0208 const char *name, 0209 UNormalization2Mode mode, 0210 UErrorCode &errorCode); 0211 0212 /** 0213 * Returns the normalized form of the source string. 0214 * @param src source string 0215 * @param errorCode Standard ICU error code. Its input value must 0216 * pass the U_SUCCESS() test, or else the function returns 0217 * immediately. Check for U_FAILURE() on output or use with 0218 * function chaining. (See User Guide for details.) 0219 * @return normalized src 0220 * @stable ICU 4.4 0221 */ 0222 UnicodeString 0223 normalize(const UnicodeString &src, UErrorCode &errorCode) const { 0224 UnicodeString result; 0225 normalize(src, result, errorCode); 0226 return result; 0227 } 0228 /** 0229 * Writes the normalized form of the source string to the destination string 0230 * (replacing its contents) and returns the destination string. 0231 * The source and destination strings must be different objects. 0232 * @param src source string 0233 * @param dest destination string; its contents is replaced with normalized src 0234 * @param errorCode Standard ICU error code. Its input value must 0235 * pass the U_SUCCESS() test, or else the function returns 0236 * immediately. Check for U_FAILURE() on output or use with 0237 * function chaining. (See User Guide for details.) 0238 * @return dest 0239 * @stable ICU 4.4 0240 */ 0241 virtual UnicodeString & 0242 normalize(const UnicodeString &src, 0243 UnicodeString &dest, 0244 UErrorCode &errorCode) const = 0; 0245 0246 /** 0247 * Normalizes a UTF-8 string and optionally records how source substrings 0248 * relate to changed and unchanged result substrings. 0249 * 0250 * Implemented completely for all built-in modes except for FCD. 0251 * The base class implementation converts to & from UTF-16 and does not support edits. 0252 * 0253 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 0254 * @param src Source UTF-8 string. 0255 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 0256 * sink.Flush() is called at the end. 0257 * @param edits Records edits for index mapping, working with styled text, 0258 * and getting only changes (if any). 0259 * The Edits contents is undefined if any error occurs. 0260 * This function calls edits->reset() first unless 0261 * options includes U_EDITS_NO_RESET. edits can be nullptr. 0262 * @param errorCode Standard ICU error code. Its input value must 0263 * pass the U_SUCCESS() test, or else the function returns 0264 * immediately. Check for U_FAILURE() on output or use with 0265 * function chaining. (See User Guide for details.) 0266 * @stable ICU 60 0267 */ 0268 virtual void 0269 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 0270 Edits *edits, UErrorCode &errorCode) const; 0271 0272 /** 0273 * Appends the normalized form of the second string to the first string 0274 * (merging them at the boundary) and returns the first string. 0275 * The result is normalized if the first string was normalized. 0276 * The first and second strings must be different objects. 0277 * @param first string, should be normalized 0278 * @param second string, will be normalized 0279 * @param errorCode Standard ICU error code. Its input value must 0280 * pass the U_SUCCESS() test, or else the function returns 0281 * immediately. Check for U_FAILURE() on output or use with 0282 * function chaining. (See User Guide for details.) 0283 * @return first 0284 * @stable ICU 4.4 0285 */ 0286 virtual UnicodeString & 0287 normalizeSecondAndAppend(UnicodeString &first, 0288 const UnicodeString &second, 0289 UErrorCode &errorCode) const = 0; 0290 /** 0291 * Appends the second string to the first string 0292 * (merging them at the boundary) and returns the first string. 0293 * The result is normalized if both the strings were normalized. 0294 * The first and second strings must be different objects. 0295 * @param first string, should be normalized 0296 * @param second string, should be normalized 0297 * @param errorCode Standard ICU error code. Its input value must 0298 * pass the U_SUCCESS() test, or else the function returns 0299 * immediately. Check for U_FAILURE() on output or use with 0300 * function chaining. (See User Guide for details.) 0301 * @return first 0302 * @stable ICU 4.4 0303 */ 0304 virtual UnicodeString & 0305 append(UnicodeString &first, 0306 const UnicodeString &second, 0307 UErrorCode &errorCode) const = 0; 0308 0309 /** 0310 * Gets the decomposition mapping of c. 0311 * Roughly equivalent to normalizing the String form of c 0312 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function 0313 * returns false and does not write a string 0314 * if c does not have a decomposition mapping in this instance's data. 0315 * This function is independent of the mode of the Normalizer2. 0316 * @param c code point 0317 * @param decomposition String object which will be set to c's 0318 * decomposition mapping, if there is one. 0319 * @return true if c has a decomposition, otherwise false 0320 * @stable ICU 4.6 0321 */ 0322 virtual UBool 0323 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; 0324 0325 /** 0326 * Gets the raw decomposition mapping of c. 0327 * 0328 * This is similar to the getDecomposition() method but returns the 0329 * raw decomposition mapping as specified in UnicodeData.txt or 0330 * (for custom data) in the mapping files processed by the gennorm2 tool. 0331 * By contrast, getDecomposition() returns the processed, 0332 * recursively-decomposed version of this mapping. 0333 * 0334 * When used on a standard NFKC Normalizer2 instance, 0335 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. 0336 * 0337 * When used on a standard NFC Normalizer2 instance, 0338 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); 0339 * in this case, the result contains either one or two code points (=1..4 char16_ts). 0340 * 0341 * This function is independent of the mode of the Normalizer2. 0342 * The default implementation returns false. 0343 * @param c code point 0344 * @param decomposition String object which will be set to c's 0345 * raw decomposition mapping, if there is one. 0346 * @return true if c has a decomposition, otherwise false 0347 * @stable ICU 49 0348 */ 0349 virtual UBool 0350 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; 0351 0352 /** 0353 * Performs pairwise composition of a & b and returns the composite if there is one. 0354 * 0355 * Returns a composite code point c only if c has a two-way mapping to a+b. 0356 * In standard Unicode normalization, this means that 0357 * c has a canonical decomposition to a+b 0358 * and c does not have the Full_Composition_Exclusion property. 0359 * 0360 * This function is independent of the mode of the Normalizer2. 0361 * The default implementation returns a negative value. 0362 * @param a A (normalization starter) code point. 0363 * @param b Another code point. 0364 * @return The non-negative composite code point if there is one; otherwise a negative value. 0365 * @stable ICU 49 0366 */ 0367 virtual UChar32 0368 composePair(UChar32 a, UChar32 b) const; 0369 0370 /** 0371 * Gets the combining class of c. 0372 * The default implementation returns 0 0373 * but all standard implementations return the Unicode Canonical_Combining_Class value. 0374 * @param c code point 0375 * @return c's combining class 0376 * @stable ICU 49 0377 */ 0378 virtual uint8_t 0379 getCombiningClass(UChar32 c) const; 0380 0381 /** 0382 * Tests if the string is normalized. 0383 * Internally, in cases where the quickCheck() method would return "maybe" 0384 * (which is only possible for the two COMPOSE modes) this method 0385 * resolves to "yes" or "no" to provide a definitive result, 0386 * at the cost of doing more work in those cases. 0387 * @param s input string 0388 * @param errorCode Standard ICU error code. Its input value must 0389 * pass the U_SUCCESS() test, or else the function returns 0390 * immediately. Check for U_FAILURE() on output or use with 0391 * function chaining. (See User Guide for details.) 0392 * @return true if s is normalized 0393 * @stable ICU 4.4 0394 */ 0395 virtual UBool 0396 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; 0397 /** 0398 * Tests if the UTF-8 string is normalized. 0399 * Internally, in cases where the quickCheck() method would return "maybe" 0400 * (which is only possible for the two COMPOSE modes) this method 0401 * resolves to "yes" or "no" to provide a definitive result, 0402 * at the cost of doing more work in those cases. 0403 * 0404 * This works for all normalization modes. 0405 * It is optimized for UTF-8 for all built-in modes except for FCD. 0406 * The base class implementation converts to UTF-16 and calls isNormalized(). 0407 * 0408 * @param s UTF-8 input string 0409 * @param errorCode Standard ICU error code. Its input value must 0410 * pass the U_SUCCESS() test, or else the function returns 0411 * immediately. Check for U_FAILURE() on output or use with 0412 * function chaining. (See User Guide for details.) 0413 * @return true if s is normalized 0414 * @stable ICU 60 0415 */ 0416 virtual UBool 0417 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const; 0418 0419 0420 /** 0421 * Tests if the string is normalized. 0422 * For the two COMPOSE modes, the result could be "maybe" in cases that 0423 * would take a little more work to resolve definitively. 0424 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster 0425 * combination of quick check + normalization, to avoid 0426 * re-checking the "yes" prefix. 0427 * @param s input string 0428 * @param errorCode Standard ICU error code. Its input value must 0429 * pass the U_SUCCESS() test, or else the function returns 0430 * immediately. Check for U_FAILURE() on output or use with 0431 * function chaining. (See User Guide for details.) 0432 * @return UNormalizationCheckResult 0433 * @stable ICU 4.4 0434 */ 0435 virtual UNormalizationCheckResult 0436 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; 0437 0438 /** 0439 * Returns the end of the normalized substring of the input string. 0440 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> 0441 * the substring <code>UnicodeString(s, 0, end)</code> 0442 * will pass the quick check with a "yes" result. 0443 * 0444 * The returned end index is usually one or more characters before the 0445 * "no" or "maybe" character: The end index is at a normalization boundary. 0446 * (See the class documentation for more about normalization boundaries.) 0447 * 0448 * When the goal is a normalized string and most input strings are expected 0449 * to be normalized already, then call this method, 0450 * and if it returns a prefix shorter than the input string, 0451 * copy that prefix and use normalizeSecondAndAppend() for the remainder. 0452 * @param s input string 0453 * @param errorCode Standard ICU error code. Its input value must 0454 * pass the U_SUCCESS() test, or else the function returns 0455 * immediately. Check for U_FAILURE() on output or use with 0456 * function chaining. (See User Guide for details.) 0457 * @return "yes" span end index 0458 * @stable ICU 4.4 0459 */ 0460 virtual int32_t 0461 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; 0462 0463 /** 0464 * Tests if the character always has a normalization boundary before it, 0465 * regardless of context. 0466 * If true, then the character does not normalization-interact with 0467 * preceding characters. 0468 * In other words, a string containing this character can be normalized 0469 * by processing portions before this character and starting from this 0470 * character independently. 0471 * This is used for iterative normalization. See the class documentation for details. 0472 * @param c character to test 0473 * @return true if c has a normalization boundary before it 0474 * @stable ICU 4.4 0475 */ 0476 virtual UBool hasBoundaryBefore(UChar32 c) const = 0; 0477 0478 /** 0479 * Tests if the character always has a normalization boundary after it, 0480 * regardless of context. 0481 * If true, then the character does not normalization-interact with 0482 * following characters. 0483 * In other words, a string containing this character can be normalized 0484 * by processing portions up to this character and after this 0485 * character independently. 0486 * This is used for iterative normalization. See the class documentation for details. 0487 * Note that this operation may be significantly slower than hasBoundaryBefore(). 0488 * @param c character to test 0489 * @return true if c has a normalization boundary after it 0490 * @stable ICU 4.4 0491 */ 0492 virtual UBool hasBoundaryAfter(UChar32 c) const = 0; 0493 0494 /** 0495 * Tests if the character is normalization-inert. 0496 * If true, then the character does not change, nor normalization-interact with 0497 * preceding or following characters. 0498 * In other words, a string containing this character can be normalized 0499 * by processing portions before this character and after this 0500 * character independently. 0501 * This is used for iterative normalization. See the class documentation for details. 0502 * Note that this operation may be significantly slower than hasBoundaryBefore(). 0503 * @param c character to test 0504 * @return true if c is normalization-inert 0505 * @stable ICU 4.4 0506 */ 0507 virtual UBool isInert(UChar32 c) const = 0; 0508 }; 0509 0510 /** 0511 * Normalization filtered by a UnicodeSet. 0512 * Normalizes portions of the text contained in the filter set and leaves 0513 * portions not contained in the filter set unchanged. 0514 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). 0515 * Not-in-the-filter text is treated as "is normalized" and "quick check yes". 0516 * This class implements all of (and only) the Normalizer2 API. 0517 * An instance of this class is unmodifiable/immutable but is constructed and 0518 * must be destructed by the owner. 0519 * @stable ICU 4.4 0520 */ 0521 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { 0522 public: 0523 /** 0524 * Constructs a filtered normalizer wrapping any Normalizer2 instance 0525 * and a filter set. 0526 * Both are aliased and must not be modified or deleted while this object 0527 * is used. 0528 * The filter set should be frozen; otherwise the performance will suffer greatly. 0529 * @param n2 wrapped Normalizer2 instance 0530 * @param filterSet UnicodeSet which determines the characters to be normalized 0531 * @stable ICU 4.4 0532 */ 0533 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : 0534 norm2(n2), set(filterSet) {} 0535 0536 /** 0537 * Destructor. 0538 * @stable ICU 4.4 0539 */ 0540 ~FilteredNormalizer2(); 0541 0542 /** 0543 * Writes the normalized form of the source string to the destination string 0544 * (replacing its contents) and returns the destination string. 0545 * The source and destination strings must be different objects. 0546 * @param src source string 0547 * @param dest destination string; its contents is replaced with normalized src 0548 * @param errorCode Standard ICU error code. Its input value must 0549 * pass the U_SUCCESS() test, or else the function returns 0550 * immediately. Check for U_FAILURE() on output or use with 0551 * function chaining. (See User Guide for details.) 0552 * @return dest 0553 * @stable ICU 4.4 0554 */ 0555 virtual UnicodeString & 0556 normalize(const UnicodeString &src, 0557 UnicodeString &dest, 0558 UErrorCode &errorCode) const override; 0559 0560 /** 0561 * Normalizes a UTF-8 string and optionally records how source substrings 0562 * relate to changed and unchanged result substrings. 0563 * 0564 * Implemented completely for most built-in modes except for FCD. 0565 * The base class implementation converts to & from UTF-16 and does not support edits. 0566 * 0567 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. 0568 * @param src Source UTF-8 string. 0569 * @param sink A ByteSink to which the normalized UTF-8 result string is written. 0570 * sink.Flush() is called at the end. 0571 * @param edits Records edits for index mapping, working with styled text, 0572 * and getting only changes (if any). 0573 * The Edits contents is undefined if any error occurs. 0574 * This function calls edits->reset() first unless 0575 * options includes U_EDITS_NO_RESET. edits can be nullptr. 0576 * @param errorCode Standard ICU error code. Its input value must 0577 * pass the U_SUCCESS() test, or else the function returns 0578 * immediately. Check for U_FAILURE() on output or use with 0579 * function chaining. (See User Guide for details.) 0580 * @stable ICU 60 0581 */ 0582 virtual void 0583 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, 0584 Edits *edits, UErrorCode &errorCode) const override; 0585 0586 /** 0587 * Appends the normalized form of the second string to the first string 0588 * (merging them at the boundary) and returns the first string. 0589 * The result is normalized if the first string was normalized. 0590 * The first and second strings must be different objects. 0591 * @param first string, should be normalized 0592 * @param second string, will be normalized 0593 * @param errorCode Standard ICU error code. Its input value must 0594 * pass the U_SUCCESS() test, or else the function returns 0595 * immediately. Check for U_FAILURE() on output or use with 0596 * function chaining. (See User Guide for details.) 0597 * @return first 0598 * @stable ICU 4.4 0599 */ 0600 virtual UnicodeString & 0601 normalizeSecondAndAppend(UnicodeString &first, 0602 const UnicodeString &second, 0603 UErrorCode &errorCode) const override; 0604 /** 0605 * Appends the second string to the first string 0606 * (merging them at the boundary) and returns the first string. 0607 * The result is normalized if both the strings were normalized. 0608 * The first and second strings must be different objects. 0609 * @param first string, should be normalized 0610 * @param second string, should be normalized 0611 * @param errorCode Standard ICU error code. Its input value must 0612 * pass the U_SUCCESS() test, or else the function returns 0613 * immediately. Check for U_FAILURE() on output or use with 0614 * function chaining. (See User Guide for details.) 0615 * @return first 0616 * @stable ICU 4.4 0617 */ 0618 virtual UnicodeString & 0619 append(UnicodeString &first, 0620 const UnicodeString &second, 0621 UErrorCode &errorCode) const override; 0622 0623 /** 0624 * Gets the decomposition mapping of c. 0625 * For details see the base class documentation. 0626 * 0627 * This function is independent of the mode of the Normalizer2. 0628 * @param c code point 0629 * @param decomposition String object which will be set to c's 0630 * decomposition mapping, if there is one. 0631 * @return true if c has a decomposition, otherwise false 0632 * @stable ICU 4.6 0633 */ 0634 virtual UBool 0635 getDecomposition(UChar32 c, UnicodeString &decomposition) const override; 0636 0637 /** 0638 * Gets the raw decomposition mapping of c. 0639 * For details see the base class documentation. 0640 * 0641 * This function is independent of the mode of the Normalizer2. 0642 * @param c code point 0643 * @param decomposition String object which will be set to c's 0644 * raw decomposition mapping, if there is one. 0645 * @return true if c has a decomposition, otherwise false 0646 * @stable ICU 49 0647 */ 0648 virtual UBool 0649 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override; 0650 0651 /** 0652 * Performs pairwise composition of a & b and returns the composite if there is one. 0653 * For details see the base class documentation. 0654 * 0655 * This function is independent of the mode of the Normalizer2. 0656 * @param a A (normalization starter) code point. 0657 * @param b Another code point. 0658 * @return The non-negative composite code point if there is one; otherwise a negative value. 0659 * @stable ICU 49 0660 */ 0661 virtual UChar32 0662 composePair(UChar32 a, UChar32 b) const override; 0663 0664 /** 0665 * Gets the combining class of c. 0666 * The default implementation returns 0 0667 * but all standard implementations return the Unicode Canonical_Combining_Class value. 0668 * @param c code point 0669 * @return c's combining class 0670 * @stable ICU 49 0671 */ 0672 virtual uint8_t 0673 getCombiningClass(UChar32 c) const override; 0674 0675 /** 0676 * Tests if the string is normalized. 0677 * For details see the Normalizer2 base class documentation. 0678 * @param s input string 0679 * @param errorCode Standard ICU error code. Its input value must 0680 * pass the U_SUCCESS() test, or else the function returns 0681 * immediately. Check for U_FAILURE() on output or use with 0682 * function chaining. (See User Guide for details.) 0683 * @return true if s is normalized 0684 * @stable ICU 4.4 0685 */ 0686 virtual UBool 0687 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override; 0688 /** 0689 * Tests if the UTF-8 string is normalized. 0690 * Internally, in cases where the quickCheck() method would return "maybe" 0691 * (which is only possible for the two COMPOSE modes) this method 0692 * resolves to "yes" or "no" to provide a definitive result, 0693 * at the cost of doing more work in those cases. 0694 * 0695 * This works for all normalization modes. 0696 * It is optimized for UTF-8 for all built-in modes except for FCD. 0697 * The base class implementation converts to UTF-16 and calls isNormalized(). 0698 * 0699 * @param s UTF-8 input string 0700 * @param errorCode Standard ICU error code. Its input value must 0701 * pass the U_SUCCESS() test, or else the function returns 0702 * immediately. Check for U_FAILURE() on output or use with 0703 * function chaining. (See User Guide for details.) 0704 * @return true if s is normalized 0705 * @stable ICU 60 0706 */ 0707 virtual UBool 0708 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override; 0709 /** 0710 * Tests if the string is normalized. 0711 * For details see the Normalizer2 base class documentation. 0712 * @param s input string 0713 * @param errorCode Standard ICU error code. Its input value must 0714 * pass the U_SUCCESS() test, or else the function returns 0715 * immediately. Check for U_FAILURE() on output or use with 0716 * function chaining. (See User Guide for details.) 0717 * @return UNormalizationCheckResult 0718 * @stable ICU 4.4 0719 */ 0720 virtual UNormalizationCheckResult 0721 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override; 0722 /** 0723 * Returns the end of the normalized substring of the input string. 0724 * For details see the Normalizer2 base class documentation. 0725 * @param s input string 0726 * @param errorCode Standard ICU error code. Its input value must 0727 * pass the U_SUCCESS() test, or else the function returns 0728 * immediately. Check for U_FAILURE() on output or use with 0729 * function chaining. (See User Guide for details.) 0730 * @return "yes" span end index 0731 * @stable ICU 4.4 0732 */ 0733 virtual int32_t 0734 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override; 0735 0736 /** 0737 * Tests if the character always has a normalization boundary before it, 0738 * regardless of context. 0739 * For details see the Normalizer2 base class documentation. 0740 * @param c character to test 0741 * @return true if c has a normalization boundary before it 0742 * @stable ICU 4.4 0743 */ 0744 virtual UBool hasBoundaryBefore(UChar32 c) const override; 0745 0746 /** 0747 * Tests if the character always has a normalization boundary after it, 0748 * regardless of context. 0749 * For details see the Normalizer2 base class documentation. 0750 * @param c character to test 0751 * @return true if c has a normalization boundary after it 0752 * @stable ICU 4.4 0753 */ 0754 virtual UBool hasBoundaryAfter(UChar32 c) const override; 0755 0756 /** 0757 * Tests if the character is normalization-inert. 0758 * For details see the Normalizer2 base class documentation. 0759 * @param c character to test 0760 * @return true if c is normalization-inert 0761 * @stable ICU 4.4 0762 */ 0763 virtual UBool isInert(UChar32 c) const override; 0764 private: 0765 UnicodeString & 0766 normalize(const UnicodeString &src, 0767 UnicodeString &dest, 0768 USetSpanCondition spanCondition, 0769 UErrorCode &errorCode) const; 0770 0771 void 0772 normalizeUTF8(uint32_t options, const char *src, int32_t length, 0773 ByteSink &sink, Edits *edits, 0774 USetSpanCondition spanCondition, 0775 UErrorCode &errorCode) const; 0776 0777 UnicodeString & 0778 normalizeSecondAndAppend(UnicodeString &first, 0779 const UnicodeString &second, 0780 UBool doNormalize, 0781 UErrorCode &errorCode) const; 0782 0783 const Normalizer2 &norm2; 0784 const UnicodeSet &set; 0785 }; 0786 0787 U_NAMESPACE_END 0788 0789 #endif // !UCONFIG_NO_NORMALIZATION 0790 0791 #endif /* U_SHOW_CPLUSPLUS_API */ 0792 0793 #endif // __NORMALIZER2_H__
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |