Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 10:13:06

0001 // © 2016 and later: Unicode, Inc. and others.
0002 // License & terms of use: http://www.unicode.org/copyright.html
0003 /*
0004 *******************************************************************************
0005 *
0006 *   Copyright (C) 2009-2013, International Business Machines
0007 *   Corporation and others.  All Rights Reserved.
0008 *
0009 *******************************************************************************
0010 *   file name:  normalizer2.h
0011 *   encoding:   UTF-8
0012 *   tab size:   8 (not used)
0013 *   indentation:4
0014 *
0015 *   created on: 2009nov22
0016 *   created by: Markus W. Scherer
0017 */
0018 
0019 #ifndef __NORMALIZER2_H__
0020 #define __NORMALIZER2_H__
0021 
0022 /**
0023  * \file
0024  * \brief C++ API: New API for Unicode Normalization.
0025  */
0026 
0027 #include "unicode/utypes.h"
0028 
0029 #if U_SHOW_CPLUSPLUS_API
0030 
0031 #if !UCONFIG_NO_NORMALIZATION
0032 
0033 #include "unicode/stringpiece.h"
0034 #include "unicode/uniset.h"
0035 #include "unicode/unistr.h"
0036 #include "unicode/unorm2.h"
0037 
0038 U_NAMESPACE_BEGIN
0039 
0040 class ByteSink;
0041 
0042 /**
0043  * Unicode normalization functionality for standard Unicode normalization or
0044  * for using custom mapping tables.
0045  * All instances of this class are unmodifiable/immutable.
0046  * Instances returned by getInstance() are singletons that must not be deleted by the caller.
0047  * The Normalizer2 class is not intended for public subclassing.
0048  *
0049  * The primary functions are to produce a normalized string and to detect whether
0050  * a string is already normalized.
0051  * The most commonly used normalization forms are those defined in
0052  * https://www.unicode.org/reports/tr15/
0053  * However, this API supports additional normalization forms for specialized purposes.
0054  * For example, NFKC_Casefold is provided via getNFKCCasefoldInstance()
0055  * and can be used in implementations of UTS #46.
0056  *
0057  * Not only are the standard compose and decompose modes supplied,
0058  * but additional modes are provided as documented in the Mode enum.
0059  *
0060  * Some of the functions in this class identify normalization boundaries.
0061  * At a normalization boundary, the portions of the string
0062  * before it and starting from it do not interact and can be handled independently.
0063  *
0064  * The spanQuickCheckYes() stops at a normalization boundary.
0065  * When the goal is a normalized string, then the text before the boundary
0066  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
0067  *
0068  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
0069  * a character is guaranteed to be at a normalization boundary,
0070  * regardless of context.
0071  * This is used for moving from one normalization boundary to the next
0072  * or preceding boundary, and for performing iterative normalization.
0073  *
0074  * Iterative normalization is useful when only a small portion of a
0075  * longer string needs to be processed.
0076  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
0077  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
0078  * (to process only the substring for which sort key bytes are computed).
0079  *
0080  * The set of normalization boundaries returned by these functions may not be
0081  * complete: There may be more boundaries that could be returned.
0082  * Different functions may return different boundaries.
0083  * @stable ICU 4.4
0084  */
0085 class U_COMMON_API Normalizer2 : public UObject {
0086 public:
0087     /**
0088      * Destructor.
0089      * @stable ICU 4.4
0090      */
0091     ~Normalizer2();
0092 
0093     /**
0094      * Returns a Normalizer2 instance for Unicode NFC normalization.
0095      * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode).
0096      * Returns an unmodifiable singleton instance. Do not delete it.
0097      * @param errorCode Standard ICU error code. Its input value must
0098      *                  pass the U_SUCCESS() test, or else the function returns
0099      *                  immediately. Check for U_FAILURE() on output or use with
0100      *                  function chaining. (See User Guide for details.)
0101      * @return the requested Normalizer2, if successful
0102      * @stable ICU 49
0103      */
0104     static const Normalizer2 *
0105     getNFCInstance(UErrorCode &errorCode);
0106 
0107     /**
0108      * Returns a Normalizer2 instance for Unicode NFD normalization.
0109      * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode).
0110      * Returns an unmodifiable singleton instance. Do not delete it.
0111      * @param errorCode Standard ICU error code. Its input value must
0112      *                  pass the U_SUCCESS() test, or else the function returns
0113      *                  immediately. Check for U_FAILURE() on output or use with
0114      *                  function chaining. (See User Guide for details.)
0115      * @return the requested Normalizer2, if successful
0116      * @stable ICU 49
0117      */
0118     static const Normalizer2 *
0119     getNFDInstance(UErrorCode &errorCode);
0120 
0121     /**
0122      * Returns a Normalizer2 instance for Unicode NFKC normalization.
0123      * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode).
0124      * Returns an unmodifiable singleton instance. Do not delete it.
0125      * @param errorCode Standard ICU error code. Its input value must
0126      *                  pass the U_SUCCESS() test, or else the function returns
0127      *                  immediately. Check for U_FAILURE() on output or use with
0128      *                  function chaining. (See User Guide for details.)
0129      * @return the requested Normalizer2, if successful
0130      * @stable ICU 49
0131      */
0132     static const Normalizer2 *
0133     getNFKCInstance(UErrorCode &errorCode);
0134 
0135     /**
0136      * Returns a Normalizer2 instance for Unicode NFKD normalization.
0137      * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode).
0138      * Returns an unmodifiable singleton instance. Do not delete it.
0139      * @param errorCode Standard ICU error code. Its input value must
0140      *                  pass the U_SUCCESS() test, or else the function returns
0141      *                  immediately. Check for U_FAILURE() on output or use with
0142      *                  function chaining. (See User Guide for details.)
0143      * @return the requested Normalizer2, if successful
0144      * @stable ICU 49
0145      */
0146     static const Normalizer2 *
0147     getNFKDInstance(UErrorCode &errorCode);
0148 
0149     /**
0150      * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization
0151      * which is equivalent to applying the NFKC_Casefold mappings and then NFC.
0152      * See https://www.unicode.org/reports/tr44/#NFKC_Casefold
0153      *
0154      * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode).
0155      * Returns an unmodifiable singleton instance. Do not delete it.
0156      * @param errorCode Standard ICU error code. Its input value must
0157      *                  pass the U_SUCCESS() test, or else the function returns
0158      *                  immediately. Check for U_FAILURE() on output or use with
0159      *                  function chaining. (See User Guide for details.)
0160      * @return the requested Normalizer2, if successful
0161      * @stable ICU 49
0162      */
0163     static const Normalizer2 *
0164     getNFKCCasefoldInstance(UErrorCode &errorCode);
0165 
0166 #ifndef U_HIDE_DRAFT_API
0167     /**
0168      * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
0169      * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
0170      * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
0171      *
0172      * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode).
0173      * Returns an unmodifiable singleton instance. Do not delete it.
0174      * @param errorCode Standard ICU error code. Its input value must
0175      *                  pass the U_SUCCESS() test, or else the function returns
0176      *                  immediately. Check for U_FAILURE() on output or use with
0177      *                  function chaining. (See User Guide for details.)
0178      * @return the requested Normalizer2, if successful
0179      * @draft ICU 74
0180      */
0181     static const Normalizer2 *
0182     getNFKCSimpleCasefoldInstance(UErrorCode &errorCode);
0183 #endif  // U_HIDE_DRAFT_API
0184 
0185     /**
0186      * Returns a Normalizer2 instance which uses the specified data file
0187      * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
0188      * and which composes or decomposes text according to the specified mode.
0189      * Returns an unmodifiable singleton instance. Do not delete it.
0190      *
0191      * Use packageName=nullptr for data files that are part of ICU's own data.
0192      * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
0193      * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
0194      * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
0195      *
0196      * @param packageName nullptr for ICU built-in data, otherwise application data package name
0197      * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
0198      * @param mode normalization mode (compose or decompose etc.)
0199      * @param errorCode Standard ICU error code. Its input value must
0200      *                  pass the U_SUCCESS() test, or else the function returns
0201      *                  immediately. Check for U_FAILURE() on output or use with
0202      *                  function chaining. (See User Guide for details.)
0203      * @return the requested Normalizer2, if successful
0204      * @stable ICU 4.4
0205      */
0206     static const Normalizer2 *
0207     getInstance(const char *packageName,
0208                 const char *name,
0209                 UNormalization2Mode mode,
0210                 UErrorCode &errorCode);
0211 
0212     /**
0213      * Convenience overload: normalizes src into a newly created string, returned by value.
0214      * Delegates to normalize(src, dest, errorCode) with a fresh destination string.
0215      * @param src the string to normalize
0216      * @param errorCode Standard ICU error code, forwarded to the three-argument
0217      *                  normalize(). Its input value must pass the U_SUCCESS() test;
0218      *                  check for U_FAILURE() on output. (See User Guide for details.)
0219      * @return a new string holding the normalized form of src
0220      * @stable ICU 4.4
0221      */
0222     UnicodeString
0223     normalize(const UnicodeString &src, UErrorCode &errorCode) const {
0224         UnicodeString normalized;
0225         normalize(src, normalized, errorCode);
0226         return normalized;
0227     }
0228     /**
0229      * Writes the normalized form of the source string to the destination string
0230      * (replacing its contents) and returns the destination string.
0231      * The source and destination strings must be different objects.
0232      * @param src source string
0233      * @param dest destination string; its contents are replaced with the normalized src
0234      * @param errorCode Standard ICU error code. Its input value must
0235      *                  pass the U_SUCCESS() test, or else the function returns
0236      *                  immediately. Check for U_FAILURE() on output or use with
0237      *                  function chaining. (See User Guide for details.)
0238      * @return dest
0239      * @stable ICU 4.4
0240      */
0241     virtual UnicodeString &
0242     normalize(const UnicodeString &src,
0243               UnicodeString &dest,
0244               UErrorCode &errorCode) const = 0;
0245 
0246     /**
0247      * Normalizes a UTF-8 string and optionally records how source substrings
0248      * relate to changed and unchanged result substrings.
0249      *
0250      * Implemented completely for all built-in modes except for FCD.
0251      * The base class implementation converts to & from UTF-16 and does not support edits.
0252      *
0253      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
0254      * @param src       Source UTF-8 string.
0255      * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
0256      *                  sink.Flush() is called at the end.
0257      * @param edits     Records edits for index mapping, working with styled text,
0258      *                  and getting only changes (if any).
0259      *                  The Edits contents are undefined if any error occurs.
0260      *                  This function calls edits->reset() first unless
0261      *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
0262      * @param errorCode Standard ICU error code. Its input value must
0263      *                  pass the U_SUCCESS() test, or else the function returns
0264      *                  immediately. Check for U_FAILURE() on output or use with
0265      *                  function chaining. (See User Guide for details.)
0266      * @stable ICU 60
0267      */
0268     virtual void
0269     normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
0270                   Edits *edits, UErrorCode &errorCode) const;
0271 
0272     /**
0273      * Appends the normalized form of the second string to the first string
0274      * (merging them at the boundary) and returns the first string.
0275      * The result is normalized if the first string was normalized.
0276      * The first and second strings must be different objects.
0277      * @param first string, should be normalized
0278      * @param second string, will be normalized
0279      * @param errorCode Standard ICU error code. Its input value must
0280      *                  pass the U_SUCCESS() test, or else the function returns
0281      *                  immediately. Check for U_FAILURE() on output or use with
0282      *                  function chaining. (See User Guide for details.)
0283      * @return first
0284      * @stable ICU 4.4
0285      */
0286     virtual UnicodeString &
0287     normalizeSecondAndAppend(UnicodeString &first,
0288                              const UnicodeString &second,
0289                              UErrorCode &errorCode) const = 0;
0290     /**
0291      * Appends the second string to the first string
0292      * (merging them at the boundary) and returns the first string.
0293      * The result is normalized if both the strings were normalized.
0294      * The first and second strings must be different objects.
0295      * @param first string, should be normalized
0296      * @param second string, should be normalized
0297      * @param errorCode Standard ICU error code. Its input value must
0298      *                  pass the U_SUCCESS() test, or else the function returns
0299      *                  immediately. Check for U_FAILURE() on output or use with
0300      *                  function chaining. (See User Guide for details.)
0301      * @return first
0302      * @stable ICU 4.4
0303      */
0304     virtual UnicodeString &
0305     append(UnicodeString &first,
0306            const UnicodeString &second,
0307            UErrorCode &errorCode) const = 0;
0308 
0309     /**
0310      * Gets the decomposition mapping of c.
0311      * Roughly equivalent to normalizing the String form of c
0312      * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster. Unlike normalization,
0313      * this function returns false and does not write a string
0314      * if c does not have a decomposition mapping in this instance's data.
0315      * This function is independent of the mode of the Normalizer2.
0316      * @param c code point
0317      * @param decomposition String object which will be set to c's
0318      *                      decomposition mapping, if there is one.
0319      * @return true if c has a decomposition, otherwise false
0320      * @stable ICU 4.6
0321      */
0322     virtual UBool
0323     getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
0324 
0325     /**
0326      * Gets the raw decomposition mapping of c.
0327      *
0328      * This is similar to the getDecomposition() method but returns the
0329      * raw decomposition mapping as specified in UnicodeData.txt or
0330      * (for custom data) in the mapping files processed by the gennorm2 tool.
0331      * By contrast, getDecomposition() returns the processed,
0332      * recursively-decomposed version of this mapping.
0333      *
0334      * When used on a standard NFKC Normalizer2 instance,
0335      * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
0336      *
0337      * When used on a standard NFC Normalizer2 instance,
0338      * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
0339      * in this case, the result contains either one or two code points (=1..4 char16_ts).
0340      *
0341      * This function is independent of the mode of the Normalizer2.
0342      * The default implementation returns false.
0343      * @param c code point
0344      * @param decomposition String object which will be set to c's
0345      *                      raw decomposition mapping, if there is one.
0346      * @return true if c has a decomposition, otherwise false
0347      * @stable ICU 49
0348      */
0349     virtual UBool
0350     getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
0351 
0352     /**
0353      * Performs pairwise composition of a & b and returns the composite if there is one.
0354      *
0355      * Returns a composite code point c only if c has a two-way mapping to a+b.
0356      * In standard Unicode normalization, this means that
0357      * c has a canonical decomposition to a+b
0358      * and c does not have the Full_Composition_Exclusion property.
0359      *
0360      * This function is independent of the mode of the Normalizer2.
0361      * The default implementation returns a negative value.
0362      * @param a A (normalization starter) code point.
0363      * @param b Another code point.
0364      * @return The non-negative composite code point if there is one; otherwise a negative value.
0365      * @stable ICU 49
0366      */
0367     virtual UChar32
0368     composePair(UChar32 a, UChar32 b) const;
0369 
0370     /**
0371      * Gets the combining class of c.
0372      * The default implementation returns 0
0373      * but all standard implementations return the Unicode Canonical_Combining_Class value.
0374      * @param c code point
0375      * @return c's combining class
0376      * @stable ICU 49
0377      */
0378     virtual uint8_t
0379     getCombiningClass(UChar32 c) const;
0380 
0381     /**
0382      * Tests if the string is normalized.
0383      * Internally, in cases where the quickCheck() method would return "maybe"
0384      * (which is only possible for the two COMPOSE modes) this method
0385      * resolves to "yes" or "no" to provide a definitive result,
0386      * at the cost of doing more work in those cases.
0387      * @param s input string
0388      * @param errorCode Standard ICU error code. Its input value must
0389      *                  pass the U_SUCCESS() test, or else the function returns
0390      *                  immediately. Check for U_FAILURE() on output or use with
0391      *                  function chaining. (See User Guide for details.)
0392      * @return true if s is normalized
0393      * @stable ICU 4.4
0394      */
0395     virtual UBool
0396     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
0397     /**
0398      * Tests if the UTF-8 string is normalized.
0399      * Internally, in cases where the quickCheck() method would return "maybe"
0400      * (which is only possible for the two COMPOSE modes) this method
0401      * resolves to "yes" or "no" to provide a definitive result,
0402      * at the cost of doing more work in those cases.
0403      *
0404      * This works for all normalization modes.
0405      * It is optimized for UTF-8 for all built-in modes except for FCD.
0406      * The base class implementation converts to UTF-16 and calls isNormalized().
0407      *
0408      * @param s UTF-8 input string
0409      * @param errorCode Standard ICU error code. Its input value must
0410      *                  pass the U_SUCCESS() test, or else the function returns
0411      *                  immediately. Check for U_FAILURE() on output or use with
0412      *                  function chaining. (See User Guide for details.)
0413      * @return true if s is normalized
0414      * @stable ICU 60
0415      */
0416     virtual UBool
0417     isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
0418 
0419 
0420     /**
0421      * Tests if the string is normalized.
0422      * For the two COMPOSE modes, the result could be "maybe" in cases that
0423      * would take a little more work to resolve definitively.
0424      * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
0425      * combination of quick check + normalization, to avoid
0426      * re-checking the "yes" prefix.
0427      * @param s input string
0428      * @param errorCode Standard ICU error code. Its input value must
0429      *                  pass the U_SUCCESS() test, or else the function returns
0430      *                  immediately. Check for U_FAILURE() on output or use with
0431      *                  function chaining. (See User Guide for details.)
0432      * @return UNormalizationCheckResult
0433      * @stable ICU 4.4
0434      */
0435     virtual UNormalizationCheckResult
0436     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
0437 
0438     /**
0439      * Returns the end of the normalized substring of the input string.
0440      * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
0441      * the substring <code>UnicodeString(s, 0, end)</code>
0442      * will pass the quick check with a "yes" result.
0443      *
0444      * The returned end index is usually one or more characters before the
0445      * "no" or "maybe" character: The end index is at a normalization boundary.
0446      * (See the class documentation for more about normalization boundaries.)
0447      *
0448      * When the goal is a normalized string and most input strings are expected
0449      * to be normalized already, then call this method,
0450      * and if it returns a prefix shorter than the input string,
0451      * copy that prefix and use normalizeSecondAndAppend() for the remainder.
0452      * @param s input string
0453      * @param errorCode Standard ICU error code. Its input value must
0454      *                  pass the U_SUCCESS() test, or else the function returns
0455      *                  immediately. Check for U_FAILURE() on output or use with
0456      *                  function chaining. (See User Guide for details.)
0457      * @return "yes" span end index
0458      * @stable ICU 4.4
0459      */
0460     virtual int32_t
0461     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
0462 
0463     /**
0464      * Tests if the character always has a normalization boundary before it,
0465      * regardless of context.
0466      * If true, then the character does not normalization-interact with
0467      * preceding characters.
0468      * In other words, a string containing this character can be normalized
0469      * by processing portions before this character and starting from this
0470      * character independently.
0471      * This is used for iterative normalization. See the class documentation for details.
0472      * @param c character to test
0473      * @return true if c has a normalization boundary before it
0474      * @stable ICU 4.4
0475      */
0476     virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
0477 
0478     /**
0479      * Tests if the character always has a normalization boundary after it,
0480      * regardless of context.
0481      * If true, then the character does not normalization-interact with
0482      * following characters.
0483      * In other words, a string containing this character can be normalized
0484      * by processing portions up to this character and after this
0485      * character independently.
0486      * This is used for iterative normalization. See the class documentation for details.
0487      * Note that this operation may be significantly slower than hasBoundaryBefore().
0488      * @param c character to test
0489      * @return true if c has a normalization boundary after it
0490      * @stable ICU 4.4
0491      */
0492     virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
0493 
0494     /**
0495      * Tests if the character is normalization-inert.
0496      * If true, then the character does not change, nor normalization-interact with
0497      * preceding or following characters.
0498      * In other words, a string containing this character can be normalized
0499      * by processing portions before this character and after this
0500      * character independently.
0501      * This is used for iterative normalization. See the class documentation for details.
0502      * Note that this operation may be significantly slower than hasBoundaryBefore().
0503      * @param c character to test
0504      * @return true if c is normalization-inert
0505      * @stable ICU 4.4
0506      */
0507     virtual UBool isInert(UChar32 c) const = 0;
0508 };
0509 
0510 /**
0511  * Normalization filtered by a UnicodeSet.
0512  * Normalizes portions of the text contained in the filter set and leaves
0513  * portions not contained in the filter set unchanged.
0514  * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
0515  * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
0516  * This class implements all of (and only) the Normalizer2 API.
0517  * An instance of this class is unmodifiable/immutable but is constructed and
0518  * must be destructed by the owner.
0519  * @stable ICU 4.4
0520  */
0521 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
0522 public:
0523     /**
0524      * Constructs a filtered normalizer wrapping any Normalizer2 instance
0525      * and a filter set.
0526      * Both are aliased and must not be modified or deleted while this object
0527      * is used.
0528      * The filter set should be frozen; otherwise the performance will suffer greatly.
0529      * @param n2 wrapped Normalizer2 instance
0530      * @param filterSet UnicodeSet which determines the characters to be normalized
0531      * @stable ICU 4.4
0532      */
0533     FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
0534             norm2(n2), set(filterSet) {}
0535 
0536     /**
0537      * Destructor.
0538      * @stable ICU 4.4
0539      */
0540     ~FilteredNormalizer2();
0541 
0542     /**
0543      * Writes the normalized form of the source string to the destination string
0544      * (replacing its contents) and returns the destination string.
0545      * The source and destination strings must be different objects.
0546      * @param src source string
0547      * @param dest destination string; its contents is replaced with normalized src
0548      * @param errorCode Standard ICU error code. Its input value must
0549      *                  pass the U_SUCCESS() test, or else the function returns
0550      *                  immediately. Check for U_FAILURE() on output or use with
0551      *                  function chaining. (See User Guide for details.)
0552      * @return dest
0553      * @stable ICU 4.4
0554      */
0555     virtual UnicodeString &
0556     normalize(const UnicodeString &src,
0557               UnicodeString &dest,
0558               UErrorCode &errorCode) const override;
0559 
0560     /**
0561      * Normalizes a UTF-8 string and optionally records how source substrings
0562      * relate to changed and unchanged result substrings.
0563      *
0564      * Implemented completely for most built-in modes except for FCD.
0565      * The base class implementation converts to & from UTF-16 and does not support edits.
0566      *
0567      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
0568      * @param src       Source UTF-8 string.
0569      * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
0570      *                  sink.Flush() is called at the end.
0571      * @param edits     Records edits for index mapping, working with styled text,
0572      *                  and getting only changes (if any).
0573      *                  The Edits contents is undefined if any error occurs.
0574      *                  This function calls edits->reset() first unless
0575      *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
0576      * @param errorCode Standard ICU error code. Its input value must
0577      *                  pass the U_SUCCESS() test, or else the function returns
0578      *                  immediately. Check for U_FAILURE() on output or use with
0579      *                  function chaining. (See User Guide for details.)
0580      * @stable ICU 60
0581      */
0582     virtual void
0583     normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
0584                   Edits *edits, UErrorCode &errorCode) const override;
0585 
0586     /**
0587      * Appends the normalized form of the second string to the first string
0588      * (merging them at the boundary) and returns the first string.
0589      * The result is normalized if the first string was normalized.
0590      * The first and second strings must be different objects.
0591      * @param first string, should be normalized
0592      * @param second string, will be normalized
0593      * @param errorCode Standard ICU error code. Its input value must
0594      *                  pass the U_SUCCESS() test, or else the function returns
0595      *                  immediately. Check for U_FAILURE() on output or use with
0596      *                  function chaining. (See User Guide for details.)
0597      * @return first
0598      * @stable ICU 4.4
0599      */
0600     virtual UnicodeString &
0601     normalizeSecondAndAppend(UnicodeString &first,
0602                              const UnicodeString &second,
0603                              UErrorCode &errorCode) const override;
0604     /**
0605      * Appends the second string to the first string
0606      * (merging them at the boundary) and returns the first string.
0607      * The result is normalized if both the strings were normalized.
0608      * The first and second strings must be different objects.
0609      * @param first string, should be normalized
0610      * @param second string, should be normalized
0611      * @param errorCode Standard ICU error code. Its input value must
0612      *                  pass the U_SUCCESS() test, or else the function returns
0613      *                  immediately. Check for U_FAILURE() on output or use with
0614      *                  function chaining. (See User Guide for details.)
0615      * @return first
0616      * @stable ICU 4.4
0617      */
0618     virtual UnicodeString &
0619     append(UnicodeString &first,
0620            const UnicodeString &second,
0621            UErrorCode &errorCode) const override;
0622 
0623     /**
0624      * Gets the decomposition mapping of c.
0625      * For details see the Normalizer2 base class documentation.
0626      *
0627      * This function is independent of the mode of the Normalizer2.
0628      * @param c code point
0629      * @param decomposition String object which will be set to c's
0630      *                      decomposition mapping, if there is one.
0631      * @return true if c has a decomposition, otherwise false
0632      * @stable ICU 4.6
0633      */
0634     virtual UBool
0635     getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
0636 
0637     /**
0638      * Gets the raw decomposition mapping of c.
0639      * For details see the Normalizer2 base class documentation.
0640      *
0641      * This function is independent of the mode of the Normalizer2.
0642      * @param c code point
0643      * @param decomposition String object which will be set to c's
0644      *                      raw decomposition mapping, if there is one.
0645      * @return true if c has a decomposition, otherwise false
0646      * @stable ICU 49
0647      */
0648     virtual UBool
0649     getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
0650 
0651     /**
0652      * Performs pairwise composition of a and b and returns the composite if there is one.
0653      * For details see the Normalizer2 base class documentation.
0654      *
0655      * This function is independent of the mode of the Normalizer2.
0656      * @param a A (normalization starter) code point.
0657      * @param b Another code point.
0658      * @return The non-negative composite code point if there is one; otherwise a negative value.
0659      * @stable ICU 49
0660      */
0661     virtual UChar32
0662     composePair(UChar32 a, UChar32 b) const override;
0663 
0664     /**
0665      * Gets the combining class of c.
0666      * The default implementation returns 0,
0667      * but all standard implementations return the Unicode Canonical_Combining_Class value.
0668      * @param c code point
0669      * @return c's combining class
0670      * @stable ICU 49
0671      */
0672     virtual uint8_t
0673     getCombiningClass(UChar32 c) const override;
0674 
0675     /**
0676      * Tests if the string is normalized.
0677      * For details see the Normalizer2 base class documentation.
0678      * @param s input string
0679      * @param errorCode Standard ICU error code. Its input value must
0680      *                  pass the U_SUCCESS() test, or else the function returns
0681      *                  immediately. Check for U_FAILURE() on output or use with
0682      *                  function chaining. (See User Guide for details.)
0683      * @return true if s is normalized, otherwise false
0684      * @stable ICU 4.4
0685      */
0686     virtual UBool
0687     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
0688     /**
0689      * Tests if the UTF-8 string is normalized.
0690      * Internally, in cases where the quickCheck() method would return "maybe"
0691      * (which is only possible for the two COMPOSE modes), this method
0692      * resolves to "yes" or "no" to provide a definitive result,
0693      * at the cost of doing more work in those cases.
0694      *
0695      * This works for all normalization modes.
0696      * It is optimized for UTF-8 for all built-in modes except for FCD.
0697      * The base class implementation converts to UTF-16 and calls isNormalized().
0698      *
0699      * @param s UTF-8 input string
0700      * @param errorCode Standard ICU error code. Its input value must
0701      *                  pass the U_SUCCESS() test, or else the function returns
0702      *                  immediately. Check for U_FAILURE() on output or use with
0703      *                  function chaining. (See User Guide for details.)
0704      * @return true if s is normalized, otherwise false
0705      * @stable ICU 60
0706      */
0707     virtual UBool
0708     isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
0709     /**
0710      * Performs a quick check of whether the string is normalized.
0711      * For details see the Normalizer2 base class documentation.
0712      * @param s input string
0713      * @param errorCode Standard ICU error code. Its input value must
0714      *                  pass the U_SUCCESS() test, or else the function returns
0715      *                  immediately. Check for U_FAILURE() on output or use with
0716      *                  function chaining. (See User Guide for details.)
0717      * @return the quick check result, as a UNormalizationCheckResult
0718      * @stable ICU 4.4
0719      */
0720     virtual UNormalizationCheckResult
0721     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
0722     /**
0723      * Returns the end of the normalized substring of the input string.
0724      * For details see the Normalizer2 base class documentation.
0725      * @param s input string
0726      * @param errorCode Standard ICU error code. Its input value must
0727      *                  pass the U_SUCCESS() test, or else the function returns
0728      *                  immediately. Check for U_FAILURE() on output or use with
0729      *                  function chaining. (See User Guide for details.)
0730      * @return end index of the "yes" span
0731      * @stable ICU 4.4
0732      */
0733     virtual int32_t
0734     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
0735 
0736     /**
0737      * Tests if the character always has a normalization boundary before it,
0738      * regardless of context.
0739      * For details see the Normalizer2 base class documentation.
0740      * @param c code point to test
0741      * @return true if c has a normalization boundary before it
0742      * @stable ICU 4.4
0743      */
0744     virtual UBool hasBoundaryBefore(UChar32 c) const override;
0745 
0746     /**
0747      * Tests if the character always has a normalization boundary after it,
0748      * regardless of context.
0749      * For details see the Normalizer2 base class documentation.
0750      * @param c code point to test
0751      * @return true if c has a normalization boundary after it
0752      * @stable ICU 4.4
0753      */
0754     virtual UBool hasBoundaryAfter(UChar32 c) const override;
0755 
0756     /**
0757      * Tests if the character is normalization-inert.
0758      * For details see the Normalizer2 base class documentation.
0759      * @param c code point to test
0760      * @return true if c is normalization-inert
0761      * @stable ICU 4.4
0762      */
0763     virtual UBool isInert(UChar32 c) const override;
0764 private:
0765     UnicodeString &  // private, non-virtual overload of normalize() with an extra spanCondition argument
0766     normalize(const UnicodeString &src,
0767               UnicodeString &dest,
0768               USetSpanCondition spanCondition,  // NOTE(review): presumably how to start spanning src against `set`; confirm in the implementation
0769               UErrorCode &errorCode) const;
0770 
0771     void  // private, non-virtual overload of normalizeUTF8() taking a raw pointer/length pair and a spanCondition
0772     normalizeUTF8(uint32_t options, const char *src, int32_t length,
0773                   ByteSink &sink, Edits *edits,
0774                   USetSpanCondition spanCondition,  // NOTE(review): presumably how to start spanning src against `set`; confirm in the implementation
0775                   UErrorCode &errorCode) const;
0776 
0777     UnicodeString &  // private, non-virtual overload with an extra doNormalize flag
0778     normalizeSecondAndAppend(UnicodeString &first,
0779                              const UnicodeString &second,
0780                              UBool doNormalize,  // NOTE(review): presumably true for normalizeSecondAndAppend() and false for append(); confirm in the implementation
0781                              UErrorCode &errorCode) const;
0782 
0783     const Normalizer2 &norm2;  // held by const reference; NOTE(review): presumably the wrapped normalizer, which must outlive this object; confirm against the constructor
0784     const UnicodeSet &set;  // held by const reference; NOTE(review): presumably the filter set, which must outlive this object; confirm against the constructor
0785 };
0786 
0787 U_NAMESPACE_END
0788 
0789 #endif  // !UCONFIG_NO_NORMALIZATION
0790 
0791 #endif /* U_SHOW_CPLUSPLUS_API */
0792 
0793 #endif  // __NORMALIZER2_H__