include/unicode/normalizer2.h

0001 // © 2016 and later: Unicode, Inc. and others.
0002 // License & terms of use: http://www.unicode.org/copyright.html
0003 /*
0004 *******************************************************************************
0005 *
0006 *   Copyright (C) 2009-2013, International Business Machines
0007 *   Corporation and others.  All Rights Reserved.
0008 *
0009 *******************************************************************************
0010 *   file name:  normalizer2.h
0011 *   encoding:   UTF-8
0012 *   tab size:   8 (not used)
0013 *   indentation:4
0014 *
0015 *   created on: 2009nov22
0016 *   created by: Markus W. Scherer
0017 */
0018
0019 #ifndef __NORMALIZER2_H__
0020 #define __NORMALIZER2_H__
0021
0022 /**
0023  * \file
0024  * \brief C++ API: New API for Unicode Normalization.
0025  */
0026
0027 #include "unicode/utypes.h"
0028
0029 #if U_SHOW_CPLUSPLUS_API
0030
0031 #if !UCONFIG_NO_NORMALIZATION
0032
0033 #include "unicode/stringpiece.h"
0034 #include "unicode/uniset.h"
0035 #include "unicode/unistr.h"
0036 #include "unicode/unorm2.h"
0037
0038 U_NAMESPACE_BEGIN
0039
0040 class ByteSink;
0041
0042 /**
0043  * Unicode normalization functionality for standard Unicode normalization or
0044  * for using custom mapping tables.
0045  * All instances of this class are unmodifiable/immutable.
0046  * Instances returned by getInstance() are singletons that must not be deleted by the caller.
0047  * The Normalizer2 class is not intended for public subclassing.
0048  *
0049  * The primary functions are to produce a normalized string and to detect whether
0050  * a string is already normalized.
0051  * The most commonly used normalization forms are those defined in
0052  * http://www.unicode.org/unicode/reports/tr15/
0053  * However, this API supports additional normalization forms for specialized purposes.
0054  * For example, NFKC_Casefold is provided via
0055  * getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode) and can be used in implementations of UTS #46.
0056  *
0057  * Not only are the standard compose and decompose modes supplied,
0058  * but additional modes are provided as documented in the UNormalization2Mode enum.
0059  *
0060  * Some of the functions in this class identify normalization boundaries.
0061  * At a normalization boundary, the portions of the string
0062  * before it and starting from it do not interact and can be handled independently.
0063  *
0064  * The spanQuickCheckYes() stops at a normalization boundary.
0065  * When the goal is a normalized string, then the text before the boundary
0066  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
0067  *
0068  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
0069  * a character is guaranteed to be at a normalization boundary,
0070  * regardless of context.
0071  * This is used for moving from one normalization boundary to the next
0072  * or preceding boundary, and for performing iterative normalization.
0073  *
0074  * Iterative normalization is useful when only a small portion of a
0075  * longer string needs to be processed.
0076  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
0077  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
0078  * (to process only the substring for which sort key bytes are computed).
0079  *
0080  * The set of normalization boundaries returned by these functions may not be
0081  * complete: There may be more boundaries that could be returned.
0082  * Different functions may return different boundaries.
0083  * @stable ICU 4.4
0084  */
0085 class U_COMMON_API Normalizer2 : public UObject {
0086 public:
0087     /**
0088      * Destructor. Singletons returned by the static get...Instance() factories must not be deleted.
0089      * @stable ICU 4.4
0090      */
0091     ~Normalizer2();
0092
0093     /**
0094      * Returns a Normalizer2 instance for Unicode NFC normalization.
0095      * Same as getInstance(nullptr, "nfc", UNORM2_COMPOSE, errorCode).
0096      * Returns an unmodifiable singleton instance. Do not delete it.
0097      * @param errorCode Standard ICU error code. Its input value must
0098      *                  pass the U_SUCCESS() test, or else the function returns
0099      *                  immediately. Check for U_FAILURE() on output or use with
0100      *                  function chaining. (See User Guide for details.)
0101      * @return the requested Normalizer2, if successful
0102      * @stable ICU 49
0103      */
0104     static const Normalizer2 *
0105     getNFCInstance(UErrorCode &errorCode);
0106
0107     /**
0108      * Returns a Normalizer2 instance for Unicode NFD normalization.
0109      * Same as getInstance(nullptr, "nfc", UNORM2_DECOMPOSE, errorCode): NFD uses the "nfc" data in decompose mode.
0110      * Returns an unmodifiable singleton instance. Do not delete it.
0111      * @param errorCode Standard ICU error code. Its input value must
0112      *                  pass the U_SUCCESS() test, or else the function returns
0113      *                  immediately. Check for U_FAILURE() on output or use with
0114      *                  function chaining. (See User Guide for details.)
0115      * @return the requested Normalizer2, if successful
0116      * @stable ICU 49
0117      */
0118     static const Normalizer2 *
0119     getNFDInstance(UErrorCode &errorCode);
0120
0121     /**
0122      * Returns a Normalizer2 instance for Unicode NFKC normalization.
0123      * Same as getInstance(nullptr, "nfkc", UNORM2_COMPOSE, errorCode).
0124      * Returns an unmodifiable singleton instance. Do not delete it.
0125      * @param errorCode Standard ICU error code. Its input value must
0126      *                  pass the U_SUCCESS() test, or else the function returns
0127      *                  immediately. Check for U_FAILURE() on output or use with
0128      *                  function chaining. (See User Guide for details.)
0129      * @return the requested Normalizer2, if successful
0130      * @stable ICU 49
0131      */
0132     static const Normalizer2 *
0133     getNFKCInstance(UErrorCode &errorCode);
0134
0135     /**
0136      * Returns a Normalizer2 instance for Unicode NFKD normalization.
0137      * Same as getInstance(nullptr, "nfkc", UNORM2_DECOMPOSE, errorCode): NFKD uses the "nfkc" data in decompose mode.
0138      * Returns an unmodifiable singleton instance. Do not delete it.
0139      * @param errorCode Standard ICU error code. Its input value must
0140      *                  pass the U_SUCCESS() test, or else the function returns
0141      *                  immediately. Check for U_FAILURE() on output or use with
0142      *                  function chaining. (See User Guide for details.)
0143      * @return the requested Normalizer2, if successful
0144      * @stable ICU 49
0145      */
0146     static const Normalizer2 *
0147     getNFKDInstance(UErrorCode &errorCode);
0148
0149     /**
0150      * Returns a Normalizer2 instance for Unicode toNFKC_Casefold() normalization
0151      * which is equivalent to applying the NFKC_Casefold mappings and then NFC.
0152      * See https://www.unicode.org/reports/tr44/#NFKC_Casefold
0153      *
0154      * Same as getInstance(nullptr, "nfkc_cf", UNORM2_COMPOSE, errorCode).
0155      * Returns an unmodifiable singleton instance. Do not delete it.
0156      * @param errorCode Standard ICU error code. Its input value must
0157      *                  pass the U_SUCCESS() test, or else the function returns
0158      *                  immediately. Check for U_FAILURE() on output or use with
0159      *                  function chaining. (See User Guide for details.)
0160      * @return the requested Normalizer2, if successful
0161      * @stable ICU 49
0162      */
0163     static const Normalizer2 *
0164     getNFKCCasefoldInstance(UErrorCode &errorCode);
0165
0166     /**
0167      * Returns a Normalizer2 instance for a variant of Unicode toNFKC_Casefold() normalization
0168      * which is equivalent to applying the NFKC_Simple_Casefold mappings and then NFC.
0169      * See https://www.unicode.org/reports/tr44/#NFKC_Simple_Casefold
0170      *
0171      * Same as getInstance(nullptr, "nfkc_scf", UNORM2_COMPOSE, errorCode).
0172      * Returns an unmodifiable singleton instance. Do not delete it.
0173      * @param errorCode Standard ICU error code. Its input value must
0174      *                  pass the U_SUCCESS() test, or else the function returns
0175      *                  immediately. Check for U_FAILURE() on output or use with
0176      *                  function chaining. (See User Guide for details.)
0177      * @return the requested Normalizer2, if successful
0178      * @stable ICU 74
0179      */
0180     static const Normalizer2 *
0181     getNFKCSimpleCasefoldInstance(UErrorCode &errorCode);
0182
0183     /**
0184      * Returns a Normalizer2 instance which uses the specified data file
0185      * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
0186      * and which composes or decomposes text according to the specified mode.
0187      * Returns an unmodifiable singleton instance. Do not delete it.
0188      *
0189      * Use packageName=nullptr for data files that are part of ICU's own data.
0190      * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
0191      * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
0192      * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
0193      * Use name="nfkc_scf" and UNORM2_COMPOSE for the NFKC_Simple_Casefold variant.
0194      * @param packageName nullptr for ICU built-in data, otherwise application data package name
0195      * @param name "nfc" or "nfkc" or "nfkc_cf" or "nfkc_scf" or name of custom data file
0196      * @param mode normalization mode (compose or decompose etc.)
0197      * @param errorCode Standard ICU error code. Its input value must
0198      *                  pass the U_SUCCESS() test, or else the function returns
0199      *                  immediately. Check for U_FAILURE() on output or use with
0200      *                  function chaining. (See User Guide for details.)
0201      * @return the requested Normalizer2, if successful
0202      * @stable ICU 4.4
0203      */
0204     static const Normalizer2 *
0205     getInstance(const char *packageName,
0206                 const char *name,
0207                 UNormalization2Mode mode,
0208                 UErrorCode &errorCode);
0209
0210     /**
0211      * Returns the normalized form of the source string.
0212      * @param src source string
0213      * @param errorCode Standard ICU error code. Its input value must
0214      *                  pass the U_SUCCESS() test, or else the function returns
0215      *                  immediately. Check for U_FAILURE() on output or use with
0216      *                  function chaining. (See User Guide for details.)
0217      * @return normalized src
0218      * @stable ICU 4.4
0219      */
0220     UnicodeString
0221     normalize(const UnicodeString &src, UErrorCode &errorCode) const {
0222         UnicodeString result;
0223         normalize(src, result, errorCode);  // delegate to the pure virtual three-argument overload
0224         return result;
0225     }
0226     /**
0227      * Writes the normalized form of the source string to the destination string
0228      * (replacing its contents) and returns the destination string.
0229      * The source and destination strings must be different objects.
0230      * @param src source string
0231      * @param dest destination string; its contents are replaced with normalized src
0232      * @param errorCode Standard ICU error code. Its input value must
0233      *                  pass the U_SUCCESS() test, or else the function returns
0234      *                  immediately. Check for U_FAILURE() on output or use with
0235      *                  function chaining. (See User Guide for details.)
0236      * @return dest
0237      * @stable ICU 4.4
0238      */
0239     virtual UnicodeString &
0240     normalize(const UnicodeString &src,
0241               UnicodeString &dest,
0242               UErrorCode &errorCode) const = 0;
0243
0244     /**
0245      * Normalizes a UTF-8 string and optionally records how source substrings
0246      * relate to changed and unchanged result substrings.
0247      *
0248      * Implemented completely for all built-in modes except for FCD.
0249      * The base class implementation converts to & from UTF-16 and does not support edits.
0250      *
0251      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
0252      * @param src       Source UTF-8 string.
0253      * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
0254      *                  sink.Flush() is called at the end.
0255      * @param edits     Records edits for index mapping, working with styled text,
0256      *                  and getting only changes (if any).
0257      *                  The Edits contents are undefined if any error occurs.
0258      *                  This function calls edits->reset() first unless
0259      *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
0260      * @param errorCode Standard ICU error code. Its input value must
0261      *                  pass the U_SUCCESS() test, or else the function returns
0262      *                  immediately. Check for U_FAILURE() on output or use with
0263      *                  function chaining. (See User Guide for details.)
0264      * @stable ICU 60
0265      */
0266     virtual void
0267     normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
0268                   Edits *edits, UErrorCode &errorCode) const;
0269
0270     /**
0271      * Appends the normalized form of the second string to the first string
0272      * (merging them at the boundary) and returns the first string.
0273      * The result is normalized if the first string was normalized.
0274      * The first and second strings must be different objects.
0275      * @param first string, should be normalized
0276      * @param second string, will be normalized
0277      * @param errorCode Standard ICU error code. Its input value must
0278      *                  pass the U_SUCCESS() test, or else the function returns
0279      *                  immediately. Check for U_FAILURE() on output or use with
0280      *                  function chaining. (See User Guide for details.)
0281      * @return first
0282      * @stable ICU 4.4
0283      */
0284     virtual UnicodeString &
0285     normalizeSecondAndAppend(UnicodeString &first,
0286                              const UnicodeString &second,
0287                              UErrorCode &errorCode) const = 0;
0288     /**
0289      * Appends the second string to the first string
0290      * (merging them at the boundary) and returns the first string.
0291      * The result is normalized if both the strings were normalized.
0292      * The first and second strings must be different objects.
0293      * @param first string, should be normalized
0294      * @param second string, should be normalized
0295      * @param errorCode Standard ICU error code. Its input value must
0296      *                  pass the U_SUCCESS() test, or else the function returns
0297      *                  immediately. Check for U_FAILURE() on output or use with
0298      *                  function chaining. (See User Guide for details.)
0299      * @return first
0300      * @stable ICU 4.4
0301      */
0302     virtual UnicodeString &
0303     append(UnicodeString &first,
0304            const UnicodeString &second,
0305            UErrorCode &errorCode) const = 0;
0306
0307     /**
0308      * Gets the decomposition mapping of c.
0309      * Roughly equivalent to normalizing the String form of c
0310      * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, and except that this function
0311      * returns false and does not write a string
0312      * if c does not have a decomposition mapping in this instance's data.
0313      * This function is independent of the mode of the Normalizer2.
0314      * @param c code point
0315      * @param decomposition String object which will be set to c's
0316      *                      decomposition mapping, if there is one.
0317      * @return true if c has a decomposition, otherwise false
0318      * @stable ICU 4.6
0319      */
0320     virtual UBool
0321     getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
0322
0323     /**
0324      * Gets the raw decomposition mapping of c.
0325      *
0326      * This is similar to the getDecomposition() method but returns the
0327      * raw decomposition mapping as specified in UnicodeData.txt or
0328      * (for custom data) in the mapping files processed by the gennorm2 tool.
0329      * By contrast, getDecomposition() returns the processed,
0330      * recursively-decomposed version of this mapping.
0331      *
0332      * When used on a standard NFKC Normalizer2 instance,
0333      * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
0334      *
0335      * When used on a standard NFC Normalizer2 instance,
0336      * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
0337      * in this case, the result contains either one or two code points (=1..4 char16_ts).
0338      *
0339      * This function is independent of the mode of the Normalizer2.
0340      * The default implementation returns false.
0341      * @param c code point
0342      * @param decomposition String object which will be set to c's
0343      *                      raw decomposition mapping, if there is one.
0344      * @return true if c has a decomposition, otherwise false
0345      * @stable ICU 49
0346      */
0347     virtual UBool
0348     getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
0349
0350     /**
0351      * Performs pairwise composition of a & b and returns the composite if there is one.
0352      *
0353      * Returns a composite code point c only if c has a two-way mapping to a+b.
0354      * In standard Unicode normalization, this means that
0355      * c has a canonical decomposition to a+b
0356      * and c does not have the Full_Composition_Exclusion property.
0357      *
0358      * This function is independent of the mode of the Normalizer2.
0359      * The default implementation returns a negative value.
0360      * @param a A (normalization starter) code point.
0361      * @param b Another code point.
0362      * @return The non-negative composite code point if there is one; otherwise a negative value.
0363      * @stable ICU 49
0364      */
0365     virtual UChar32
0366     composePair(UChar32 a, UChar32 b) const;
0367
0368     /**
0369      * Gets the combining class of c.
0370      * The default implementation returns 0
0371      * but all standard implementations return the Unicode Canonical_Combining_Class value.
0372      * @param c code point
0373      * @return c's combining class
0374      * @stable ICU 49
0375      */
0376     virtual uint8_t
0377     getCombiningClass(UChar32 c) const;
0378
0379     /**
0380      * Tests if the string is normalized.
0381      * Internally, in cases where the quickCheck() method would return "maybe"
0382      * (which is only possible for the two COMPOSE modes) this method
0383      * resolves to "yes" or "no" to provide a definitive result,
0384      * at the cost of doing more work in those cases.
0385      * @param s input string
0386      * @param errorCode Standard ICU error code. Its input value must
0387      *                  pass the U_SUCCESS() test, or else the function returns
0388      *                  immediately. Check for U_FAILURE() on output or use with
0389      *                  function chaining. (See User Guide for details.)
0390      * @return true if s is normalized
0391      * @stable ICU 4.4
0392      */
0393     virtual UBool
0394     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
0395     /**
0396      * Tests if the UTF-8 string is normalized.
0397      * Internally, in cases where the quickCheck() method would return "maybe"
0398      * (which is only possible for the two COMPOSE modes) this method
0399      * resolves to "yes" or "no" to provide a definitive result,
0400      * at the cost of doing more work in those cases.
0401      *
0402      * This works for all normalization modes.
0403      * It is optimized for UTF-8 for all built-in modes except for FCD.
0404      * The base class implementation converts to UTF-16 and calls isNormalized().
0405      *
0406      * @param s UTF-8 input string
0407      * @param errorCode Standard ICU error code. Its input value must
0408      *                  pass the U_SUCCESS() test, or else the function returns
0409      *                  immediately. Check for U_FAILURE() on output or use with
0410      *                  function chaining. (See User Guide for details.)
0411      * @return true if s is normalized
0412      * @stable ICU 60
0413      */
0414     virtual UBool
0415     isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
0416
0417
0418     /**
0419      * Tests if the string is normalized.
0420      * For the two COMPOSE modes, the result could be "maybe" in cases that
0421      * would take a little more work to resolve definitively.
0422      * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
0423      * combination of quick check + normalization, to avoid
0424      * re-checking the "yes" prefix.
0425      * @param s input string
0426      * @param errorCode Standard ICU error code. Its input value must
0427      *                  pass the U_SUCCESS() test, or else the function returns
0428      *                  immediately. Check for U_FAILURE() on output or use with
0429      *                  function chaining. (See User Guide for details.)
0430      * @return UNormalizationCheckResult: "yes", "no", or (COMPOSE modes only) "maybe"
0431      * @stable ICU 4.4
0432      */
0433     virtual UNormalizationCheckResult
0434     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
0435
0436     /**
0437      * Returns the end of the normalized substring of the input string.
0438      * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
0439      * the substring <code>UnicodeString(s, 0, end)</code>
0440      * will pass the quick check with a "yes" result.
0441      *
0442      * The returned end index is usually one or more characters before the
0443      * "no" or "maybe" character: The end index is at a normalization boundary.
0444      * (See the class documentation for more about normalization boundaries.)
0445      *
0446      * When the goal is a normalized string and most input strings are expected
0447      * to be normalized already, then call this method,
0448      * and if it returns a prefix shorter than the input string,
0449      * copy that prefix and use normalizeSecondAndAppend() for the remainder.
0450      * @param s input string
0451      * @param errorCode Standard ICU error code. Its input value must
0452      *                  pass the U_SUCCESS() test, or else the function returns
0453      *                  immediately. Check for U_FAILURE() on output or use with
0454      *                  function chaining. (See User Guide for details.)
0455      * @return "yes" span end index
0456      * @stable ICU 4.4
0457      */
0458     virtual int32_t
0459     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
0460
0461     /**
0462      * Tests if the character always has a normalization boundary before it,
0463      * regardless of context.
0464      * If true, then the character does not normalization-interact with
0465      * preceding characters.
0466      * In other words, a string containing this character can be normalized
0467      * by processing portions before this character and starting from this
0468      * character independently.
0469      * This is used for iterative normalization. See the class documentation for details.
0470      * @param c character to test
0471      * @return true if c has a normalization boundary before it
0472      * @stable ICU 4.4
0473      */
0474     virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
0475
0476     /**
0477      * Tests if the character always has a normalization boundary after it,
0478      * regardless of context.
0479      * If true, then the character does not normalization-interact with
0480      * following characters.
0481      * In other words, a string containing this character can be normalized
0482      * by processing portions up to this character and after this
0483      * character independently.
0484      * This is used for iterative normalization. See the class documentation for details.
0485      * Note that this operation may be significantly slower than hasBoundaryBefore().
0486      * @param c character to test
0487      * @return true if c has a normalization boundary after it
0488      * @stable ICU 4.4
0489      */
0490     virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
0491
0492     /**
0493      * Tests if the character is normalization-inert.
0494      * If true, then the character does not change, nor normalization-interact with
0495      * preceding or following characters.
0496      * In other words, a string containing this character can be normalized
0497      * by processing portions before this character and after this
0498      * character independently.
0499      * This is used for iterative normalization. See the class documentation for details.
0500      * Note that this operation may be significantly slower than hasBoundaryBefore().
0501      * @param c character to test
0502      * @return true if c is normalization-inert
0503      * @stable ICU 4.4
0504      */
0505     virtual UBool isInert(UChar32 c) const = 0;
0506 };
0507
0508 /**
0509  * Normalization filtered by a UnicodeSet.
0510  * Normalizes portions of the text contained in the filter set and leaves
0511  * portions not contained in the filter set unchanged.
0512  * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
0513  * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
0514  * This class implements all of (and only) the Normalizer2 API.
0515  * An instance of this class is unmodifiable/immutable but is constructed and
0516  * must be destructed by the owner.
0517  * @stable ICU 4.4
0518  */
0519 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
0520 public:
0521     /**
0522      * Constructs a filtered normalizer wrapping any Normalizer2 instance
0523      * and a filter set.
0524      * Both are aliased and must not be modified or deleted while this object
0525      * is used.
0526      * The filter set should be frozen; otherwise the performance will suffer greatly.
0527      * @param n2 wrapped Normalizer2 instance
0528      * @param filterSet UnicodeSet which determines the characters to be normalized
0529      * @stable ICU 4.4
0530      */
0531     FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
0532             norm2(n2), set(filterSet) {}
0533
0534     /**
0535      * Destructor.
0536      * @stable ICU 4.4
0537      */
0538     ~FilteredNormalizer2();
0539
0540     /**
0541      * Writes the normalized form of the source string to the destination string
0542      * (replacing its contents) and returns the destination string.
0543      * The source and destination strings must be different objects.
0544      * @param src source string
0545      * @param dest destination string; its contents is replaced with normalized src
0546      * @param errorCode Standard ICU error code. Its input value must
0547      *                  pass the U_SUCCESS() test, or else the function returns
0548      *                  immediately. Check for U_FAILURE() on output or use with
0549      *                  function chaining. (See User Guide for details.)
0550      * @return dest
0551      * @stable ICU 4.4
0552      */
0553     virtual UnicodeString &
0554     normalize(const UnicodeString &src,
0555               UnicodeString &dest,
0556               UErrorCode &errorCode) const override;
0557
0558     /**
0559      * Normalizes a UTF-8 string and optionally records how source substrings
0560      * relate to changed and unchanged result substrings.
0561      *
0562      * Implemented completely for most built-in modes except for FCD.
0563      * The base class implementation converts to & from UTF-16 and does not support edits.
0564      *
0565      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
0566      * @param src       Source UTF-8 string.
0567      * @param sink      A ByteSink to which the normalized UTF-8 result string is written.
0568      *                  sink.Flush() is called at the end.
0569      * @param edits     Records edits for index mapping, working with styled text,
0570      *                  and getting only changes (if any).
0571      *                  The Edits contents is undefined if any error occurs.
0572      *                  This function calls edits->reset() first unless
0573      *                  options includes U_EDITS_NO_RESET. edits can be nullptr.
0574      * @param errorCode Standard ICU error code. Its input value must
0575      *                  pass the U_SUCCESS() test, or else the function returns
0576      *                  immediately. Check for U_FAILURE() on output or use with
0577      *                  function chaining. (See User Guide for details.)
0578      * @stable ICU 60
0579      */
0580     virtual void
0581     normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
0582                   Edits *edits, UErrorCode &errorCode) const override;
0583
0584     /**
0585      * Appends the normalized form of the second string to the first string
0586      * (merging them at the boundary) and returns the first string.
0587      * The result is normalized if the first string was normalized.
0588      * The first and second strings must be different objects.
0589      * @param first string, should be normalized
0590      * @param second string, will be normalized
0591      * @param errorCode Standard ICU error code. Its input value must
0592      *                  pass the U_SUCCESS() test, or else the function returns
0593      *                  immediately. Check for U_FAILURE() on output or use with
0594      *                  function chaining. (See User Guide for details.)
0595      * @return first
0596      * @stable ICU 4.4
0597      */
0598     virtual UnicodeString &
0599     normalizeSecondAndAppend(UnicodeString &first,
0600                              const UnicodeString &second,
0601                              UErrorCode &errorCode) const override;
0602     /**
0603      * Appends the second string to the first string
0604      * (merging them at the boundary) and returns the first string.
0605      * The result is normalized if both the strings were normalized.
0606      * The first and second strings must be different objects.
0607      * @param first string, should be normalized
0608      * @param second string, should be normalized
0609      * @param errorCode Standard ICU error code. Its input value must
0610      *                  pass the U_SUCCESS() test, or else the function returns
0611      *                  immediately. Check for U_FAILURE() on output or use with
0612      *                  function chaining. (See User Guide for details.)
0613      * @return first
0614      * @stable ICU 4.4
0615      */
0616     virtual UnicodeString &
0617     append(UnicodeString &first,
0618            const UnicodeString &second,
0619            UErrorCode &errorCode) const override;
0620
0621     /**
0622      * Gets the decomposition mapping of the code point c.
0623      * For details see the Normalizer2 base class documentation.
0624      *
0625      * This function is independent of the mode of the Normalizer2.
0626      * @param c Code point whose decomposition mapping is requested.
0627      * @param decomposition Output string: set to c's decomposition
0628      *                      mapping, if there is one.
0629      * @return true if c has a decomposition, otherwise false.
0630      * @stable ICU 4.6
0631      */
0632     virtual UBool
0633     getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
0634
0635     /**
0636      * Gets the raw decomposition mapping of the code point c.
0637      * For details see the Normalizer2 base class documentation.
0638      *
0639      * This function is independent of the mode of the Normalizer2.
0640      * @param c Code point whose raw decomposition mapping is requested.
0641      * @param decomposition Output string: set to c's raw decomposition
0642      *                      mapping, if there is one.
0643      * @return true if c has a decomposition, otherwise false.
0644      * @stable ICU 49
0645      */
0646     virtual UBool
0647     getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
0648
0649     /**
0650      * Performs pairwise composition of the code points a and b and returns the composite, if any.
0651      * For details see the Normalizer2 base class documentation.
0652      *
0653      * This function is independent of the mode of the Normalizer2.
0654      * @param a A (normalization starter) code point; the first of the pair.
0655      * @param b Another code point; the second of the pair.
0656      * @return The non-negative composite code point if there is one; otherwise a negative value.
0657      * @stable ICU 49
0658      */
0659     virtual UChar32
0660     composePair(UChar32 a, UChar32 b) const override;
0661
0662     /**
0663      * Gets the combining class of the code point c.
0664      * The default implementation returns 0,
0665      * but all standard implementations return the Unicode Canonical_Combining_Class value.
0666      * @param c Code point whose combining class is requested.
0667      * @return The combining class of c, as an unsigned 8-bit value.
0668      * @stable ICU 49
0669      */
0670     virtual uint8_t
0671     getCombiningClass(UChar32 c) const override;
0672
0673     /**
0674      * Tests whether the string s is normalized.
0675      * For details see the Normalizer2 base class documentation.
0676      * @param s Input string to test; it is passed read-only.
0677      * @param errorCode Standard ICU error code. Its input value must
0678      *                  pass the U_SUCCESS() test, or else the function returns
0679      *                  immediately. Check for U_FAILURE() on output or use with
0680      *                  function chaining. (See User Guide for details.)
0681      * @return true if s is normalized.
0682      * @stable ICU 4.4
0683      */
0684     virtual UBool
0685     isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
0686     /**
0687      * Tests whether the UTF-8 string s is normalized.
0688      * Internally, in cases where the quickCheck() method would return "maybe"
0689      * (which is only possible for the two COMPOSE modes), this method
0690      * resolves the answer to "yes" or "no" so that the result is definitive,
0691      * at the cost of doing more work in those cases.
0692      *
0693      * This works for all normalization modes.
0694      * It is optimized for UTF-8 for all built-in modes except for FCD.
0695      * The base class implementation converts to UTF-16 and calls isNormalized().
0696      *
0697      * @param s UTF-8 input string to test, passed as a StringPiece.
0698      * @param errorCode Standard ICU error code. Its input value must
0699      *                  pass the U_SUCCESS() test, or else the function returns
0700      *                  immediately. Check for U_FAILURE() on output or use with
0701      *                  function chaining. (See User Guide for details.)
0702      * @return true if s is normalized.
0703      * @stable ICU 60
0704      */
0705     virtual UBool
0706     isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
0707     /**
0708      * Quick-checks whether the string s is normalized; the outcome is a UNormalizationCheckResult, not a boolean.
0709      * For details see the Normalizer2 base class documentation.
0710      * @param s Input string to check; it is passed read-only.
0711      * @param errorCode Standard ICU error code. Its input value must
0712      *                  pass the U_SUCCESS() test, or else the function returns
0713      *                  immediately. Check for U_FAILURE() on output or use with
0714      *                  function chaining. (See User Guide for details.)
0715      * @return The UNormalizationCheckResult value describing the outcome of the quick check.
0716      * @stable ICU 4.4
0717      */
0718     virtual UNormalizationCheckResult
0719     quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
0720     /**
0721      * Returns the end index of the normalized substring of the input string s.
0722      * For details see the Normalizer2 base class documentation.
0723      * @param s Input string to scan; it is passed read-only.
0724      * @param errorCode Standard ICU error code. Its input value must
0725      *                  pass the U_SUCCESS() test, or else the function returns
0726      *                  immediately. Check for U_FAILURE() on output or use with
0727      *                  function chaining. (See User Guide for details.)
0728      * @return The end index of the "yes" span (the normalized substring) in s.
0729      * @stable ICU 4.4
0730      */
0731     virtual int32_t
0732     spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
0733
0734     /**
0735      * Tests whether the code point c always has a normalization boundary
0736      * before it, regardless of the surrounding context.
0737      * For details see the Normalizer2 base class documentation.
0738      * @param c Code point to test.
0739      * @return true if c always has a normalization boundary before it.
0740      * @stable ICU 4.4
0741      */
0742     virtual UBool hasBoundaryBefore(UChar32 c) const override;
0743
0744     /**
0745      * Tests whether the code point c always has a normalization boundary
0746      * after it, regardless of the surrounding context.
0747      * For details see the Normalizer2 base class documentation.
0748      * @param c Code point to test.
0749      * @return true if c always has a normalization boundary after it.
0750      * @stable ICU 4.4
0751      */
0752     virtual UBool hasBoundaryAfter(UChar32 c) const override;
0753
0754     /**
0755      * Tests whether the code point c is normalization-inert.
0756      * For details see the Normalizer2 base class documentation.
0757      * @param c Code point to test.
0758      * @return true if c is normalization-inert.
0759      * @stable ICU 4.4
0760      */
0761     virtual UBool isInert(UChar32 c) const override;
0762 private:
0763     UnicodeString &  // Private helper overload of normalize() that takes an explicit span condition. NOTE(review): presumably returns dest - confirm in the implementation.
0764     normalize(const UnicodeString &src,  // input string, passed read-only
0765               UnicodeString &dest,  // output string, passed by non-const reference
0766               USetSpanCondition spanCondition,  // NOTE(review): presumably the span condition applied first when splitting src against the filter set - confirm
0767               UErrorCode &errorCode) const;  // ICU error code
0768
0769     void  // Private helper overload of normalizeUTF8() that takes a raw pointer/length input and an explicit span condition.
0770     normalizeUTF8(uint32_t options, const char *src, int32_t length,  // NOTE(review): src/length presumably describe the UTF-8 input in bytes; meaning of options not visible here - confirm
0771                   ByteSink &sink, Edits *edits,  // sink receives the output; edits is a pointer - NOTE(review): presumably optional (may be null), confirm
0772                   USetSpanCondition spanCondition,  // NOTE(review): presumably the span condition applied first against the filter set - confirm
0773                   UErrorCode &errorCode) const;  // ICU error code
0774
0775     UnicodeString &  // Private overload with an extra doNormalize flag. NOTE(review): presumably the shared implementation behind the public normalizeSecondAndAppend() and append() - confirm.
0776     normalizeSecondAndAppend(UnicodeString &first,  // string that is appended to, passed by non-const reference
0777                              const UnicodeString &second,  // string to append, passed read-only
0778                              UBool doNormalize,  // NOTE(review): presumably selects whether second is normalized before appending - confirm
0779                              UErrorCode &errorCode) const;  // ICU error code
0780
0781     const Normalizer2 &norm2;  // NOTE(review): presumably the underlying normalizer this class delegates to - confirm. Held by reference, so the referent must outlive this object.
0782     const UnicodeSet &set;  // NOTE(review): presumably the filter set used with the span conditions above - confirm. Also held by reference.
0783 };
0784
0785 U_NAMESPACE_END
0786
0787 #endif  // !UCONFIG_NO_NORMALIZATION
0788
0789 #endif /* U_SHOW_CPLUSPLUS_API */
0790
0791 #endif  // __NORMALIZER2_H__