Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-12-15 10:31:22

0001 // © 2016 and later: Unicode, Inc. and others.
0002 // License & terms of use: http://www.unicode.org/copyright.html
0003 /*
0004 ***************************************************************************
0005 * Copyright (C) 2008-2016, International Business Machines Corporation
0006 * and others. All Rights Reserved.
0007 ***************************************************************************
0008 *   file name:  uspoof.h
0009 *   encoding:   UTF-8
0010 *   tab size:   8 (not used)
0011 *   indentation:4
0012 *
0013 *   created on: 2008Feb13
0014 *   created by: Andy Heninger
0015 *
0016 *   Unicode Spoof Detection
0017 */
0018 
0019 #ifndef USPOOF_H
0020 #define USPOOF_H
0021 
0022 #include "unicode/ubidi.h"
0023 #include "unicode/utypes.h"
0024 #include "unicode/uset.h"
0025 #include "unicode/parseerr.h"
0026 
0027 #if !UCONFIG_NO_NORMALIZATION
0028 
0029 
0030 #if U_SHOW_CPLUSPLUS_API
0031 #include "unicode/localpointer.h"
0032 #include "unicode/unistr.h"
0033 #include "unicode/uniset.h"
0034 #endif
0035 
0036 
0037 /**
0038  * \file
0039  * \brief C API: Unicode Security and Spoofing Detection
0040  *
0041  * <p>
0042  * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and
0043  * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
0044  *
0045  * <ol>
0046  * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and
0047  * &quot;&Eta;arvest&quot;, where the second string starts with the Greek capital letter Eta.</li>
0048  * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
0049  * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li>
0050  * </ol>
0051  *
0052  * <p>
0053  * Although originally designed as a method for flagging suspicious identifier strings such as URLs,
0054  * <code>USpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word
0055  * content filters.
0056  *
0057  * <p>
0058  * The functions of this class are exposed as C API, with a handful of syntactical conveniences for C++.
0059  *
0060  * <h2>Confusables</h2>
0061  *
0062  * <p>
0063  * The following example shows how to use <code>USpoofChecker</code> to check for confusability between two strings:
0064  *
0065  * \code{.c}
0066  * UErrorCode status = U_ZERO_ERROR;
0067  * UChar* str1 = (UChar*) u"Harvest";
0068  * UChar* str2 = (UChar*) u"\u0397arvest";  // with U+0397 GREEK CAPITAL LETTER ETA
0069  *
0070  * USpoofChecker* sc = uspoof_open(&status);
0071  * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
0072  *
0073  * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status);
0074  * UBool result = bitmask != 0;
0075  * // areConfusable: 1 (status: U_ZERO_ERROR)
0076  * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
0077  * uspoof_close(sc);
0078  * \endcode
0079  *
0080  * <p>
0081  * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks}
0082  * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the
0083  * confusability test; and the following line extracts the result out of the return value. For best performance,
0084  * the instance should be created once (e.g., upon application startup), and the efficient
0085  * {@link uspoof_areConfusable} method can be used at runtime.
0086  *
0087  * If the paragraph direction used to display the strings is known, the bidi function should be used instead:
0088  *
0089  * \code{.c}
0090  * UErrorCode status = U_ZERO_ERROR;
0091  * // These strings look identical when rendered in a left-to-right context.
0092  * // They look distinct in a right-to-left context.
0093  * UChar* str1 = (UChar*) u"A1\u05D0";  // A1א
0094  * UChar* str2 = (UChar*) u"A\u05D01";  // Aא1
0095  *
0096  * USpoofChecker* sc = uspoof_open(&status);
0097  * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
0098  *
0099  * int32_t bitmask = uspoof_areBidiConfusable(sc, UBIDI_LTR, str1, -1, str2, -1, &status);
0100  * UBool result = bitmask != 0;
0101  * // areBidiConfusable: 1 (status: U_ZERO_ERROR)
0102  * printf("areBidiConfusable: %d (status: %s)\n", result, u_errorName(status));
0103  * uspoof_close(sc);
0104  * \endcode
0105  *
0106  * <p>
0107  * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers.  It will automatically call
0108  * {@link uspoof_close} when the object goes out of scope:
0109  *
0110  * \code{.cpp}
0111  * UErrorCode status = U_ZERO_ERROR;
0112  * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
0113  * uspoof_setChecks(sc.getAlias(), USPOOF_CONFUSABLE, &status);
0114  * // ...
0115  * \endcode
0116  *
0117  * UTS 39 defines two strings to be <em>confusable</em> if they map to the same <em>skeleton string</em>. A skeleton can
0118  * be thought of as a "hash code". {@link uspoof_getSkeleton} computes the skeleton for a particular string, so
0119  * the following snippet is equivalent to the first ("Harvest") example above:
0120  *
0121  * \code{.c}
0122  * UErrorCode status = U_ZERO_ERROR;
0123  * UChar* str1 = (UChar*) u"Harvest";
0124  * UChar* str2 = (UChar*) u"\u0397arvest";  // with U+0397 GREEK CAPITAL LETTER ETA
0125  *
0126  * USpoofChecker* sc = uspoof_open(&status);
0127  * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
0128  *
0129  * // Get skeleton 1
0130  * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status);
0131  * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar));
0132  * status = U_ZERO_ERROR;
0133  * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status);
0134  *
0135  * // Get skeleton 2
0136  * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status);
0137  * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar));
0138  * status = U_ZERO_ERROR;
0139  * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status);
0140  *
0141  * // Are the skeletons the same?
0142  * UBool result = u_strcmp(skel1, skel2) == 0;
0143  * // areConfusable: 1 (status: U_ZERO_ERROR)
0144  * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
0145  * uspoof_close(sc);
0146  * free(skel1);
0147  * free(skel2);
0148  * \endcode
0149  *
0150  * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling
0151  * {@link uspoof_areConfusable} many times in a loop, {@link uspoof_getSkeleton} can be used instead, as shown below:
0152  *
0153  * \code{.c}
0154  * UErrorCode status = U_ZERO_ERROR;
0155  * #define DICTIONARY_LENGTH 2
0156  * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
0157  * UChar* skeletons[DICTIONARY_LENGTH];
0158  * UChar* str = (UChar*) u"1orern";
0159  *
0160  * // Setup:
0161  * USpoofChecker* sc = uspoof_open(&status);
0162  * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
0163  * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
0164  *     UChar* word = dictionary[i];
0165  *     int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status);
0166  *     skeletons[i] = (UChar*) malloc(++len * sizeof(UChar));
0167  *     status = U_ZERO_ERROR;
0168  *     uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status);
0169  * }
0170  *
0171  * // Live Check:
0172  * {
0173  *     int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status);
0174  *     UChar* skel = (UChar*) malloc(++len * sizeof(UChar));
0175  *     status = U_ZERO_ERROR;
0176  *     uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status);
0177  *     UBool result = false;
0178  *     for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
0179  *         result = u_strcmp(skel, skeletons[i]) == 0;
0180  *         if (result == true) { break; }
0181  *     }
0182  *     // Has confusable in dictionary: 1 (status: U_ZERO_ERROR)
0183  *     printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status));
0184  *     free(skel);
0185  * }
0186  *
0187  * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
0188  *     free(skeletons[i]);
0189  * }
0190  * uspoof_close(sc);
0191  * \endcode
0192  *
0193  * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em>
0194  * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons
0195  * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons.
0196  *
0197  * <h2>Spoof Detection</h2>
0198  *
0199  * The following snippet shows a minimal example of using <code>USpoofChecker</code> to perform spoof detection on a
0200  * string:
0201  *
0202  * \code{.c}
0203  * UErrorCode status = U_ZERO_ERROR;
0204  * UChar* str = (UChar*) u"p\u0430ypal";  // with U+0430 CYRILLIC SMALL LETTER A
0205  *
0206  * // Get the default set of allowable characters:
0207  * USet* allowed = uset_openEmpty();
0208  * uset_addAll(allowed, uspoof_getRecommendedSet(&status));
0209  * uset_addAll(allowed, uspoof_getInclusionSet(&status));
0210  *
0211  * USpoofChecker* sc = uspoof_open(&status);
0212  * uspoof_setAllowedChars(sc, allowed, &status);
0213  * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
0214  *
0215  * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status);
0216  * UBool result = bitmask != 0;
0217  * // fails checks: 1 (status: U_ZERO_ERROR)
0218  * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
0219  * uspoof_close(sc);
0220  * uset_close(allowed);
0221  * \endcode
0222  *
0223  * As in the case for confusability checking, it is good practice to create one <code>USpoofChecker</code> instance at
0224  * startup, and call the cheaper {@link uspoof_check} online. We specify the set of
0225  * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39.
0226  *
0227  * In addition to {@link uspoof_check}, the function {@link uspoof_checkUTF8} is exposed for UTF8-encoded char* strings,
0228  * and {@link uspoof_checkUnicodeString} is exposed for C++ programmers.
0229  *
0230  * If the {@link USPOOF_AUX_INFO} check is enabled, a limited amount of information on why a string failed the checks
0231  * is available in the returned bitmask.  For complete information, use the {@link uspoof_check2} class of functions
0232  * with a {@link USpoofCheckResult} parameter:
0233  *
0234  * \code{.c}
0235  * UErrorCode status = U_ZERO_ERROR;
0236  * UChar* str = (UChar*) u"p\u0430ypal";  // with U+0430 CYRILLIC SMALL LETTER A
0237  *
0238  * // Get the default set of allowable characters:
0239  * USet* allowed = uset_openEmpty();
0240  * uset_addAll(allowed, uspoof_getRecommendedSet(&status));
0241  * uset_addAll(allowed, uspoof_getInclusionSet(&status));
0242  *
0243  * USpoofChecker* sc = uspoof_open(&status);
0244  * uspoof_setAllowedChars(sc, allowed, &status);
0245  * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE);
0246  *
0247  * USpoofCheckResult* checkResult = uspoof_openCheckResult(&status);
0248  * int32_t bitmask = uspoof_check2(sc, str, -1, checkResult, &status);
0249  *
0250  * int32_t failures1 = bitmask;
0251  * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status);
0252  * assert(failures1 == failures2);
0253  * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
0254  * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
0255  *
0256  * // Cleanup:
0257  * uspoof_close(sc);
0258  * uset_close(allowed);
0259  * uspoof_closeCheckResult(checkResult);
0260  * \endcode
0261  *
0262  * C++ users can take advantage of a few syntactical conveniences.  The following snippet is functionally
0263  * equivalent to the one above:
0264  *
0265  * \code{.cpp}
0266  * UErrorCode status = U_ZERO_ERROR;
0267  * UnicodeString str((UChar*) u"p\u0430ypal");  // with U+0430 CYRILLIC SMALL LETTER A
0268  *
0269  * // Get the default set of allowable characters:
0270  * UnicodeSet allowed;
0271  * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
0272  * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
0273  *
0274  * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
0275  * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
0276  * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
0277  *
0278  * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
0279  * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
0280  *
0281  * int32_t failures1 = bitmask;
0282  * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status);
0283  * assert(failures1 == failures2);
0284  * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
0285  * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
0286  *
0287  * // Explicit cleanup not necessary.
0288  * \endcode
0289  *
0290  * The return value is a bitmask of the checks that failed. In this case, there was one check that failed:
0291  * {@link USPOOF_RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are:
0292  *
0293  * <ul>
0294  * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the
0295  * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS
0296  * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li>
0297  * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character
0298  * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li>
0299  * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable
0300  * characters. See {@link uspoof_setAllowedChars} and {@link uspoof_setAllowedLocales}.</li>
0301  * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li>
0302  * </ul>
0303  *
0304  * <p>
0305  * These checks can be enabled independently of each other. For example, if you were interested in checking for only the
0306  * INVISIBLE and MIXED_NUMBERS conditions, you could do:
0307  *
0308  * \code{.c}
0309  * UErrorCode status = U_ZERO_ERROR;
0310  * UChar* str = (UChar*) u"8\u09EA";  // 8 mixed with U+09EA BENGALI DIGIT FOUR
0311  *
0312  * USpoofChecker* sc = uspoof_open(&status);
0313  * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status);
0314  *
0315  * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status);
0316  * UBool result = bitmask != 0;
0317  * // fails checks: 1 (status: U_ZERO_ERROR)
0318  * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
0319  * uspoof_close(sc);
0320  * \endcode
0321  *
0322  * Here is an example in C++ showing how to compute the restriction level of a string:
0323  *
0324  * \code{.cpp}
0325  * UErrorCode status = U_ZERO_ERROR;
0326  * UnicodeString str((UChar*) u"p\u0430ypal");  // with U+0430 CYRILLIC SMALL LETTER A
0327  *
0328  * // Get the default set of allowable characters:
0329  * UnicodeSet allowed;
0330  * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status));
0331  * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status));
0332  *
0333  * LocalUSpoofCheckerPointer sc(uspoof_open(&status));
0334  * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status);
0335  * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE);
0336  * uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL | USPOOF_AUX_INFO, &status);
0337  *
0338  * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status));
0339  * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
0340  *
0341  * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status);
0342  * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask:
0343  * assert((restrictionLevel & bitmask) == restrictionLevel);
0344  * // Restriction level: 0x50000000 (status: U_ZERO_ERROR)
0345  * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status));
0346  * \endcode
0347  *
0348  * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE.  Since
0349  * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check.
0350  *
0351  * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in
0352  * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings
0353  * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have
0354  * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
0355  * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
0356  * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
0357  * the levels, see UTS 39 or {@link URestrictionLevel}. The Restriction Level test is aware of the set of
0358  * allowed characters set in {@link uspoof_setAllowedChars}. Note that characters which have script code
0359  * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
0360  * scripts.
0361  *
0362  * <h2>Advanced bidirectional usage</h2>
0363  * If the paragraph direction with which the identifiers will be displayed is not known, there are
0364  * multiple options for confusable detection depending on the circumstances.
0365  *
0366  * <p>
0367  * In some circumstances, the only concern is confusion between identifiers displayed with the same
0368  * paragraph direction.
0369  *
0370  * <p>
0371  * An example is the case where identifiers are usernames prefixed with the @ symbol.
0372  * That symbol will appear to the left in a left-to-right context, and to the right in a
0373  * right-to-left context, so that an identifier displayed in a left-to-right context can never be
0374  * confused with an identifier displayed in a right-to-left context:
0375  * <ul>
0376  * <li>
0377  * The usernames "A1א" (A one aleph) and "Aא1" (A aleph 1)
0378  * would be considered confusable, since they both appear as \@A1א in a left-to-right context, and the
0379  * usernames "אA_1" (aleph A underscore one) and "א1_A" (aleph one underscore A) would be considered
0380  * confusable, since they both appear as A_1א@ in a right-to-left context.
0381  * </li>
0382  * <li>
0383  * The username "Mark_" would not be considered confusable with the username "_Mark",
0384  * even though the latter would appear as Mark_@ in a right-to-left context, and the
0385  * former as \@Mark_ in a left-to-right context.
0386  * </li>
0387  * </ul>
0388  * <p>
0389  * In that case, the caller should check for both LTR-confusability and RTL-confusability:
0390  *
0391  * \code{.cpp}
0392  * bool confusableInEitherDirection =
0393  *     uspoof_areBidiConfusableUnicodeString(sc, UBIDI_LTR, id1, id2, &status) ||
0394  *     uspoof_areBidiConfusableUnicodeString(sc, UBIDI_RTL, id1, id2, &status);
0395  * \endcode
0396  *
0397  * If the bidiSkeleton is used, the LTR and RTL skeleta should be kept separately and compared, LTR
0398  * with LTR and RTL with RTL.
0399  *
0400  * <p>
0401  * In cases where confusability between the visual appearances of an identifier displayed in a
0402  * left-to-right context with another identifier displayed in a right-to-left context is a concern,
0403  * the LTR skeleton of one can be compared with the RTL skeleton of the other.  However, this
0404  * very broad definition of confusability may have unexpected results; for instance, it treats the
0405  * ASCII identifiers "Mark_" and "_Mark" as confusable.
0406  *
0407  * <h2>Additional Information</h2>
0408  *
0409  * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
0410  *
0411  * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether
0412  * two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads,
0413  * using the same USpoofChecker instance.
0414  *
0415  * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are
0416  * thread safe. Those that take a non-const USpoofChecker are not thread safe.
0417  *
0418  * @stable ICU 4.6
0419  */
0420 
0421 U_CDECL_BEGIN
0422 
0423 struct USpoofChecker;
0424 /** Handle type for a spoof checker. The struct is only forward-declared here; uspoof_open() returns a pointer to one.
0425  * @stable ICU 4.2
0426  */
0427 typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker */
0428 
0429 struct USpoofCheckResult;
0430 /** Handle type for a detailed check result. The struct is only forward-declared here.
0431  * @see uspoof_openCheckResult
0432  * @stable ICU 58
0433  */
0434 typedef struct USpoofCheckResult USpoofCheckResult;
0435 
0436 /**
0437  * Enum for the kinds of checks that USpoofChecker can perform. The values are bit flags that
0438  * can be combined with bitwise OR; they are used both to select the set of checks that
0439  * will be performed, and to report results from the check function.
0440  *
0441  * @stable ICU 4.2
0442  */
0443 typedef enum USpoofChecks {
0444     /**
0445      * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
0446      * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
0447      * 4.
0448      *
0449      * @see uspoof_areConfusable
0450      * @stable ICU 4.2
0451      */
0452     USPOOF_SINGLE_SCRIPT_CONFUSABLE =   1,
0453 
0454     /**
0455      * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
0456      * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
0457      * 39 section 4.
0458      *
0459      * @see uspoof_areConfusable
0460      * @stable ICU 4.2
0461      */
0462     USPOOF_MIXED_SCRIPT_CONFUSABLE  =   2,
0463 
0464     /**
0465      * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates
0466      * that the two strings are visually confusable and that they are not from the same script but both of them are
0467      * single-script strings, according to UTS 39 section 4.
0468      *
0469      * @see uspoof_areConfusable
0470      * @stable ICU 4.2
0471      */
0472     USPOOF_WHOLE_SCRIPT_CONFUSABLE  =   4,
0473 
0474     /**
0475      * Enable this flag in {@link uspoof_setChecks} to turn on all types of confusables.  You may set
0476      * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to
0477      * make {@link uspoof_areConfusable} return only those types of confusables.
0478      *
0479      * @see uspoof_areConfusable
0480      * @see uspoof_getSkeleton
0481      * @stable ICU 58
0482      */
0483     USPOOF_CONFUSABLE               =   USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE,
0484 
0485 #ifndef U_HIDE_DEPRECATED_API
0486     /**
0487       * This flag is deprecated and no longer affects the behavior of USpoofChecker.
0488       *
0489       * @deprecated ICU 58  Any case confusable mappings were removed from UTS 39; the corresponding ICU API was deprecated.
0490       */
0491     USPOOF_ANY_CASE                 =   8,
0492 #endif  /* U_HIDE_DEPRECATED_API */
0493 
0494     /**
0495       * Check that an identifier is no looser than the specified RestrictionLevel.
0496       * The default if {@link uspoof_setRestrictionLevel} is not called is HIGHLY_RESTRICTIVE.
0497       *
0498       * If USPOOF_AUX_INFO is enabled the actual restriction level of the
0499       * identifier being tested will also be returned by uspoof_check().
0500       *
0501       * @see URestrictionLevel
0502       * @see uspoof_setRestrictionLevel
0503       * @see USPOOF_AUX_INFO
0504       *
0505       * @stable ICU 51
0506       */
0507     USPOOF_RESTRICTION_LEVEL        = 16,
0508 
0509 #ifndef U_HIDE_DEPRECATED_API
0510     /** Check that an identifier contains only characters from a
0511       * single script (plus chars from the common and inherited scripts.)
0512       * Applies to single-identifier checks only. Defined as an alias of USPOOF_RESTRICTION_LEVEL (same value).
0513       * @deprecated ICU 51  Use RESTRICTION_LEVEL instead.
0514       */
0515     USPOOF_SINGLE_SCRIPT            =  USPOOF_RESTRICTION_LEVEL,
0516 #endif  /* U_HIDE_DEPRECATED_API */
0517 
0518     /** Check an identifier for the presence of invisible characters,
0519       * such as zero-width spaces, or character sequences that are
0520       * likely not to display, such as multiple occurrences of the same
0521       * non-spacing mark.  This check does not test the input string as a whole
0522       * for conformance to any particular syntax for identifiers.
0523       */
0524     USPOOF_INVISIBLE                =  32,
0525 
0526     /** Check that an identifier contains only characters from a specified set
0527       * of acceptable characters.  See {@link uspoof_setAllowedChars} and
0528       * {@link uspoof_setAllowedLocales}.  Note that a string that fails this check
0529       * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check.
0530       */
0531     USPOOF_CHAR_LIMIT               =  64,
0532 
0533     /**
0534      * Check that an identifier does not mix numbers from different numbering systems.
0535      * For more information, see UTS 39 section 5.3.
0536      *
0537      * @stable ICU 51
0538      */
0539     USPOOF_MIXED_NUMBERS            = 128,
0540 
0541     /**
0542      * Check that an identifier does not have a combining character following a character in which that
0543      * combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
0544      *
0545      * More specifically, the following characters are forbidden from preceding a U+0307:
0546      * <ul>
0547      * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
0548      * <li>Latin lowercase letter 'l'</li>
0549      * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
0550      * <li>Any character whose confusable prototype ends with such a character
0551      * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
0552      * </ul>
0553      * In addition, combining characters are allowed between the above characters and U+0307 except those
0554      * with combining class 0 or combining class "Above" (230, same class as U+0307).
0555      *
0556      * This list and the number of combining characters considered by this check may grow over time.
0557      *
0558      * @stable ICU 62
0559      */
0560     USPOOF_HIDDEN_OVERLAY            = 256,
0561 
0562    /**
0563      * Enable all spoof checks. The mask is 0xFFFF, so it does not include the USPOOF_AUX_INFO bit.
0564      *
0565      * @stable ICU 4.6
0566      */
0567     USPOOF_ALL_CHECKS               = 0xFFFF,
0568 
0569     /**
0570       * Enable the return of auxiliary (non-error) information in the
0571       * upper bits of the check results value.
0572       *
0573       * If this "check" is not enabled, the results of {@link uspoof_check} will be
0574       * zero when an identifier passes all of the enabled checks.
0575       *
0576       * If this "check" is enabled, (uspoof_check() & {@link USPOOF_ALL_CHECKS}) will
0577       * be zero when an identifier passes all checks.
0578       *
0579       * @stable ICU 51
0580       */
0581     USPOOF_AUX_INFO                  = 0x40000000
0582 
0583     } USpoofChecks;
0584 
0585 
0586     /**
0587      * Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and
0588      * for returned identifier restriction levels in check results. The levels are declared from most to least restrictive, with increasing numeric values.
0589      *
0590      * @stable ICU 51
0591      *
0592      * @see uspoof_setRestrictionLevel
0593      * @see uspoof_check
0594      */
0595     typedef enum URestrictionLevel {
0596         /**
0597          * All characters in the string are in the identifier profile and all characters in the string are in the
0598          * ASCII range.
0599          *
0600          * @stable ICU 51
0601          */
0602         USPOOF_ASCII = 0x10000000,
0603         /**
0604          * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and
0605          * the string is single-script, according to the definition in UTS 39 section 5.1.
0606          *
0607          * @stable ICU 53
0608          */
0609         USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000,
0610         /**
0611          * The string classifies as Single Script, or all characters in the string are in the identifier profile and
0612          * the string is covered by any of the following sets of scripts, according to the definition in UTS 39
0613          * section 5.1:
0614          * <ul>
0615          *   <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
0616          *   <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
0617          *   <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li>
0618          * </ul>
0619          * This is the default restriction in ICU.
0620          *
0621          * @stable ICU 51
0622          */
0623         USPOOF_HIGHLY_RESTRICTIVE = 0x30000000,
0624         /**
0625          * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
0626          * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
0627          * Greek, and Cherokee.
0628          *
0629          * @stable ICU 51
0630          */
0631         USPOOF_MODERATELY_RESTRICTIVE = 0x40000000,
0632         /**
0633          * All characters in the string are in the identifier profile.  Allow arbitrary mixtures of scripts.
0634          *
0635          * @stable ICU 51
0636          */
0637         USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000,
0638         /**
0639          * Any valid identifiers, including characters outside of the Identifier Profile.
0640          *
0641          * @stable ICU 51
0642          */
0643         USPOOF_UNRESTRICTIVE = 0x60000000,
0644         /**
0645          * Mask for selecting the Restriction Level bits from the return value of {@link uspoof_check}.
0646          *
0647          * @stable ICU 53
0648          */
0649         USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000,
0650 #ifndef U_HIDE_INTERNAL_API
0651         /**
0652          * An undefined restriction level.
0653          * @internal
0654          */
0655         USPOOF_UNDEFINED_RESTRICTIVE = -1
0656 #endif  /* U_HIDE_INTERNAL_API */
0657     } URestrictionLevel;
0658 
0659 /**
0660  *  Create a Unicode Spoof Checker, configured to perform all
0661  *  checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT.
0662  *  Note that additional checks may be added in the future,
0663  *  resulting in changes to the default checking behavior.
0664  *  NOTE(review): USPOOF_LOCALE_LIMIT is not a member of the USpoofChecks enum declared above; confirm whether the reference is stale.
0665  *  @param status  The error code, set if this function encounters a problem.
0666  *  @return        the newly created Spoof Checker; the usage examples in this file release it with uspoof_close()
0667  *  @stable ICU 4.2
0668  */
0669 U_CAPI USpoofChecker * U_EXPORT2
0670 uspoof_open(UErrorCode *status);
0671 
0672 
0673 /**
0674  * Open a Spoof Checker from its serialized form, stored in 32-bit-aligned memory.
0675  * Inverse of uspoof_serialize().
0676  * The memory containing the serialized data must remain valid and unchanged
0677  * as long as the Spoof Checker, or any cloned copies of the Spoof Checker,
0678  * are in use.  Ownership of the memory remains with the caller.
0679  * The Spoof Checker (and any clones) must be closed prior to deleting the
0680  * serialized data.
0681  *
0682  * @param data a pointer to 32-bit-aligned memory containing the serialized form of spoof data
0683  * @param length the number of bytes available at data;
0684  *               can be more than necessary
0685  * @param pActualLength receives the actual number of bytes at data taken up by the data;
0686  *                      can be NULL
0687  * @param pErrorCode ICU error code
0688  * @return the Spoof Checker.
0689  *
0690  * @see uspoof_open
0691  * @see uspoof_serialize
0692  * @stable ICU 4.2
0693  */
0694 U_CAPI USpoofChecker * U_EXPORT2
0695 uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
0696                           UErrorCode *pErrorCode);
0697 
0698 /**
0699   * Open a Spoof Checker from the source form of the spoof data.
0700   * The input corresponds to the Unicode data file confusables.txt
0701   * as described in Unicode Technical Standard #39.  The syntax of the source data
0702   * is as described in UTS #39 for this file, and the content of
0703   * this file is acceptable input.
0704   *
0705   * The character encoding of the (char *) input text is UTF-8.
0706   *
0707   * @param confusables a pointer to the confusable characters definitions,
0708   *                    as found in file confusables.txt from unicode.org.
0709   * @param confusablesLen The length of the confusables text, or -1 if the
0710   *                    input string is zero terminated.
0711   * @param confusablesWholeScript
0712   *                    Deprecated in ICU 58.  No longer used.
0713   * @param confusablesWholeScriptLen
0714   *                    Deprecated in ICU 58.  No longer used.
0715   * @param errType     In the event of an error in the input, indicates
0716   *                    which of the input files contains the error.
0717   *                    The value is one of USPOOF_SINGLE_SCRIPT_CONFUSABLE or
0718   *                    USPOOF_WHOLE_SCRIPT_CONFUSABLE, or
0719   *                    zero if no errors are found.
0720   * @param pe          In the event of an error in the input, receives the position
0721   *                    in the input text (line, offset) of the error.
0722   * @param status      an in/out ICU UErrorCode.  Among the possible errors is
0723   *                    U_PARSE_ERROR, which is used to report syntax errors
0724   *                    in the input.
0725   * @return            A spoof checker that uses the rules from the input files.
0726   * @stable ICU 4.2
0727   */
0728 U_CAPI USpoofChecker * U_EXPORT2
0729 uspoof_openFromSource(const char *confusables,  int32_t confusablesLen,
0730                       const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
0731                       int32_t *errType, UParseError *pe, UErrorCode *status);
0732 
0733 
0734 /**
0735   * Close a Spoof Checker, freeing any memory that was being held by its implementation.
0736   * @param sc  The USpoofChecker to close.
0737   * @stable ICU 4.2
0738   */
0739 U_CAPI void U_EXPORT2
0740 uspoof_close(USpoofChecker *sc);
0741 
0742 /**
0743  * Clone a Spoof Checker.  The clone will be set to perform the same checks
0744  *   as the original source.
0745  *
0746  * @param sc       The source USpoofChecker
0747  * @param status   The error code, set if this function encounters a problem.
0748  * @return         The cloned Spoof Checker
0749  * @stable ICU 4.2
0750  */
0751 U_CAPI USpoofChecker * U_EXPORT2
0752 uspoof_clone(const USpoofChecker *sc, UErrorCode *status);
0753 
0754 
0755 /**
0756  * Specify the bitmask of checks that will be performed by {@link uspoof_check}. Calling this method
0757  * overwrites any checks that may have already been enabled. By default, all checks are enabled.
0758  *
0759  * To enable specific checks and disable all others,
0760  * OR together only the bit constants for the desired checks.
0761  * For example, to fail strings containing characters outside of
0762  * the set specified by {@link uspoof_setAllowedChars} and
0763  * also strings that contain digits from mixed numbering systems:
0764  *
0765  * <pre>
0766  * {@code
0767  * uspoof_setChecks(USPOOF_CHAR_LIMIT | USPOOF_MIXED_NUMBERS);
0768  * }
0769  * </pre>
0770  *
0771  * To disable specific checks and enable all others,
0772  * start with ALL_CHECKS and "AND away" the not-desired checks.
0773  * For example, if you are not planning to use the {@link uspoof_areConfusable} functionality,
0774  * it is good practice to disable the CONFUSABLE check:
0775  *
0776  * <pre>
0777  * {@code
0778  * uspoof_setChecks(USPOOF_ALL_CHECKS & ~USPOOF_CONFUSABLE);
0779  * }
0780  * </pre>
0781  *
0782  * Note that methods such as {@link uspoof_setAllowedChars}, {@link uspoof_setAllowedLocales}, and
0783  * {@link uspoof_setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they
0784  * enable onto the existing bitmask specified by this method. For more details, see the documentation of those
0785  * methods.
0786  *
0787  * @param sc       The USpoofChecker
0788  * @param checks         The set of checks that this spoof checker will perform.
0789  *                 The value is a bit set, obtained by OR-ing together
0790  *                 values from enum USpoofChecks.
0791  * @param status   The error code, set if this function encounters a problem.
0792  * @stable ICU 4.2
0793  *
0794  */
0795 U_CAPI void U_EXPORT2
0796 uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status);
0797 
0798 /**
0799  * Get the set of checks that this Spoof Checker has been configured to perform.
0800  *
0801  * @param sc       The USpoofChecker
0802  * @param status   The error code, set if this function encounters a problem.
0803  * @return         The set of checks that this spoof checker will perform.
0804  *                 The value is a bit set, obtained by OR-ing together
0805  *                 values from enum USpoofChecks.
0806  * @stable ICU 4.2
0807  *
0808  */
0809 U_CAPI int32_t U_EXPORT2
0810 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
0811 
0812 /**
0813  * Set the loosest restriction level allowed for strings. The default if this is not called is
0814  * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
0815  * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
0816  * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
0817  *
0818  * @param sc       The USpoofChecker
0819  * @param restrictionLevel The loosest restriction level allowed.
0820  * @see URestrictionLevel
0821  * @stable ICU 51
0822  */
0823 U_CAPI void U_EXPORT2
0824 uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel);
0825 
0826 
0827 /**
0828   * Get the Restriction Level that will be tested if the checks include {@link USPOOF_RESTRICTION_LEVEL}.
0829   * @param sc The USpoofChecker
0830   * @return The restriction level
0831   * @see URestrictionLevel
0832   * @stable ICU 51
0833   */
0834 U_CAPI URestrictionLevel U_EXPORT2
0835 uspoof_getRestrictionLevel(const USpoofChecker *sc);
0836 
0837 /**
0838  * Limit characters that are acceptable in identifiers being checked to those
0839  * normally used with the languages associated with the specified locales.
0840  * Any previously specified list of locales is replaced by the new settings.
0841  *
0842  * A set of languages is determined from the locale(s), and
0843  * from those a set of acceptable Unicode scripts is determined.
0844  * Characters from this set of scripts, along with characters from
0845  * the "common" and "inherited" Unicode Script categories
0846  * will be permitted.
0847  *
0848  * Supplying an empty string removes all restrictions;
0849  * characters from any script will be allowed.
0850  *
0851  * The {@link USPOOF_CHAR_LIMIT} test is automatically enabled for this
0852  * USpoofChecker when calling this function with a non-empty list
0853  * of locales.
0854  *
0855  * The Unicode Set of characters that will be allowed is accessible
0856  * via the uspoof_getAllowedChars() function.  uspoof_setAllowedLocales()
0857  * will <i>replace</i> any previously applied set of allowed characters.
0858  *
0859  * Adjustments, such as additions or deletions of certain classes of characters,
0860  * can be made to the result of uspoof_setAllowedLocales() by
0861  * fetching the resulting set with uspoof_getAllowedChars(),
0862  * manipulating it with the Unicode Set API, then resetting the
0863  * spoof detector's limits with uspoof_setAllowedChars().
0864  *
0865  * @param sc           The USpoofChecker
0866  * @param localesList  A list of locales, from which the language
0867  *                     and associated script are extracted.  The locales
0868  *                     are comma-separated if there is more than one.
0869  *                     White space may not appear within an individual locale,
0870  *                     but is ignored otherwise.
0871  *                     The locales are syntactically like those from the
0872  *                     HTTP Accept-Language header.
0873  *                     If the localesList is empty, no restrictions will be placed on
0874  *                     the allowed characters.
0875  *
0876  * @param status       The error code, set if this function encounters a problem.
0877  * @stable ICU 4.2
0878  */
0879 U_CAPI void U_EXPORT2
0880 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status);
0881 
0882 /**
0883  * Get a list of locales for the scripts that are acceptable in strings
0884  *  to be checked.  If no limitations on scripts have been specified,
0885  *  an empty string will be returned.
0886  *
0887  *  uspoof_setAllowedChars() will reset the list of allowed locales to be empty.
0888  *
0889  *  The format of the returned list is the same as that supplied to
0890  *  uspoof_setAllowedLocales(), but returned list may not be identical
0891  *  to the originally specified string; the string may be reformatted,
0892  *  and information other than languages from
0893  *  the originally specified locales may be omitted.
0894  *
0895  * @param sc           The USpoofChecker
0896  * @param status       The error code, set if this function encounters a problem.
0897  * @return             A string containing a list of locales corresponding
0898  *                     to the acceptable scripts, formatted like an
0899  *                     HTTP Accept Language value.
0900  *
0901  * @stable ICU 4.2
0902  */
0903 U_CAPI const char * U_EXPORT2
0904 uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status);
0905 
0906 
0907 /**
0908  * Limit the acceptable characters to those specified by a Unicode Set.
0909  *   Any previously specified character limit
0910  *   is replaced by the new settings.  This includes limits on
0911  *   characters that were set with the uspoof_setAllowedLocales() function.
0912  *
0913  * The USPOOF_CHAR_LIMIT test is automatically enabled for this
0914  * USpoofChecker by this function.
0915  *
0916  * @param sc       The USpoofChecker
0917  * @param chars    A Unicode Set containing the list of
0918  *                 characters that are permitted.  Ownership of the set
0919  *                 remains with the caller.  The incoming set is cloned by
0920  *                 this function, so there are no restrictions on modifying
0921  *                 or deleting the USet after calling this function.
0922  * @param status   The error code, set if this function encounters a problem.
0923  * @stable ICU 4.2
0924  */
0925 U_CAPI void U_EXPORT2
0926 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status);
0927 
0928 
0929 /**
0930  * Get a USet for the characters permitted in an identifier.
0931  * This corresponds to the limits imposed by the Set Allowed Characters
0932  * functions. Limitations imposed by other checks will not be
0933  * reflected in the set returned by this function.
0934  *
0935  * The returned set will be frozen, meaning that it cannot be modified
0936  * by the caller.
0937  *
0938  * Ownership of the returned set remains with the Spoof Detector.  The
0939  * returned set will become invalid if the spoof detector is closed,
0940  * or if a new set of allowed characters is specified.
0941  *
0942  *
0943  * @param sc       The USpoofChecker
0944  * @param status   The error code, set if this function encounters a problem.
0945  * @return         A USet containing the characters that are permitted by
0946  *                 the USPOOF_CHAR_LIMIT test.
0947  * @stable ICU 4.2
0948  */
0949 U_CAPI const USet * U_EXPORT2
0950 uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status);
0951 
0952 
0953 /**
0954  * Check the specified string for possible security issues.
0955  * The text to be checked will typically be an identifier of some sort.
0956  * The set of checks to be performed is specified with uspoof_setChecks().
0957  *
0958  * \note
0959  *   Consider using the newer API, {@link uspoof_check2}, instead.
0960  *   The newer API exposes additional information from the check procedure
0961  *   and is otherwise identical to this method.
0962  *
0963  * @param sc      The USpoofChecker
0964  * @param id      The identifier to be checked for possible security issues,
0965  *                in UTF-16 format.
0966  * @param length  the length of the string to be checked, expressed in
0967  *                16 bit UTF-16 code units, or -1 if the string is
0968  *                zero terminated.
0969  * @param position  Deprecated in ICU 51.  Always returns zero.
0970  *                Originally, an out parameter for the index of the first
0971  *                string position that failed a check.
0972  *                This parameter may be NULL.
0973  * @param status  The error code, set if an error occurred while attempting to
0974  *                perform the check.
0975  *                Spoofing or security issues detected with the input string are
0976  *                not reported here, but through the function's return value.
0977  * @return        An integer value with bits set for any potential security
0978  *                or spoofing issues detected.  The bits are defined by
0979  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
0980  *                will be zero if the input string passes all of the
0981  *                enabled checks.
0982  * @see uspoof_check2
0983  * @stable ICU 4.2
0984  */
0985 U_CAPI int32_t U_EXPORT2
0986 uspoof_check(const USpoofChecker *sc,
0987                          const UChar *id, int32_t length,
0988                          int32_t *position,
0989                          UErrorCode *status);
0990 
0991 
0992 /**
0993  * Check the specified string for possible security issues.
0994  * The text to be checked will typically be an identifier of some sort.
0995  * The set of checks to be performed is specified with uspoof_setChecks().
0996  *
0997  * \note
0998  *   Consider using the newer API, {@link uspoof_check2UTF8}, instead.
0999  *   The newer API exposes additional information from the check procedure
1000  *   and is otherwise identical to this method.
1001  *
1002  * @param sc      The USpoofChecker
1003  * @param id      An identifier to be checked for possible security issues, in UTF-8 format.
1004  * @param length  the length of the string to be checked, or -1 if the string is
1005  *                zero terminated.
1006  * @param position  Deprecated in ICU 51.  Always returns zero.
1007  *                Originally, an out parameter for the index of the first
1008  *                string position that failed a check.
1009  *                This parameter may be NULL.
1010  * @param status  The error code, set if an error occurred while attempting to
1011  *                perform the check.
1012  *                Spoofing or security issues detected with the input string are
1013  *                not reported here, but through the function's return value.
1014  *                If the input contains invalid UTF-8 sequences,
1015  *                a status of U_INVALID_CHAR_FOUND will be returned.
1016  * @return        An integer value with bits set for any potential security
1017  *                or spoofing issues detected.  The bits are defined by
1018  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1019  *                will be zero if the input string passes all of the
1020  *                enabled checks.
1021  * @see uspoof_check2UTF8
1022  * @stable ICU 4.2
1023  */
1024 U_CAPI int32_t U_EXPORT2
1025 uspoof_checkUTF8(const USpoofChecker *sc,
1026                  const char *id, int32_t length,
1027                  int32_t *position,
1028                  UErrorCode *status);
1029 
1030 
1031 /**
1032  * Check the specified string for possible security issues.
1033  * The text to be checked will typically be an identifier of some sort.
1034  * The set of checks to be performed is specified with uspoof_setChecks().
1035  *
1036  * @param sc      The USpoofChecker
1037  * @param id      The identifier to be checked for possible security issues,
1038  *                in UTF-16 format.
1039  * @param length  the length of the string to be checked, or -1 if the string is
1040  *                zero terminated.
1041  * @param checkResult  An instance of USpoofCheckResult to be filled with
1042  *                details about the identifier.  Can be NULL.
1043  * @param status  The error code, set if an error occurred while attempting to
1044  *                perform the check.
1045  *                Spoofing or security issues detected with the input string are
1046  *                not reported here, but through the function's return value.
1047  * @return        An integer value with bits set for any potential security
1048  *                or spoofing issues detected.  The bits are defined by
1049  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1050  *                will be zero if the input string passes all of the
1051  *                enabled checks.  Any information in this bitmask will be
1052  *                consistent with the information saved in the optional
1053  *                checkResult parameter.
1054  * @see uspoof_openCheckResult
1055  * @see uspoof_check2UTF8
1056  * @see uspoof_check2UnicodeString
1057  * @stable ICU 58
1058  */
1059 U_CAPI int32_t U_EXPORT2
1060 uspoof_check2(const USpoofChecker *sc,
1061     const UChar* id, int32_t length,
1062     USpoofCheckResult* checkResult,
1063     UErrorCode *status);
1064 
1065 /**
1066  * Check the specified string for possible security issues.
1067  * The text to be checked will typically be an identifier of some sort.
1068  * The set of checks to be performed is specified with uspoof_setChecks().
1069  *
1070  * This version of {@link uspoof_check} accepts a USpoofCheckResult, which
1071  * returns additional information about the identifier.  For more
1072  * information, see {@link uspoof_openCheckResult}.
1073  *
1074  * @param sc      The USpoofChecker
1075  * @param id      An identifier to be checked for possible security issues, in UTF-8 format.
1076  * @param length  the length of the string to be checked, or -1 if the string is
1077  *                zero terminated.
1078  * @param checkResult  An instance of USpoofCheckResult to be filled with
1079  *                details about the identifier.  Can be NULL.
1080  * @param status  The error code, set if an error occurred while attempting to
1081  *                perform the check.
1082  *                Spoofing or security issues detected with the input string are
1083  *                not reported here, but through the function's return value.
1084  * @return        An integer value with bits set for any potential security
1085  *                or spoofing issues detected.  The bits are defined by
1086  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1087  *                will be zero if the input string passes all of the
1088  *                enabled checks.  Any information in this bitmask will be
1089  *                consistent with the information saved in the optional
1090  *                checkResult parameter.
1091  * @see uspoof_openCheckResult
1092  * @see uspoof_check2
1093  * @see uspoof_check2UnicodeString
1094  * @stable ICU 58
1095  */
1096 U_CAPI int32_t U_EXPORT2
1097 uspoof_check2UTF8(const USpoofChecker *sc,
1098     const char *id, int32_t length,
1099     USpoofCheckResult* checkResult,
1100     UErrorCode *status);
1101 
1102 /**
1103  * Create a USpoofCheckResult, used by the {@link uspoof_check2} class of functions to return
1104  * information about the identifier.  Information includes:
1105  * <ul>
1106  *   <li>A bitmask of the checks that failed</li>
1107  *   <li>The identifier's restriction level (UTS 39 section 5.2)</li>
1108  *   <li>The set of numerics in the string (UTS 39 section 5.3)</li>
1109  * </ul>
1110  * The data held in a USpoofCheckResult is cleared whenever it is passed into a new call
1111  * of {@link uspoof_check2}.
1112  *
1113  * @param status  The error code, set if this function encounters a problem.
1114  * @return        the newly created USpoofCheckResult
1115  * @see uspoof_check2
1116  * @see uspoof_check2UTF8
1117  * @see uspoof_check2UnicodeString
1118  * @stable ICU 58
1119  */
1120 U_CAPI USpoofCheckResult* U_EXPORT2
1121 uspoof_openCheckResult(UErrorCode *status);
1122 
1123 /**
1124  * Close a USpoofCheckResult, freeing any memory that was being held by
1125  *   its implementation.
1126  *
1127  * @param checkResult  The instance of USpoofCheckResult to close
1128  * @stable ICU 58
1129  */
1130 U_CAPI void U_EXPORT2
1131 uspoof_closeCheckResult(USpoofCheckResult *checkResult);
1132 
1133 /**
1134  * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests
1135  * in question: USPOOF_RESTRICTION_LEVEL, USPOOF_CHAR_LIMIT, and so on.
1136  *
1137  * @param checkResult  The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
1138  * @param status       The error code, set if an error occurred.
1139  * @return        An integer value with bits set for any potential security
1140  *                or spoofing issues detected.  The bits are defined by
1141  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1142  *                will be zero if the input string passes all of the
1143  *                enabled checks.
1144  * @see uspoof_setChecks
1145  * @stable ICU 58
1146  */
1147 U_CAPI int32_t U_EXPORT2
1148 uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status);
1149 
1150 /**
1151  * Gets the restriction level that the text meets, if the USPOOF_RESTRICTION_LEVEL check
1152  * was enabled; otherwise, undefined.
1153  *
1154  * @param checkResult  The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
1155  * @param status       The error code, set if an error occurred.
1156  * @return             The restriction level contained in the USpoofCheckResult
1157  * @see uspoof_setRestrictionLevel
1158  * @stable ICU 58
1159  */
1160 U_CAPI URestrictionLevel U_EXPORT2
1161 uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status);
1162 
1163 /**
1164  * Gets the set of numerics found in the string, if the USPOOF_MIXED_NUMBERS check was enabled;
1165  * otherwise, undefined.  The set will contain the zero digit from each decimal number system found
1166  * in the input string.  Ownership of the returned USet remains with the USpoofCheckResult.
1167  * The USet will be freed when {@link uspoof_closeCheckResult} is called.
1168  *
1169  * @param checkResult  The instance of USpoofCheckResult created by {@link uspoof_openCheckResult}
1170  * @param status       The error code, set if an error occurred.
1171  * @return             The set of numerics contained in the USpoofCheckResult
1172  * @stable ICU 58
1173  */
1174 U_CAPI const USet* U_EXPORT2
1175 uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status);
1176 
1177 
1178 /**
1179  * Check whether two specified strings are visually confusable.
1180  *
1181  * If the strings are confusable, the return value will be nonzero, as long as
1182  * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
1183  *
1184  * The bits in the return value correspond to flags for each of the classes of
1185  * confusables applicable to the two input strings.  According to UTS 39
1186  * section 4, the possible flags are:
1187  *
1188  * <ul>
1189  *   <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
1190  *   <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
1191  *   <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
1192  * </ul>
1193  *
1194  * If one or more of the above flags were not listed in uspoof_setChecks(), this
1195  * function will never report that class of confusable.  The check
1196  * {@link USPOOF_CONFUSABLE} enables all three flags.
1197  *
1198  *
1199  * @param sc      The USpoofChecker
1200  * @param id1     The first of the two identifiers to be compared for
1201  *                confusability.  The strings are in UTF-16 format.
1202  * @param length1 the length of the first identifier, expressed in
1203  *                16 bit UTF-16 code units, or -1 if the string is
1204  *                nul terminated.
1205  * @param id2     The second of the two identifiers to be compared for
1206  *                confusability.  The identifiers are in UTF-16 format.
1207  * @param length2 The length of the second identifier, expressed in
1208  *                16 bit UTF-16 code units, or -1 if the string is
1209  *                nul terminated.
1210  * @param status  The error code, set if an error occurred while attempting to
1211  *                perform the check.
1212  *                Confusability of the identifiers is not reported here,
1213  *                but through this function's return value.
1214  * @return        An integer value with bit(s) set corresponding to
1215  *                the type of confusability found, as defined by
1216  *                enum USpoofChecks.  Zero is returned if the identifiers
1217  *                are not confusable.
1218  *
1219  * @stable ICU 4.2
1220  */
1221 U_CAPI int32_t U_EXPORT2
1222 uspoof_areConfusable(const USpoofChecker *sc,
1223                      const UChar *id1, int32_t length1,
1224                      const UChar *id2, int32_t length2,
1225                      UErrorCode *status);
1226 
1227 #ifndef U_HIDE_DRAFT_API
1228 /**
1229  * Check whether two specified strings are visually confusable when
1230  * displayed in a context with the given paragraph direction.
1231  *
1232  * If the strings are confusable, the return value will be nonzero, as long as
1233  * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
1234  *
1235  * The bits in the return value correspond to flags for each of the classes of
1236  * confusables applicable to the two input strings.  According to UTS 39
1237  * section 4, the possible flags are:
1238  *
1239  * <ul>
1240  *   <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
1241  *   <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
1242  *   <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
1243  * </ul>
1244  *
1245  * If one or more of the above flags were not listed in uspoof_setChecks(), this
1246  * function will never report that class of confusable.  The check
1247  * {@link USPOOF_CONFUSABLE} enables all three flags.
1248  *
1249  *
1250  * @param sc      The USpoofChecker
1251  * @param direction The paragraph direction with which the identifiers are
1252  *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1253  * @param id1     The first of the two identifiers to be compared for
1254  *                confusability.  The strings are in UTF-16 format.
1255  * @param length1 the length of the first identifier, expressed in
1256  *                16 bit UTF-16 code units, or -1 if the string is
1257  *                nul terminated.
1258  * @param id2     The second of the two identifiers to be compared for
1259  *                confusability.  The identifiers are in UTF-16 format.
1260  * @param length2 The length of the second identifier, expressed in
1261  *                16 bit UTF-16 code units, or -1 if the string is
1262  *                nul terminated.
1263  * @param status  The error code, set if an error occurred while attempting to
1264  *                perform the check.
1265  *                Confusability of the identifiers is not reported here,
1266  *                but through this function's return value.
1267  * @return        An integer value with bit(s) set corresponding to
1268  *                the type of confusability found, as defined by
1269  *                enum USpoofChecks.  Zero is returned if the identifiers
1270  *                are not confusable.
1271  *
1272  * @draft ICU 74
1273  */
1274 U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction,
1275                                                   const UChar *id1, int32_t length1,
1276                                                   const UChar *id2, int32_t length2,
1277                                                   UErrorCode *status);
1278 #endif /* U_HIDE_DRAFT_API */
1279 
1280 /**
1281  * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format.
1282  *
1283  * @param sc      The USpoofChecker
1284  * @param id1     The first of the two identifiers to be compared for
1285  *                confusability.  The strings are in UTF-8 format.
1286  * @param length1 the length of the first identifier, in bytes, or -1
1287  *                if the string is nul terminated.
1288  * @param id2     The second of the two identifiers to be compared for
1289  *                confusability.  The strings are in UTF-8 format.
1290  * @param length2 The length of the second string in bytes, or -1
1291  *                if the string is nul terminated.
1292  * @param status  The error code, set if an error occurred while attempting to
1293  *                perform the check.
1294  *                Confusability of the strings is not reported here,
1295  *                but through this function's return value.
1296  * @return        An integer value with bit(s) set corresponding to
1297  *                the type of confusability found, as defined by
1298  *                enum USpoofChecks.  Zero is returned if the strings
1299  *                are not confusable.
1300  *
1301  * @stable ICU 4.2
1302  *
1303  * @see uspoof_areConfusable
1304  */
1305 U_CAPI int32_t U_EXPORT2
1306 uspoof_areConfusableUTF8(const USpoofChecker *sc,
1307                          const char *id1, int32_t length1,
1308                          const char *id2, int32_t length2,
1309                          UErrorCode *status);
1310 
1311 #ifndef U_HIDE_DRAFT_API
1312 /**
1313  * A version of {@link uspoof_areBidiConfusable} accepting strings in UTF-8 format.
1314  *
1315  * @param sc      The USpoofChecker.
1316  * @param direction The paragraph direction with which the identifiers are
1317  *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1318  * @param id1     The first of the two identifiers to be compared for
1319  *                confusability.  The string is in UTF-8 format.
1320  * @param length1 The length of the first identifier, in bytes, or -1
1321  *                if the string is NUL-terminated.
1322  * @param id2     The second of the two identifiers to be compared for
1323  *                confusability.  The string is in UTF-8 format.
1324  * @param length2 The length of the second identifier, in bytes, or -1
1325  *                if the string is NUL-terminated.
1326  * @param status  The error code, set if an error occurred while attempting to
1327  *                perform the check.
1328  *                Confusability of the strings is not reported here,
1329  *                but through this function's return value.
1330  * @return        An integer value with bit(s) set corresponding to
1331  *                the type of confusability found, as defined by
1332  *                enum USpoofChecks.  Zero is returned if the strings
1333  *                are not confusable.
1334  *
1335  * @draft ICU 74
1336  *
1337  * @see uspoof_areBidiConfusable
1338  */
1339 U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction,
1340                                                       const char *id1, int32_t length1,
1341                                                       const char *id2, int32_t length2,
1342                                                       UErrorCode *status);
1343 #endif /* U_HIDE_DRAFT_API */
1344 
1345 /**
1346  *  Get the "skeleton" for an identifier.
1347  *  Skeletons are a transformation of the input identifier;
1348  *  two identifiers are confusable if their skeletons are identical.
1349  *  See Unicode Technical Standard #39 for additional information.
1350  *
1351  *  Using skeletons directly makes it possible to quickly check
1352  *  whether an identifier is confusable with any of some large
1353  *  set of existing identifiers, by creating an efficiently
1354  *  searchable collection of the skeletons.
1355  *
1356  * @param sc      The USpoofChecker.
1357  * @param type    Deprecated in ICU 58.  You may pass any number.
1358  *                Originally, controlled which of the Unicode confusable data
1359  *                tables to use.
1360  * @param id      The input identifier whose skeleton will be computed.
1361  * @param length  The length of the input identifier, expressed in 16 bit
1362  *                UTF-16 code units, or -1 if the string is NUL-terminated.
1363  * @param dest    The output buffer, to receive the skeleton string.
1364  * @param destCapacity  The length of the output buffer, in 16 bit units.
1365  *                The destCapacity may be zero, in which case the function will
1366  *                return the actual length of the skeleton.
1367  * @param status  The error code, set if an error occurred while attempting to
1368  *                perform the check.
1369  * @return        The length of the skeleton string.  The returned length
1370  *                is always that of the complete skeleton, even when the
1371  *                supplied buffer is too small (or of zero length).
1372  *
1373  * @stable ICU 4.2
1374  * @see uspoof_areConfusable
1375  */
1376 U_CAPI int32_t U_EXPORT2
1377 uspoof_getSkeleton(const USpoofChecker *sc,
1378                    uint32_t type,
1379                    const UChar *id,  int32_t length,
1380                    UChar *dest, int32_t destCapacity,
1381                    UErrorCode *status);
1382 
1383 #ifndef U_HIDE_DRAFT_API
1384 /**
1385  *  Get the "bidiSkeleton" for an identifier and a direction.
1386  *  Skeletons are a transformation of the input identifier;
1387  *  two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
1388  *  they are RTL-confusable if their RTL bidiSkeletons are identical.
1389  *  See Unicode Technical Standard #39 for additional information:
1390  *  https://www.unicode.org/reports/tr39/#Confusable_Detection.
1391  *
1392  *  Using skeletons directly makes it possible to quickly check
1393  *  whether an identifier is confusable with any of some large
1394  *  set of existing identifiers, by creating an efficiently
1395  *  searchable collection of the skeletons.
1396  *
1397  * @param sc      The USpoofChecker.
1398  * @param direction The context direction with which the identifier will be
1399  *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1400  * @param id      The input identifier whose bidiSkeleton will be computed.
1401  * @param length  The length of the input identifier, expressed in 16 bit
1402  *                UTF-16 code units, or -1 if the string is NUL-terminated.
1403  * @param dest    The output buffer, to receive the skeleton string.
1404  * @param destCapacity  The length of the output buffer, in 16 bit units.
1405  *                The destCapacity may be zero, in which case the function will
1406  *                return the actual length of the skeleton.
1407  * @param status  The error code, set if an error occurred while attempting to
1408  *                perform the check.
1409  * @return        The length of the skeleton string.  The returned length
1410  *                is always that of the complete skeleton, even when the
1411  *                supplied buffer is too small (or of zero length).
1412  *
1413  * @draft ICU 74
1414  * @see uspoof_areBidiConfusable
1415  */
1416 U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc,
1417                                                 UBiDiDirection direction,
1418                                                 const UChar *id, int32_t length,
1419                                                 UChar *dest, int32_t destCapacity, UErrorCode *status);
1420 #endif /* U_HIDE_DRAFT_API */
1421 
1422 /**
1423  *  Get the "skeleton" for an identifier, using UTF-8 input and output.
1424  *  Skeletons are a transformation of the input identifier;
1425  *  two identifiers are confusable if their skeletons are identical.
1426  *  See Unicode Technical Standard #39 for additional information.
1427  *
1428  *  Using skeletons directly makes it possible to quickly check
1429  *  whether an identifier is confusable with any of some large
1430  *  set of existing identifiers, by creating an efficiently
1431  *  searchable collection of the skeletons.
1432  *
1433  * @param sc      The USpoofChecker.
1434  * @param type    Deprecated in ICU 58.  You may pass any number.
1435  *                Originally, controlled which of the Unicode confusable data
1436  *                tables to use.
1437  * @param id      The UTF-8 format identifier whose skeleton will be computed.
1438  * @param length  The length of the input string, in bytes,
1439  *                or -1 if the string is NUL-terminated.
1440  * @param dest    The output buffer, to receive the skeleton string.
1441  * @param destCapacity  The length of the output buffer, in bytes.
1442  *                The destCapacity may be zero, in which case the function will
1443  *                return the actual length of the skeleton.
1444  * @param status  The error code, set if an error occurred while attempting to
1445  *                perform the check.  Possible errors include U_INVALID_CHAR_FOUND
1446  *                for invalid UTF-8 sequences, and
1447  *                U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
1448  *                to hold the complete skeleton.
1449  * @return        The length of the skeleton string, in bytes.  The returned length
1450  *                is always that of the complete skeleton, even when the
1451  *                supplied buffer is too small (or of zero length).
1452  *
1453  * @stable ICU 4.2
1454  */
1455 U_CAPI int32_t U_EXPORT2
1456 uspoof_getSkeletonUTF8(const USpoofChecker *sc,
1457                        uint32_t type,
1458                        const char *id,  int32_t length,
1459                        char *dest, int32_t destCapacity,
1460                        UErrorCode *status);
1461 
1462 #ifndef U_HIDE_DRAFT_API
1463 /**
1464  *  Get the "bidiSkeleton" for an identifier and a direction, using UTF-8 input and output.
1465  *  Skeletons are a transformation of the input identifier;
1466  *  two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
1467  *  they are RTL-confusable if their RTL bidiSkeletons are identical.
1468  *  See Unicode Technical Standard #39 for additional information:
1469  *  https://www.unicode.org/reports/tr39/#Confusable_Detection.
1470  *
1471  *  Using skeletons directly makes it possible to quickly check
1472  *  whether an identifier is confusable with any of some large
1473  *  set of existing identifiers, by creating an efficiently
1474  *  searchable collection of the skeletons.
1475  *
1476  * @param sc      The USpoofChecker.
1477  * @param direction The context direction with which the identifier will be
1478  *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1479  * @param id      The UTF-8 format identifier whose bidiSkeleton will be computed.
1480  * @param length  The length of the input string, in bytes,
1481  *                or -1 if the string is NUL-terminated.
1482  * @param dest    The output buffer, to receive the skeleton string.
1483  * @param destCapacity  The length of the output buffer, in bytes.
1484  *                The destCapacity may be zero, in which case the function will
1485  *                return the actual length of the skeleton.
1486  * @param status  The error code, set if an error occurred while attempting to
1487  *                perform the check.  Possible errors include U_INVALID_CHAR_FOUND
1488  *                for invalid UTF-8 sequences, and
1489  *                U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small
1490  *                to hold the complete skeleton.
1491  * @return        The length of the skeleton string, in bytes.  The returned length
1492  *                is always that of the complete skeleton, even when the
1493  *                supplied buffer is too small (or of zero length).
1494  *
1495  * @draft ICU 74
1496  */
1497 U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction,
1498                                                     const char *id, int32_t length, char *dest,
1499                                                     int32_t destCapacity, UErrorCode *status);
1500 #endif /* U_HIDE_DRAFT_API */
1501 
1502 /**
1503   * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
1504   * in http://unicode.org/Public/security/latest/xidmodifications.txt
1505   * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1506   *
1507   * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1508   * be deleted by the caller.
1509   *
1510   * @param status The error code, set if a problem occurs while creating the set.
1511   * @return The frozen set of candidate characters, as a USet owned by the ICU library.
1512   * @stable ICU 51
1513   */
1514 U_CAPI const USet * U_EXPORT2
1515 uspoof_getInclusionSet(UErrorCode *status);
1516 
1517 /**
1518   * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
1519   * in http://unicode.org/Public/security/latest/xidmodifications.txt
1520   * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1521   *
1522   * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1523   * be deleted by the caller.
1524   *
1525   * @param status The error code, set if a problem occurs while creating the set.
1526   * @return The frozen set of recommended characters, as a USet owned by the ICU library.
1527   * @stable ICU 51
1528   */
1529 U_CAPI const USet * U_EXPORT2
1530 uspoof_getRecommendedSet(UErrorCode *status);
1531 
1532 /**
1533  * Serialize the data for a spoof detector into a chunk of memory.
1534  * The flattened spoof detection tables can later be used to efficiently
1535  * instantiate a new Spoof Detector.
1536  *
1537  * The serialized spoof checker includes only the data compiled from the
1538  * Unicode data tables by uspoof_openFromSource(); it does not include
1539  * any other state or configuration that may have been set.
1540  *
1541  * @param sc   the Spoof Detector whose data is to be serialized.
1542  * @param data a pointer to 32-bit-aligned memory to be filled with the data,
1543  *             can be NULL if capacity==0
1544  * @param capacity the number of bytes available at data,
1545  *                 or 0 for preflighting
1546  * @param status an in/out ICU UErrorCode; possible errors include:
1547  * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
1548  * - U_ILLEGAL_ARGUMENT_ERROR if the data or capacity parameters are bad
1549  * @return the number of bytes written or needed for the spoof data
1550  *
1551  * @see uspoof_openFromSerialized()
1552  * @stable ICU 4.2
1553  */
1554 U_CAPI int32_t U_EXPORT2
1555 uspoof_serialize(USpoofChecker *sc,
1556                  void *data, int32_t capacity,
1557                  UErrorCode *status);
1558 
1559 U_CDECL_END
1560 
1561 #if U_SHOW_CPLUSPLUS_API
1562 
1563 U_NAMESPACE_BEGIN
1564 
1565 /**
1566  * \class LocalUSpoofCheckerPointer
1567  * "Smart pointer" class, closes a USpoofChecker via uspoof_close().
1568  * For most methods see the LocalPointerBase base class.
1569  *
1570  * @see LocalPointerBase
1571  * @see LocalPointer
1572  * @stable ICU 4.4
1573  */
1574 /**
1575  * \cond
1576  * Note: Doxygen emits a spurious warning for this U_DEFINE_LOCAL_OPEN_POINTER invocation,
1577  *       so for now it is hidden from Doxygen with a cond/endcond pair.
1578  */
1579 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckerPointer, USpoofChecker, uspoof_close);
1580 /** \endcond */
1581 
1582 /**
1583  * \class LocalUSpoofCheckResultPointer
1584  * "Smart pointer" class, closes a USpoofCheckResult via `uspoof_closeCheckResult()`.
1585  * For most methods see the LocalPointerBase base class.
1586  *
1587  * @see LocalPointerBase
1588  * @see LocalPointer
1589  * @stable ICU 58
1590  */
1591 
1592 /**
1593  * \cond
1594  * Note: Doxygen emits a spurious warning for this U_DEFINE_LOCAL_OPEN_POINTER invocation,
1595  *       so for now it is hidden from Doxygen with a cond/endcond pair.
1596  */
1597 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckResultPointer, USpoofCheckResult, uspoof_closeCheckResult);
1598 /** \endcond */
1599 
1600 U_NAMESPACE_END
1601 
1602 /**
1603  * Limit the acceptable characters to those specified by a Unicode Set.
1604  *   Any previously specified character limit
1605  *   is replaced by the new settings.  This includes limits on
1606  *   characters that were set with the uspoof_setAllowedLocales() function.
1607  *
1608  * The USPOOF_CHAR_LIMIT test is automatically enabled for this
1609  * USpoofChecker by this function.
1610  *
1611  * @param sc       The USpoofChecker
1612  * @param chars    A Unicode Set containing the list of
1613  *                 characters that are permitted.  Ownership of the set
1614  *                 remains with the caller.  The incoming set is cloned by
1615  *                 this function, so there are no restrictions on modifying
1616  *                 or deleting the UnicodeSet after calling this function.
1617  * @param status   The error code, set if this function encounters a problem.
1618  * @stable ICU 4.2
1619  */
1620 U_CAPI void U_EXPORT2
1621 uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const icu::UnicodeSet *chars, UErrorCode *status);
1622 
1623 
1624 /**
1625  * Get a UnicodeSet for the characters permitted in an identifier.
1626  * This corresponds to the limits imposed by the Set Allowed Characters /
1627  * UnicodeSet functions, such as uspoof_setAllowedUnicodeSet(). Limitations
1628  * imposed by other checks will not be reflected in the returned set.
1629  *
1630  * The returned set will be frozen, meaning that it cannot be modified
1631  * by the caller.
1632  *
1633  * Ownership of the returned set remains with the Spoof Detector.  The
1634  * returned set will become invalid if the spoof detector is closed,
1635  * or if a new set of allowed characters is specified.
1636  *
1637  *
1638  * @param sc       The USpoofChecker
1639  * @param status   The error code, set if this function encounters a problem.
1640  * @return         A UnicodeSet containing the characters that are permitted by
1641  *                 the USPOOF_CHAR_LIMIT test.
1642  * @stable ICU 4.2
1643  */
1644 U_CAPI const icu::UnicodeSet * U_EXPORT2
1645 uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status);
1646 
1647 /**
1648  * Check the specified string for possible security issues.
1649  * The text to be checked will typically be an identifier of some sort.
1650  * The set of checks to be performed is specified with uspoof_setChecks().
1651  *
1652  * \note
1653  *   Consider using the newer API, {@link uspoof_check2UnicodeString}, instead.
1654  *   The newer API exposes additional information from the check procedure
1655  *   and is otherwise identical to this method.
1656  *
1657  * @param sc      The USpoofChecker
1658  * @param id      An identifier to be checked for possible security issues.
1659  * @param position  Deprecated in ICU 51.  Always returns zero.
1660  *                Originally, an out parameter for the index of the first
1661  *                string position that failed a check.
1662  *                This parameter may be nullptr.
1663  * @param status  The error code, set if an error occurred while attempting to
1664  *                perform the check.
1665  *                Spoofing or security issues detected with the input string are
1666  *                not reported here, but through the function's return value.
1667  * @return        An integer value with bits set for any potential security
1668  *                or spoofing issues detected.  The bits are defined by
1669  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1670  *                will be zero if the input string passes all of the
1671  *                enabled checks.
1672  * @see uspoof_check2UnicodeString
1673  * @stable ICU 4.2
1674  */
1675 U_CAPI int32_t U_EXPORT2
1676 uspoof_checkUnicodeString(const USpoofChecker *sc,
1677                           const icu::UnicodeString &id,
1678                           int32_t *position,
1679                           UErrorCode *status);
1680 
1681 /**
1682  * Check the specified string for possible security issues.
1683  * The text to be checked will typically be an identifier of some sort.
1684  * The set of checks to be performed is specified with uspoof_setChecks().
1685  *
1686  * @param sc      The USpoofChecker
1687  * @param id      An identifier to be checked for possible security issues.
1688  * @param checkResult  An instance of USpoofCheckResult to be filled with
1689  *                details about the identifier.  Can be nullptr.
1690  * @param status  The error code, set if an error occurred while attempting to
1691  *                perform the check.
1692  *                Spoofing or security issues detected with the input string are
1693  *                not reported here, but through the function's return value.
1694  * @return        An integer value with bits set for any potential security
1695  *                or spoofing issues detected.  The bits are defined by
1696  *                enum USpoofChecks.  (returned_value & USPOOF_ALL_CHECKS)
1697  *                will be zero if the input string passes all of the
1698  *                enabled checks.  Any information in this bitmask will be
1699  *                consistent with the information saved in the optional
1700  *                checkResult parameter.
1701  * @see uspoof_openCheckResult
1702  * @see uspoof_check2
1703  * @see uspoof_check2UTF8
1704  * @stable ICU 58
1705  */
1706 U_CAPI int32_t U_EXPORT2
1707 uspoof_check2UnicodeString(const USpoofChecker *sc,
1708     const icu::UnicodeString &id,
1709     USpoofCheckResult* checkResult,
1710     UErrorCode *status);
1711 
1712 /**
1713  * A version of {@link uspoof_areConfusable} accepting UnicodeStrings.
1714  *
1715  * @param sc      The USpoofChecker
1716  * @param s1      The first of the two identifiers to be compared for
1717  *                confusability, supplied as a UnicodeString.
1718  * @param s2      The second of the two identifiers to be compared for
1719  *                confusability, supplied as a UnicodeString.
1720  * @param status  The error code, set if an error occurred while attempting to
1721  *                perform the check.
1722  *                Confusability of the identifiers is not reported here,
1723  *                but through this function's return value.
1724  * @return        An integer value with bit(s) set corresponding to
1725  *                the type of confusability found, as defined by
1726  *                enum USpoofChecks.  Zero is returned if the identifiers
1727  *                are not confusable.
1728  *
1729  * @stable ICU 4.2
1730  *
1731  * @see uspoof_areConfusable
1732  */
1733 U_CAPI int32_t U_EXPORT2
1734 uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
1735                                   const icu::UnicodeString &s1,
1736                                   const icu::UnicodeString &s2,
1737                                   UErrorCode *status);
1738 
1739 #ifndef U_HIDE_DRAFT_API
1740 /**
1741  * A version of {@link uspoof_areBidiConfusable} accepting UnicodeStrings.
1742  *
1743  * @param sc      The USpoofChecker
1744  * @param direction The paragraph direction with which the identifiers are
1745  *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1746  * @param s1      The first of the two identifiers to be compared for
1747  *                confusability, supplied as a UnicodeString.
1748  * @param s2      The second of the two identifiers to be compared for
1749  *                confusability, supplied as a UnicodeString.
1750  * @param status  The error code, set if an error occurred while attempting to
1751  *                perform the check.
1752  *                Confusability of the identifiers is not reported here,
1753  *                but through this function's return value.
1754  * @return        An integer value with bit(s) set corresponding to
1755  *                the type of confusability found, as defined by
1756  *                enum USpoofChecks.  Zero is returned if the identifiers
1757  *                are not confusable.
1758  *
1759  * @draft ICU 74
1760  *
1761  * @see uspoof_areBidiConfusable
1762  */
1763 U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc,
1764                                                                UBiDiDirection direction,
1765                                                                const icu::UnicodeString &s1,
1766                                                                const icu::UnicodeString &s2,
1767                                                                UErrorCode *status);
1768 #endif /* U_HIDE_DRAFT_API */
1769 
1770 /**
1771  *  Get the "skeleton" for an identifier, using UnicodeString input and output.
1772  *  Skeletons are a transformation of the input identifier;
1773  *  two identifiers are confusable if their skeletons are identical.
1774  *  See Unicode Technical Standard #39 for additional information.
1775  *
1776  *  Using skeletons directly makes it possible to quickly check
1777  *  whether an identifier is confusable with any of some large
1778  *  set of existing identifiers, by creating an efficiently
1779  *  searchable collection of the skeletons.
1780  *
1781  * @param sc      The USpoofChecker.
1782  * @param type    Deprecated in ICU 58.  You may pass any number.
1783  *                Originally, controlled which of the Unicode confusable data
1784  *                tables to use.
1785  * @param id      The input identifier whose skeleton will be computed.
1786  * @param dest    The output UnicodeString, to receive the skeleton string.
1787  * @param status  The error code, set if an error occurred while attempting to
1788  *                perform the check.
1789  * @return        A reference to the destination (skeleton) string.
1790  *
1791  * @stable ICU 4.2
1792  */
1793 U_I18N_API icu::UnicodeString & U_EXPORT2
1794 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
1795                                 uint32_t type,
1796                                 const icu::UnicodeString &id,
1797                                 icu::UnicodeString &dest,
1798                                 UErrorCode *status);
1799 
1800 #ifndef U_HIDE_DRAFT_API
1801 /**
1802  *  Get the "bidiSkeleton" for an identifier and a direction.
1803  *  Skeletons are a transformation of the input identifier;
1804  *  two identifiers are LTR-confusable if their LTR bidiSkeletons are identical;
1805  *  they are RTL-confusable if their RTL bidiSkeletons are identical.
1806  *  See Unicode Technical Standard #39 for additional information:
1807  *  https://www.unicode.org/reports/tr39/#Confusable_Detection.
1808  *
1809  *  Using skeletons directly makes it possible to quickly check
1810  *  whether an identifier is confusable with any of some large
1811  *  set of existing identifiers, by creating an efficiently
1812  *  searchable collection of the skeletons.
1813  *
1814  * @param sc      The USpoofChecker.
1815  * @param direction The context direction with which the identifier will be
1816  *                displayed.  Must be either UBIDI_LTR or UBIDI_RTL.
1817  * @param id      The input identifier whose bidiSkeleton will be computed.
1818  * @param dest    The output UnicodeString, to receive the bidiSkeleton string.
1819  * @param status  The error code, set if an error occurred while attempting to
1820  *                perform the check.
1821  * @return        A reference to the destination (skeleton) string.
1822  *
1823  * @draft ICU 74
1824  */
1825 U_I18N_API icu::UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString(
1826     const USpoofChecker *sc, UBiDiDirection direction, const icu::UnicodeString &id,
1827     icu::UnicodeString &dest, UErrorCode *status);
1828 #endif /* U_HIDE_DRAFT_API */
1829 
1830 /**
1831   * Get the set of Candidate Characters for Inclusion in Identifiers, as defined
1832   * in http://unicode.org/Public/security/latest/xidmodifications.txt
1833   * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1834   *
1835   * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1836   * be deleted by the caller.
1837   *
1838   * @param status The error code, set if a problem occurs while creating the set.
1839   * @return The frozen set of candidate characters, as a UnicodeSet owned by the ICU library.
1840   * @stable ICU 51
1841   */
1842 U_CAPI const icu::UnicodeSet * U_EXPORT2
1843 uspoof_getInclusionUnicodeSet(UErrorCode *status);
1844 
1845 /**
1846   * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined
1847   * in http://unicode.org/Public/security/latest/xidmodifications.txt
1848   * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms.
1849   *
1850   * The returned set is frozen. Ownership of the set remains with the ICU library; it must not
1851   * be deleted by the caller.
1852   *
1853   * @param status The error code, set if a problem occurs while creating the set.
1854   * @return The frozen set of recommended characters, as a UnicodeSet owned by the ICU library.
1855   * @stable ICU 51
1856   */
1857 U_CAPI const icu::UnicodeSet * U_EXPORT2
1858 uspoof_getRecommendedUnicodeSet(UErrorCode *status);
1859 
1860 #endif /* U_SHOW_CPLUSPLUS_API */
1861 
1862 #endif /* UCONFIG_NO_NORMALIZATION */
1863 
1864 #endif   /* USPOOF_H */