|
|
|||
File indexing completed on 2025-12-15 10:31:22
0001 // © 2016 and later: Unicode, Inc. and others. 0002 // License & terms of use: http://www.unicode.org/copyright.html 0003 /* 0004 *************************************************************************** 0005 * Copyright (C) 2008-2016, International Business Machines Corporation 0006 * and others. All Rights Reserved. 0007 *************************************************************************** 0008 * file name: uspoof.h 0009 * encoding: UTF-8 0010 * tab size: 8 (not used) 0011 * indentation:4 0012 * 0013 * created on: 2008Feb13 0014 * created by: Andy Heninger 0015 * 0016 * Unicode Spoof Detection 0017 */ 0018 0019 #ifndef USPOOF_H 0020 #define USPOOF_H 0021 0022 #include "unicode/ubidi.h" 0023 #include "unicode/utypes.h" 0024 #include "unicode/uset.h" 0025 #include "unicode/parseerr.h" 0026 0027 #if !UCONFIG_NO_NORMALIZATION 0028 0029 0030 #if U_SHOW_CPLUSPLUS_API 0031 #include "unicode/localpointer.h" 0032 #include "unicode/unistr.h" 0033 #include "unicode/uniset.h" 0034 #endif 0035 0036 0037 /** 0038 * \file 0039 * \brief C API: Unicode Security and Spoofing Detection 0040 * 0041 * <p> 0042 * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and 0043 * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions: 0044 * 0045 * <ol> 0046 * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and 0047 * "Ηarvest", where the second string starts with the Greek capital letter Eta.</li> 0048 * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof 0049 * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li> 0050 * </ol> 0051 * 0052 * <p> 0053 * Although originally designed as a method for flagging suspicious identifier strings such as URLs, 0054 * <code>USpoofChecker</code> has a number of other practical use cases, such as 
preventing attempts to evade bad-word 0055 * content filters. 0056 * 0057 * <p> 0058 * The functions of this class are exposed as C API, with a handful of syntactical conveniences for C++. 0059 * 0060 * <h2>Confusables</h2> 0061 * 0062 * <p> 0063 * The following example shows how to use <code>USpoofChecker</code> to check for confusability between two strings: 0064 * 0065 * \code{.c} 0066 * UErrorCode status = U_ZERO_ERROR; 0067 * UChar* str1 = (UChar*) u"Harvest"; 0068 * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA 0069 * 0070 * USpoofChecker* sc = uspoof_open(&status); 0071 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); 0072 * 0073 * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status); 0074 * UBool result = bitmask != 0; 0075 * // areConfusable: 1 (status: U_ZERO_ERROR) 0076 * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); 0077 * uspoof_close(sc); 0078 * \endcode 0079 * 0080 * <p> 0081 * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks} 0082 * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the 0083 * confusability test; and the following line extracts the result out of the return value. For best performance, 0084 * the instance should be created once (e.g., upon application startup), and the efficient 0085 * {@link uspoof_areConfusable} method can be used at runtime. 0086 * 0087 * If the paragraph direction used to display the strings is known, the bidi function should be used instead: 0088 * 0089 * \code{.c} 0090 * UErrorCode status = U_ZERO_ERROR; 0091 * // These strings look identical when rendered in a left-to-right context. 0092 * // They look distinct in a right-to-left context. 
0093 * UChar* str1 = (UChar*) u"A1\u05D0"; // A1א 0094 * UChar* str2 = (UChar*) u"A\u05D01"; // Aא1 0095 * 0096 * USpoofChecker* sc = uspoof_open(&status); 0097 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); 0098 * 0099 * int32_t bitmask = uspoof_areBidiConfusable(sc, UBIDI_LTR, str1, -1, str2, -1, &status); 0100 * UBool result = bitmask != 0; 0101 * // areBidiConfusable: 1 (status: U_ZERO_ERROR) 0102 * printf("areBidiConfusable: %d (status: %s)\n", result, u_errorName(status)); 0103 * uspoof_close(sc); 0104 * \endcode 0105 * 0106 * <p> 0107 * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers. It will automatically call 0108 * {@link uspoof_close} when the object goes out of scope: 0109 * 0110 * \code{.cpp} 0111 * UErrorCode status = U_ZERO_ERROR; 0112 * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 0113 * uspoof_setChecks(sc.getAlias(), USPOOF_CONFUSABLE, &status); 0114 * // ... 0115 * \endcode 0116 * 0117 * UTS 39 defines two strings to be <em>confusable</em> if they map to the same <em>skeleton string</em>. A skeleton can 0118 * be thought of as a "hash code". 
{@link uspoof_getSkeleton} computes the skeleton for a particular string, so 0119 * the following snippet is equivalent to the example above: 0120 * 0121 * \code{.c} 0122 * UErrorCode status = U_ZERO_ERROR; 0123 * UChar* str1 = (UChar*) u"Harvest"; 0124 * UChar* str2 = (UChar*) u"\u0397arvest"; // with U+0397 GREEK CAPITAL LETTER ETA 0125 * 0126 * USpoofChecker* sc = uspoof_open(&status); 0127 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); 0128 * 0129 * // Get skeleton 1 0130 * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status); 0131 * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar)); 0132 * status = U_ZERO_ERROR; 0133 * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status); 0134 * 0135 * // Get skeleton 2 0136 * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status); 0137 * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar)); 0138 * status = U_ZERO_ERROR; 0139 * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status); 0140 * 0141 * // Are the skeletons the same? 
0142 * UBool result = u_strcmp(skel1, skel2) == 0; 0143 * // areConfusable: 1 (status: U_ZERO_ERROR) 0144 * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status)); 0145 * uspoof_close(sc); 0146 * free(skel1); 0147 * free(skel2); 0148 * \endcode 0149 * 0150 * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling 0151 * {@link uspoof_areConfusable} many times in a loop, {@link uspoof_getSkeleton} can be used instead, as shown below: 0152 * 0153 * \code{.c} 0154 * UErrorCode status = U_ZERO_ERROR; 0155 * #define DICTIONARY_LENGTH 2 0156 * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" }; 0157 * UChar* skeletons[DICTIONARY_LENGTH]; 0158 * UChar* str = (UChar*) u"1orern"; 0159 * 0160 * // Setup: 0161 * USpoofChecker* sc = uspoof_open(&status); 0162 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status); 0163 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { 0164 * UChar* word = dictionary[i]; 0165 * int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status); 0166 * skeletons[i] = (UChar*) malloc(++len * sizeof(UChar)); 0167 * status = U_ZERO_ERROR; 0168 * uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status); 0169 * } 0170 * 0171 * // Live Check: 0172 * { 0173 * int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status); 0174 * UChar* skel = (UChar*) malloc(++len * sizeof(UChar)); 0175 * status = U_ZERO_ERROR; 0176 * uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status); 0177 * UBool result = false; 0178 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { 0179 * result = u_strcmp(skel, skeletons[i]) == 0; 0180 * if (result == true) { break; } 0181 * } 0182 * // Has confusable in dictionary: 1 (status: U_ZERO_ERROR) 0183 * printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status)); 0184 * free(skel); 0185 * } 0186 * 0187 * for (size_t i=0; i<DICTIONARY_LENGTH; i++) { 0188 * free(skeletons[i]); 0189 * } 0190 * 
uspoof_close(sc); 0191 * \endcode 0192 * 0193 * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em> 0194 * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons 0195 * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons. 0196 * 0197 * <h2>Spoof Detection</h2> 0198 * 0199 * The following snippet shows a minimal example of using <code>USpoofChecker</code> to perform spoof detection on a 0200 * string: 0201 * 0202 * \code{.c} 0203 * UErrorCode status = U_ZERO_ERROR; 0204 * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A 0205 * 0206 * // Get the default set of allowable characters: 0207 * USet* allowed = uset_openEmpty(); 0208 * uset_addAll(allowed, uspoof_getRecommendedSet(&status)); 0209 * uset_addAll(allowed, uspoof_getInclusionSet(&status)); 0210 * 0211 * USpoofChecker* sc = uspoof_open(&status); 0212 * uspoof_setAllowedChars(sc, allowed, &status); 0213 * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE); 0214 * 0215 * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status); 0216 * UBool result = bitmask != 0; 0217 * // fails checks: 1 (status: U_ZERO_ERROR) 0218 * printf("fails checks: %d (status: %s)\n", result, u_errorName(status)); 0219 * uspoof_close(sc); 0220 * uset_close(allowed); 0221 * \endcode 0222 * 0223 * As in the case for confusability checking, it is good practice to create one <code>USpoofChecker</code> instance at 0224 * startup, and call the cheaper {@link uspoof_check} online. We specify the set of 0225 * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. 0226 * 0227 * In addition to {@link uspoof_check}, the function {@link uspoof_checkUTF8} is exposed for UTF8-encoded char* strings, 0228 * and {@link uspoof_checkUnicodeString} is exposed for C++ programmers. 
0229 * 0230 * If the {@link USPOOF_AUX_INFO} check is enabled, a limited amount of information on why a string failed the checks 0231 * is available in the returned bitmask. For complete information, use the {@link uspoof_check2} class of functions 0232 * with a {@link USpoofCheckResult} parameter: 0233 * 0234 * \code{.c} 0235 * UErrorCode status = U_ZERO_ERROR; 0236 * UChar* str = (UChar*) u"p\u0430ypal"; // with U+0430 CYRILLIC SMALL LETTER A 0237 * 0238 * // Get the default set of allowable characters: 0239 * USet* allowed = uset_openEmpty(); 0240 * uset_addAll(allowed, uspoof_getRecommendedSet(&status)); 0241 * uset_addAll(allowed, uspoof_getInclusionSet(&status)); 0242 * 0243 * USpoofChecker* sc = uspoof_open(&status); 0244 * uspoof_setAllowedChars(sc, allowed, &status); 0245 * uspoof_setRestrictionLevel(sc, USPOOF_MODERATELY_RESTRICTIVE); 0246 * 0247 * USpoofCheckResult* checkResult = uspoof_openCheckResult(&status); 0248 * int32_t bitmask = uspoof_check2(sc, str, -1, checkResult, &status); 0249 * 0250 * int32_t failures1 = bitmask; 0251 * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status); 0252 * assert(failures1 == failures2); 0253 * // checks that failed: 0x00000010 (status: U_ZERO_ERROR) 0254 * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status)); 0255 * 0256 * // Cleanup: 0257 * uspoof_close(sc); 0258 * uset_close(allowed); 0259 * uspoof_closeCheckResult(checkResult); 0260 * \endcode 0261 * 0262 * C++ users can take advantage of a few syntactical conveniences. 
The following snippet is functionally 0263 * equivalent to the one above: 0264 * 0265 * \code{.cpp} 0266 * UErrorCode status = U_ZERO_ERROR; 0267 * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A 0268 * 0269 * // Get the default set of allowable characters: 0270 * UnicodeSet allowed; 0271 * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status)); 0272 * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status)); 0273 * 0274 * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 0275 * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status); 0276 * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); 0277 * 0278 * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status)); 0279 * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status); 0280 * 0281 * int32_t failures1 = bitmask; 0282 * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status); 0283 * assert(failures1 == failures2); 0284 * // checks that failed: 0x00000010 (status: U_ZERO_ERROR) 0285 * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status)); 0286 * 0287 * // Explicit cleanup not necessary. 0288 * \endcode 0289 * 0290 * The return value is a bitmask of the checks that failed. In this case, there was one check that failed: 0291 * {@link USPOOF_RESTRICTION_LEVEL}, corresponding to the fifth bit (16). 
The possible checks are: 0292 * 0293 * <ul> 0294 * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the 0295 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS 0296 * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li> 0297 * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character 0298 * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li> 0299 * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable 0300 * characters. See {@link uspoof_setAllowedChars} and {@link uspoof_setAllowedLocales}.</li> 0301 * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li> 0302 * </ul> 0303 * 0304 * <p> 0305 * These checks can be enabled independently of each other. 
For example, if you were interested in checking for only the 0306 * INVISIBLE and MIXED_NUMBERS conditions, you could do: 0307 * 0308 * \code{.c} 0309 * UErrorCode status = U_ZERO_ERROR; 0310 * UChar* str = (UChar*) u"8\u09EA"; // 8 mixed with U+09EA BENGALI DIGIT FOUR 0311 * 0312 * USpoofChecker* sc = uspoof_open(&status); 0313 * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status); 0314 * 0315 * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status); 0316 * UBool result = bitmask != 0; 0317 * // fails checks: 1 (status: U_ZERO_ERROR) 0318 * printf("fails checks: %d (status: %s)\n", result, u_errorName(status)); 0319 * uspoof_close(sc); 0320 * \endcode 0321 * 0322 * Here is an example in C++ showing how to compute the restriction level of a string: 0323 * 0324 * \code{.cpp} 0325 * UErrorCode status = U_ZERO_ERROR; 0326 * UnicodeString str((UChar*) u"p\u0430ypal"); // with U+0430 CYRILLIC SMALL LETTER A 0327 * 0328 * // Get the default set of allowable characters: 0329 * UnicodeSet allowed; 0330 * allowed.addAll(*uspoof_getRecommendedUnicodeSet(&status)); 0331 * allowed.addAll(*uspoof_getInclusionUnicodeSet(&status)); 0332 * 0333 * LocalUSpoofCheckerPointer sc(uspoof_open(&status)); 0334 * uspoof_setAllowedChars(sc.getAlias(), allowed.toUSet(), &status); 0335 * uspoof_setRestrictionLevel(sc.getAlias(), USPOOF_MODERATELY_RESTRICTIVE); 0336 * uspoof_setChecks(sc.getAlias(), USPOOF_RESTRICTION_LEVEL | USPOOF_AUX_INFO, &status); 0337 * 0338 * LocalUSpoofCheckResultPointer checkResult(uspoof_openCheckResult(&status)); 0339 * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status); 0340 * 0341 * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status); 0342 * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask: 0343 * assert((restrictionLevel & bitmask) == restrictionLevel); 0344 * // Restriction 
level: 0x50000000 (status: U_ZERO_ERROR) 0345 * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status)); 0346 * \endcode 0347 * 0348 * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE. Since 0349 * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check. 0350 * 0351 * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in 0352 * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings 0353 * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have 0354 * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is 0355 * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed 0356 * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on 0357 * the levels, see UTS 39 or {@link URestrictionLevel}. The Restriction Level test is aware of the set of 0358 * allowed characters set in {@link uspoof_setAllowedChars}. Note that characters which have script code 0359 * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple 0360 * scripts. 0361 * 0362 * <h2>Advanced bidirectional usage</h2> 0363 * If the paragraph direction with which the identifiers will be displayed is not known, there are 0364 * multiple options for confusable detection depending on the circumstances. 0365 * 0366 * <p> 0367 * In some circumstances, the only concern is confusion between identifiers displayed with the same 0368 * paragraph direction. 0369 * 0370 * <p> 0371 * An example is the case where identifiers are usernames prefixed with the @ symbol. 
0372 * That symbol will appear to the left in a left-to-right context, and to the right in a 0373 * right-to-left context, so that an identifier displayed in a left-to-right context can never be 0374 * confused with an identifier displayed in a right-to-left context: 0375 * <ul> 0376 * <li> 0377 * The usernames "A1א" (A one aleph) and "Aא1" (A aleph 1) 0378 * would be considered confusable, since they both appear as \@A1א in a left-to-right context, and the 0379 * usernames "אA_1" (aleph A underscore one) and "א1_A" (aleph one underscore A) would be considered 0380 * confusable, since they both appear as A_1א@ in a right-to-left context. 0381 * </li> 0382 * <li> 0383 * The username "Mark_" would not be considered confusable with the username "_Mark", 0384 * even though the latter would appear as Mark_@ in a right-to-left context, and the 0385 * former as \@Mark_ in a left-to-right context. 0386 * </li> 0387 * </ul> 0388 * <p> 0389 * In that case, the caller should check for both LTR-confusability and RTL-confusability: 0390 * 0391 * \code{.cpp} 0392 * bool confusableInEitherDirection = 0393 * uspoof_areBidiConfusableUnicodeString(sc, UBIDI_LTR, id1, id2, &status) || 0394 * uspoof_areBidiConfusableUnicodeString(sc, UBIDI_RTL, id1, id2, &status); 0395 * \endcode 0396 * 0397 * If the bidiSkeleton is used, the LTR and RTL skeleta should be kept separately and compared, LTR 0398 * with LTR and RTL with RTL. 0399 * 0400 * <p> 0401 * In cases where confusability between the visual appearances of an identifier displayed in a 0402 * left-to-right context with another identifier displayed in a right-to-left context is a concern, 0403 * the LTR skeleton of one can be compared with the RTL skeleton of the other. However, this 0404 * very broad definition of confusability may have unexpected results; for instance, it treats the 0405 * ASCII identifiers "Mark_" and "_Mark" as confusable. 
0406 * 0407 * <h2>Additional Information</h2> 0408 * 0409 * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers. 0410 * 0411 * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether 0412 * two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads, 0413 * using the same USpoofChecker instance. 0414 * 0415 * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are 0416 * thread safe. Those that take a non-const USpoofChecker are not thread safe. 0417 * 0418 * @stable ICU 4.6 0419 */ 0420 0421 U_CDECL_BEGIN 0422 0423 struct USpoofChecker; 0424 /** 0425 * @stable ICU 4.2 0426 */ 0427 typedef struct USpoofChecker USpoofChecker; /**< typedef for C of USpoofChecker */ 0428 0429 struct USpoofCheckResult; 0430 /** 0431 * @see uspoof_openCheckResult 0432 * @stable ICU 58 0433 */ 0434 typedef struct USpoofCheckResult USpoofCheckResult; 0435 0436 /** 0437 * Enum for the kinds of checks that USpoofChecker can perform. 0438 * These enum values are used both to select the set of checks that 0439 * will be performed, and to report results from the check function. 0440 * 0441 * @stable ICU 4.2 0442 */ 0443 typedef enum USpoofChecks { 0444 /** 0445 * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates 0446 * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section 0447 * 4. 0448 * 0449 * @see uspoof_areConfusable 0450 * @stable ICU 4.2 0451 */ 0452 USPOOF_SINGLE_SCRIPT_CONFUSABLE = 1, 0453 0454 /** 0455 * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates 0456 * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS 0457 * 39 section 4. 
0458 * 0459 * @see uspoof_areConfusable 0460 * @stable ICU 4.2 0461 */ 0462 USPOOF_MIXED_SCRIPT_CONFUSABLE = 2, 0463 0464 /** 0465 * When performing the two-string {@link uspoof_areConfusable} test, this flag in the return value indicates 0466 * that the two strings are visually confusable and that they are not from the same script but both of them are 0467 * single-script strings, according to UTS 39 section 4. 0468 * 0469 * @see uspoof_areConfusable 0470 * @stable ICU 4.2 0471 */ 0472 USPOOF_WHOLE_SCRIPT_CONFUSABLE = 4, 0473 0474 /** 0475 * Enable this flag in {@link uspoof_setChecks} to turn on all types of confusables. You may set 0476 * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to 0477 * make {@link uspoof_areConfusable} return only those types of confusables. 0478 * 0479 * @see uspoof_areConfusable 0480 * @see uspoof_getSkeleton 0481 * @stable ICU 58 0482 */ 0483 USPOOF_CONFUSABLE = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, 0484 0485 #ifndef U_HIDE_DEPRECATED_API 0486 /** 0487 * This flag is deprecated and no longer affects the behavior of SpoofChecker. 0488 * 0489 * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was deprecated. 0490 */ 0491 USPOOF_ANY_CASE = 8, 0492 #endif /* U_HIDE_DEPRECATED_API */ 0493 0494 /** 0495 * Check that an identifier is no looser than the specified RestrictionLevel. 0496 * The default if {@link uspoof_setRestrictionLevel} is not called is HIGHLY_RESTRICTIVE. 0497 * 0498 * If USPOOF_AUX_INFO is enabled the actual restriction level of the 0499 * identifier being tested will also be returned by uspoof_check(). 
0500 * 0501 * @see URestrictionLevel 0502 * @see uspoof_setRestrictionLevel 0503 * @see USPOOF_AUX_INFO 0504 * 0505 * @stable ICU 51 0506 */ 0507 USPOOF_RESTRICTION_LEVEL = 16, 0508 0509 #ifndef U_HIDE_DEPRECATED_API 0510 /** Check that an identifier contains only characters from a 0511 * single script (plus chars from the common and inherited scripts.) 0512 * Applies to checks of a single identifier check only. 0513 * @deprecated ICU 51 Use RESTRICTION_LEVEL instead. 0514 */ 0515 USPOOF_SINGLE_SCRIPT = USPOOF_RESTRICTION_LEVEL, 0516 #endif /* U_HIDE_DEPRECATED_API */ 0517 0518 /** Check an identifier for the presence of invisible characters, 0519 * such as zero-width spaces, or character sequences that are 0520 * likely not to display, such as multiple occurrences of the same 0521 * non-spacing mark. This check does not test the input string as a whole 0522 * for conformance to any particular syntax for identifiers. 0523 */ 0524 USPOOF_INVISIBLE = 32, 0525 0526 /** Check that an identifier contains only characters from a specified set 0527 * of acceptable characters. See {@link uspoof_setAllowedChars} and 0528 * {@link uspoof_setAllowedLocales}. Note that a string that fails this check 0529 * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check. 0530 */ 0531 USPOOF_CHAR_LIMIT = 64, 0532 0533 /** 0534 * Check that an identifier does not mix numbers from different numbering systems. 0535 * For more information, see UTS 39 section 5.3. 0536 * 0537 * @stable ICU 51 0538 */ 0539 USPOOF_MIXED_NUMBERS = 128, 0540 0541 /** 0542 * Check that an identifier does not have a combining character following a character in which that 0543 * combining character would be hidden; for example 'i' followed by a U+0307 combining dot. 
0544 * 0545 * More specifically, the following characters are forbidden from preceding a U+0307: 0546 * <ul> 0547 * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li> 0548 * <li>Latin lowercase letter 'l'</li> 0549 * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li> 0550 * <li>Any character whose confusable prototype ends with such a character 0551 * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li> 0552 * </ul> 0553 * In addition, combining characters are allowed between the above characters and U+0307 except those 0554 * with combining class 0 or combining class "Above" (230, same class as U+0307). 0555 * 0556 * This list and the number of combining characters considered by this check may grow over time. 0557 * 0558 * @stable ICU 62 0559 */ 0560 USPOOF_HIDDEN_OVERLAY = 256, 0561 0562 /** 0563 * Enable all spoof checks. 0564 * 0565 * @stable ICU 4.6 0566 */ 0567 USPOOF_ALL_CHECKS = 0xFFFF, 0568 0569 /** 0570 * Enable the return of auxiliary (non-error) information in the 0571 * upper bits of the check results value. 0572 * 0573 * If this "check" is not enabled, the results of {@link uspoof_check} will be 0574 * zero when an identifier passes all of the enabled checks. 0575 * 0576 * If this "check" is enabled, (uspoof_check() & {@link USPOOF_ALL_CHECKS}) will 0577 * be zero when an identifier passes all checks. 0578 * 0579 * @stable ICU 51 0580 */ 0581 USPOOF_AUX_INFO = 0x40000000 0582 0583 } USpoofChecks; 0584 0585 0586 /** 0587 * Constants from UTS #39 for use in {@link uspoof_setRestrictionLevel}, and 0588 * for returned identifier restriction levels in check results. 0589 * 0590 * @stable ICU 51 0591 * 0592 * @see uspoof_setRestrictionLevel 0593 * @see uspoof_check 0594 */ 0595 typedef enum URestrictionLevel { 0596 /** 0597 * All characters in the string are in the identifier profile and all characters in the string are in the 0598 * ASCII range. 
0599 * 0600 * @stable ICU 51 0601 */ 0602 USPOOF_ASCII = 0x10000000, 0603 /** 0604 * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and 0605 * the string is single-script, according to the definition in UTS 39 section 5.1. 0606 * 0607 * @stable ICU 53 0608 */ 0609 USPOOF_SINGLE_SCRIPT_RESTRICTIVE = 0x20000000, 0610 /** 0611 * The string classifies as Single Script, or all characters in the string are in the identifier profile and 0612 * the string is covered by any of the following sets of scripts, according to the definition in UTS 39 0613 * section 5.1: 0614 * <ul> 0615 * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li> 0616 * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li> 0617 * <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li> 0618 * </ul> 0619 * This is the default restriction in ICU. 0620 * 0621 * @stable ICU 51 0622 */ 0623 USPOOF_HIGHLY_RESTRICTIVE = 0x30000000, 0624 /** 0625 * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile 0626 * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic, 0627 * Greek, and Cherokee. 0628 * 0629 * @stable ICU 51 0630 */ 0631 USPOOF_MODERATELY_RESTRICTIVE = 0x40000000, 0632 /** 0633 * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts. 0634 * 0635 * @stable ICU 51 0636 */ 0637 USPOOF_MINIMALLY_RESTRICTIVE = 0x50000000, 0638 /** 0639 * Any valid identifiers, including characters outside of the Identifier Profile. 0640 * 0641 * @stable ICU 51 0642 */ 0643 USPOOF_UNRESTRICTIVE = 0x60000000, 0644 /** 0645 * Mask for selecting the Restriction Level bits from the return value of {@link uspoof_check}. 0646 * 0647 * @stable ICU 53 0648 */ 0649 USPOOF_RESTRICTION_LEVEL_MASK = 0x7F000000, 0650 #ifndef U_HIDE_INTERNAL_API 0651 /** 0652 * An undefined restriction level. 
0653 * @internal 0654 */ 0655 USPOOF_UNDEFINED_RESTRICTIVE = -1 0656 #endif /* U_HIDE_INTERNAL_API */ 0657 } URestrictionLevel; 0658 0659 /** 0660 * Create a Unicode Spoof Checker, configured to perform all 0661 * checks except for USPOOF_LOCALE_LIMIT and USPOOF_CHAR_LIMIT. 0662 * Note that additional checks may be added in the future, 0663 * resulting in the changes to the default checking behavior. 0664 * 0665 * @param status The error code, set if this function encounters a problem. 0666 * @return the newly created Spoof Checker 0667 * @stable ICU 4.2 0668 */ 0669 U_CAPI USpoofChecker * U_EXPORT2 0670 uspoof_open(UErrorCode *status); 0671 0672 0673 /** 0674 * Open a Spoof checker from its serialized form, stored in 32-bit-aligned memory. 0675 * Inverse of uspoof_serialize(). 0676 * The memory containing the serialized data must remain valid and unchanged 0677 * as long as the spoof checker, or any cloned copies of the spoof checker, 0678 * are in use. Ownership of the memory remains with the caller. 0679 * The spoof checker (and any clones) must be closed prior to deleting the 0680 * serialized data. 0681 * 0682 * @param data a pointer to 32-bit-aligned memory containing the serialized form of spoof data 0683 * @param length the number of bytes available at data; 0684 * can be more than necessary 0685 * @param pActualLength receives the actual number of bytes at data taken up by the data; 0686 * can be NULL 0687 * @param pErrorCode ICU error code 0688 * @return the spoof checker. 0689 * 0690 * @see uspoof_open 0691 * @see uspoof_serialize 0692 * @stable ICU 4.2 0693 */ 0694 U_CAPI USpoofChecker * U_EXPORT2 0695 uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength, 0696 UErrorCode *pErrorCode); 0697 0698 /** 0699 * Open a Spoof Checker from the source form of the spoof data. 0700 * The input corresponds to the Unicode data file confusables.txt 0701 * as described in Unicode Technical Standard #39. 
The syntax of the source data 0702 * is as described in UTS #39 for this file, and the content of 0703 * this file is acceptable input. 0704 * 0705 * The character encoding of the (char *) input text is UTF-8. 0706 * 0707 * @param confusables a pointer to the confusable characters definitions, 0708 * as found in file confusables.txt from unicode.org. 0709 * @param confusablesLen The length of the confusables text, or -1 if the 0710 * input string is zero terminated. 0711 * @param confusablesWholeScript 0712 * Deprecated in ICU 58. No longer used. 0713 * @param confusablesWholeScriptLen 0714 * Deprecated in ICU 58. No longer used. 0715 * @param errType In the event of an error in the input, indicates 0716 * which of the input files contains the error. 0717 * The value is one of USPOOF_SINGLE_SCRIPT_CONFUSABLE or 0718 * USPOOF_WHOLE_SCRIPT_CONFUSABLE, or 0719 * zero if no errors are found. 0720 * @param pe In the event of an error in the input, receives the position 0721 * in the input text (line, offset) of the error. 0722 * @param status an in/out ICU UErrorCode. Among the possible errors is 0723 * U_PARSE_ERROR, which is used to report syntax errors 0724 * in the input. 0725 * @return A spoof checker that uses the rules from the input files. 0726 * @stable ICU 4.2 0727 */ 0728 U_CAPI USpoofChecker * U_EXPORT2 0729 uspoof_openFromSource(const char *confusables, int32_t confusablesLen, 0730 const char *confusablesWholeScript, int32_t confusablesWholeScriptLen, 0731 int32_t *errType, UParseError *pe, UErrorCode *status); 0732 0733 0734 /** 0735 * Close a Spoof Checker, freeing any memory that was being held by 0736 * its implementation. * @param sc The USpoofChecker to close. 0737 * @stable ICU 4.2 0738 */ 0739 U_CAPI void U_EXPORT2 0740 uspoof_close(USpoofChecker *sc); 0741 0742 /** 0743 * Clone a Spoof Checker. The clone will be set to perform the same checks 0744 * as the original source.
0745 * 0746 * @param sc The source USpoofChecker 0747 * @param status The error code, set if this function encounters a problem. 0748 * @return The clone of the source Spoof Checker. 0749 * @stable ICU 4.2 0750 */ 0751 U_CAPI USpoofChecker * U_EXPORT2 0752 uspoof_clone(const USpoofChecker *sc, UErrorCode *status); 0753 0754 0755 /** 0756 * Specify the bitmask of checks that will be performed by {@link uspoof_check}. Calling this method 0757 * overwrites any checks that may have already been enabled. By default, all checks are enabled. 0758 * 0759 * To enable specific checks and disable all others, 0760 * OR together only the bit constants for the desired checks. 0761 * For example, to fail strings containing characters outside of 0762 * the set specified by {@link uspoof_setAllowedChars} and 0763 * also strings that contain digits from mixed numbering systems: 0764 * 0765 * <pre> 0766 * {@code 0767 * uspoof_setChecks(sc, USPOOF_CHAR_LIMIT | USPOOF_MIXED_NUMBERS, &status); 0768 * } 0769 * </pre> 0770 * 0771 * To disable specific checks and enable all others, 0772 * start with USPOOF_ALL_CHECKS and "AND away" the not-desired checks. 0773 * For example, if you are not planning to use the {@link uspoof_areConfusable} functionality, 0774 * it is good practice to disable the USPOOF_CONFUSABLE check: 0775 * 0776 * <pre> 0777 * {@code 0778 * uspoof_setChecks(sc, USPOOF_ALL_CHECKS & ~USPOOF_CONFUSABLE, &status); 0779 * } 0780 * </pre> 0781 * 0782 * Note that methods such as {@link uspoof_setAllowedChars}, {@link uspoof_setAllowedLocales}, and 0783 * {@link uspoof_setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they 0784 * enable onto the existing bitmask specified by this method. For more details, see the documentation of those 0785 * methods. 0786 * 0787 * @param sc The USpoofChecker 0788 * @param checks The set of checks that this spoof checker will perform. 
0789 * The value is a bit set, obtained by OR-ing together 0790 * values from enum USpoofChecks.
0791 * @param status The error code, set if this function encounters a problem. 0792 * @stable ICU 4.2 0793 * 0794 */ 0795 U_CAPI void U_EXPORT2 0796 uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status); 0797 0798 /** 0799 * Get the set of checks that this Spoof Checker has been configured to perform. 0800 * 0801 * @param sc The USpoofChecker 0802 * @param status The error code, set if this function encounters a problem. 0803 * @return The set of checks that this spoof checker will perform. 0804 * The value is a bit set, obtained by OR-ing together 0805 * values from enum USpoofChecks. 0806 * @stable ICU 4.2 0807 * 0808 */ 0809 U_CAPI int32_t U_EXPORT2 0810 uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status); 0811 0812 /** 0813 * Set the loosest restriction level allowed for strings. The default if this is not called is 0814 * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and 0815 * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are 0816 * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}. 0817 * 0818 * @param sc The USpoofChecker 0819 * @param restrictionLevel The loosest restriction level allowed. 0820 * @see URestrictionLevel 0821 * @stable ICU 51 0822 */ 0823 U_CAPI void U_EXPORT2 0824 uspoof_setRestrictionLevel(USpoofChecker *sc, URestrictionLevel restrictionLevel); 0825 0826 0827 /** 0828 * Get the Restriction Level that will be tested if the checks include {@link USPOOF_RESTRICTION_LEVEL}. 0829 * * @param sc The USpoofChecker 0830 * @return The restriction level 0831 * @see URestrictionLevel 0832 * @stable ICU 51 0833 */ 0834 U_CAPI URestrictionLevel U_EXPORT2 0835 uspoof_getRestrictionLevel(const USpoofChecker *sc); 0836 0837 /** 0838 * Limit characters that are acceptable in identifiers being checked to those 0839 * normally used with the languages associated with the specified locales.
0840 * Any previously specified list of locales is replaced by the new settings. 0841 * 0842 * A set of languages is determined from the locale(s), and 0843 * from those a set of acceptable Unicode scripts is determined. 0844 * Characters from this set of scripts, along with characters from 0845 * the "common" and "inherited" Unicode Script categories 0846 * will be permitted. 0847 * 0848 * Supplying an empty string removes all restrictions; 0849 * characters from any script will be allowed. 0850 * 0851 * The {@link USPOOF_CHAR_LIMIT} test is automatically enabled for this 0852 * USpoofChecker when calling this function with a non-empty list 0853 * of locales. 0854 * 0855 * The Unicode Set of characters that will be allowed is accessible 0856 * via the uspoof_getAllowedChars() function. uspoof_setAllowedLocales() 0857 * will <i>replace</i> any previously applied set of allowed characters. 0858 * 0859 * Adjustments, such as additions or deletions of certain classes of characters, 0860 * can be made to the result of uspoof_setAllowedLocales() by 0861 * fetching the resulting set with uspoof_getAllowedChars(), 0862 * manipulating it with the Unicode Set API, then resetting the 0863 * spoof detector's limits with uspoof_setAllowedChars(). 0864 * 0865 * @param sc The USpoofChecker 0866 * @param localesList A list of locales, from which the language 0867 * and associated script are extracted. The locales 0868 * are comma-separated if there is more than one. 0869 * White space may not appear within an individual locale, 0870 * but is ignored otherwise. 0871 * The locales are syntactically like those from the 0872 * HTTP Accept-Language header. 0873 * If the localesList is empty, no restrictions will be placed on 0874 * the allowed characters. 0875 * 0876 * @param status The error code, set if this function encounters a problem.
0877 * @stable ICU 4.2 0878 */ 0879 U_CAPI void U_EXPORT2 0880 uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode *status); 0881 0882 /** 0883 * Get a list of locales for the scripts that are acceptable in strings 0884 * to be checked. If no limitations on scripts have been specified, 0885 * an empty string will be returned. 0886 * 0887 * uspoof_setAllowedChars() will reset the list of allowed locales to be empty. 0888 * 0889 * The format of the returned list is the same as that supplied to 0890 * uspoof_setAllowedLocales(), but the returned list may not be identical 0891 * to the originally specified string; the string may be reformatted, 0892 * and information other than languages from 0893 * the originally specified locales may be omitted. 0894 * 0895 * @param sc The USpoofChecker 0896 * @param status The error code, set if this function encounters a problem. 0897 * @return A string containing a list of locales corresponding 0898 * to the acceptable scripts, formatted like an 0899 * HTTP Accept-Language value. 0900 * 0901 * @stable ICU 4.2 0902 */ 0903 U_CAPI const char * U_EXPORT2 0904 uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status); 0905 0906 0907 /** 0908 * Limit the acceptable characters to those specified by a Unicode Set. 0909 * Any previously specified character limit 0910 * is replaced by the new settings. This includes limits on 0911 * characters that were set with the uspoof_setAllowedLocales() function. 0912 * 0913 * The USPOOF_CHAR_LIMIT test is automatically enabled for this 0914 * USpoofChecker by this function. 0915 * 0916 * @param sc The USpoofChecker 0917 * @param chars A Unicode Set containing the list of 0918 * characters that are permitted. Ownership of the set 0919 * remains with the caller. The incoming set is cloned by 0920 * this function, so there are no restrictions on modifying 0921 * or deleting the USet after calling this function.
0922 * @param status The error code, set if this function encounters a problem. 0923 * @stable ICU 4.2 0924 */ 0925 U_CAPI void U_EXPORT2 0926 uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status); 0927 0928 0929 /** 0930 * Get a USet for the characters permitted in an identifier. 0931 * This corresponds to the limits imposed by the Set Allowed Characters 0932 * functions. Limitations imposed by other checks will not be 0933 * reflected in the set returned by this function. 0934 * 0935 * The returned set will be frozen, meaning that it cannot be modified 0936 * by the caller. 0937 * 0938 * Ownership of the returned set remains with the Spoof Detector. The 0939 * returned set will become invalid if the spoof detector is closed, 0940 * or if a new set of allowed characters is specified. 0941 * 0942 * 0943 * @param sc The USpoofChecker 0944 * @param status The error code, set if this function encounters a problem. 0945 * @return A USet containing the characters that are permitted by 0946 * the USPOOF_CHAR_LIMIT test. 0947 * @stable ICU 4.2 0948 */ 0949 U_CAPI const USet * U_EXPORT2 0950 uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status); 0951 0952 0953 /** 0954 * Check the specified string for possible security issues. 0955 * The text to be checked will typically be an identifier of some sort. 0956 * The set of checks to be performed is specified with uspoof_setChecks(). 0957 * 0958 * \note 0959 * Consider using the newer API, {@link uspoof_check2}, instead. 0960 * The newer API exposes additional information from the check procedure 0961 * and is otherwise identical to this method. 0962 * 0963 * @param sc The USpoofChecker 0964 * @param id The identifier to be checked for possible security issues, 0965 * in UTF-16 format. 0966 * @param length the length of the string to be checked, expressed in 0967 * 16 bit UTF-16 code units, or -1 if the string is 0968 * zero terminated. 0969 * @param position Deprecated in ICU 51.
Always returns zero. 0970 * Originally, an out parameter for the index of the first 0971 * string position that failed a check. 0972 * This parameter may be NULL. 0973 * @param status The error code, set if an error occurred while attempting to 0974 * perform the check. 0975 * Spoofing or security issues detected with the input string are 0976 * not reported here, but through the function's return value. 0977 * @return An integer value with bits set for any potential security 0978 * or spoofing issues detected. The bits are defined by 0979 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 0980 * will be zero if the input string passes all of the 0981 * enabled checks. 0982 * @see uspoof_check2 0983 * @stable ICU 4.2 0984 */ 0985 U_CAPI int32_t U_EXPORT2 0986 uspoof_check(const USpoofChecker *sc, 0987 const UChar *id, int32_t length, 0988 int32_t *position, 0989 UErrorCode *status); 0990 0991 0992 /** 0993 * Check the specified string for possible security issues. 0994 * The text to be checked will typically be an identifier of some sort. 0995 * The set of checks to be performed is specified with uspoof_setChecks(). 0996 * 0997 * \note 0998 * Consider using the newer API, {@link uspoof_check2UTF8}, instead. 0999 * The newer API exposes additional information from the check procedure 1000 * and is otherwise identical to this method. 1001 * 1002 * @param sc The USpoofChecker 1003 * @param id An identifier to be checked for possible security issues, in UTF-8 format. 1004 * @param length the length of the string to be checked, or -1 if the string is 1005 * zero terminated. 1006 * @param position Deprecated in ICU 51. Always returns zero. 1007 * Originally, an out parameter for the index of the first 1008 * string position that failed a check. 1009 * This parameter may be NULL. 1010 * @param status The error code, set if an error occurred while attempting to 1011 * perform the check.
1012 * Spoofing or security issues detected with the input string are 1013 * not reported here, but through the function's return value. 1014 * If the input contains invalid UTF-8 sequences, 1015 * a status of U_INVALID_CHAR_FOUND will be returned. 1016 * @return An integer value with bits set for any potential security 1017 * or spoofing issues detected. The bits are defined by 1018 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1019 * will be zero if the input string passes all of the 1020 * enabled checks. 1021 * @see uspoof_check2UTF8 1022 * @stable ICU 4.2 1023 */ 1024 U_CAPI int32_t U_EXPORT2 1025 uspoof_checkUTF8(const USpoofChecker *sc, 1026 const char *id, int32_t length, 1027 int32_t *position, 1028 UErrorCode *status); 1029 1030 1031 /** 1032 * Check the specified string for possible security issues. 1033 * The text to be checked will typically be an identifier of some sort. 1034 * The set of checks to be performed is specified with uspoof_setChecks(). 1035 * 1036 * @param sc The USpoofChecker 1037 * @param id The identifier to be checked for possible security issues, 1038 * in UTF-16 format. 1039 * @param length the length of the string to be checked, or -1 if the string is 1040 * zero terminated. 1041 * @param checkResult An instance of USpoofCheckResult to be filled with 1042 * details about the identifier. Can be NULL. 1043 * @param status The error code, set if an error occurred while attempting to 1044 * perform the check. 1045 * Spoofing or security issues detected with the input string are 1046 * not reported here, but through the function's return value. 1047 * @return An integer value with bits set for any potential security 1048 * or spoofing issues detected. The bits are defined by 1049 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1050 * will be zero if the input string passes all of the 1051 * enabled checks.
Any information in this bitmask will be 1052 * consistent with the information saved in the optional 1053 * checkResult parameter. 1054 * @see uspoof_openCheckResult 1055 * @see uspoof_check2UTF8 1056 * @see uspoof_check2UnicodeString 1057 * @stable ICU 58 1058 */ 1059 U_CAPI int32_t U_EXPORT2 1060 uspoof_check2(const USpoofChecker *sc, 1061 const UChar* id, int32_t length, 1062 USpoofCheckResult* checkResult, 1063 UErrorCode *status); 1064 1065 /** 1066 * Check the specified string for possible security issues. 1067 * The text to be checked will typically be an identifier of some sort. 1068 * The set of checks to be performed is specified with uspoof_setChecks(). 1069 * 1070 * This version of {@link uspoof_checkUTF8} accepts a USpoofCheckResult, which 1071 * returns additional information about the identifier. For more 1072 * information, see {@link uspoof_openCheckResult}. 1073 * 1074 * @param sc The USpoofChecker 1075 * @param id An identifier to be checked for possible security issues, in UTF-8 format. 1076 * @param length the length of the string to be checked, or -1 if the string is 1077 * zero terminated. 1078 * @param checkResult An instance of USpoofCheckResult to be filled with 1079 * details about the identifier. Can be NULL. 1080 * @param status The error code, set if an error occurred while attempting to 1081 * perform the check. 1082 * Spoofing or security issues detected with the input string are 1083 * not reported here, but through the function's return value. 1084 * @return An integer value with bits set for any potential security 1085 * or spoofing issues detected. The bits are defined by 1086 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1087 * will be zero if the input string passes all of the 1088 * enabled checks. Any information in this bitmask will be 1089 * consistent with the information saved in the optional 1090 * checkResult parameter.
1091 * @see uspoof_openCheckResult 1092 * @see uspoof_check2 1093 * @see uspoof_check2UnicodeString 1094 * @stable ICU 58 1095 */ 1096 U_CAPI int32_t U_EXPORT2 1097 uspoof_check2UTF8(const USpoofChecker *sc, 1098 const char *id, int32_t length, 1099 USpoofCheckResult* checkResult, 1100 UErrorCode *status); 1101 1102 /** 1103 * Create a USpoofCheckResult, used by the {@link uspoof_check2} class of functions to return 1104 * information about the identifier. Information includes: 1105 * <ul> 1106 * <li>A bitmask of the checks that failed</li> 1107 * <li>The identifier's restriction level (UTS 39 section 5.2)</li> 1108 * <li>The set of numerics in the string (UTS 39 section 5.3)</li> 1109 * </ul> 1110 * The data held in a USpoofCheckResult is cleared whenever it is passed into a new call 1111 * of {@link uspoof_check2}. 1112 * 1113 * @param status The error code, set if this function encounters a problem. 1114 * @return the newly created USpoofCheckResult 1115 * @see uspoof_check2 1116 * @see uspoof_check2UTF8 1117 * @see uspoof_check2UnicodeString 1118 * @stable ICU 58 1119 */ 1120 U_CAPI USpoofCheckResult* U_EXPORT2 1121 uspoof_openCheckResult(UErrorCode *status); 1122 1123 /** 1124 * Close a USpoofCheckResult, freeing any memory that was being held by 1125 * its implementation. 1126 * 1127 * @param checkResult The instance of USpoofCheckResult to close 1128 * @stable ICU 58 1129 */ 1130 U_CAPI void U_EXPORT2 1131 uspoof_closeCheckResult(USpoofCheckResult *checkResult); 1132 1133 /** 1134 * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests 1135 * in question: USPOOF_RESTRICTION_LEVEL, USPOOF_CHAR_LIMIT, and so on. 1136 * 1137 * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} 1138 * @param status The error code, set if an error occurred. 1139 * @return An integer value with bits set for any potential security 1140 * or spoofing issues detected.
The bits are defined by 1141 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1142 * will be zero if the input string passes all of the 1143 * enabled checks. 1144 * @see uspoof_setChecks 1145 * @stable ICU 58 1146 */ 1147 U_CAPI int32_t U_EXPORT2 1148 uspoof_getCheckResultChecks(const USpoofCheckResult *checkResult, UErrorCode *status); 1149 1150 /** 1151 * Gets the restriction level that the text meets, if the USPOOF_RESTRICTION_LEVEL check 1152 * was enabled; otherwise, undefined. 1153 * 1154 * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} 1155 * @param status The error code, set if an error occurred. 1156 * @return The restriction level contained in the USpoofCheckResult 1157 * @see uspoof_setRestrictionLevel 1158 * @stable ICU 58 1159 */ 1160 U_CAPI URestrictionLevel U_EXPORT2 1161 uspoof_getCheckResultRestrictionLevel(const USpoofCheckResult *checkResult, UErrorCode *status); 1162 1163 /** 1164 * Gets the set of numerics found in the string, if the USPOOF_MIXED_NUMBERS check was enabled; 1165 * otherwise, undefined. The set will contain the zero digit from each decimal number system found 1166 * in the input string. Ownership of the returned USet remains with the USpoofCheckResult. 1167 * The USet will be freed when {@link uspoof_closeCheckResult} is called. 1168 * 1169 * @param checkResult The instance of USpoofCheckResult created by {@link uspoof_openCheckResult} 1170 * @param status The error code, set if an error occurred. 1171 * @return The set of numerics contained in the USpoofCheckResult 1172 * @stable ICU 58 1173 */ 1174 U_CAPI const USet* U_EXPORT2 1175 uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *status); 1176 1177 1178 /** 1179 * Check whether two specified strings are visually confusable. 1180 * 1181 * If the strings are confusable, the return value will be nonzero, as long as 1182 * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks().
1183 * 1184 * The bits in the return value correspond to flags for each of the classes of 1185 * confusables applicable to the two input strings. According to UTS 39 1186 * section 4, the possible flags are: 1187 * 1188 * <ul> 1189 * <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li> 1190 * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li> 1191 * <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li> 1192 * </ul> 1193 * 1194 * If one or more of the above flags were not listed in uspoof_setChecks(), this 1195 * function will never report that class of confusable. The check 1196 * {@link USPOOF_CONFUSABLE} enables all three flags. 1197 * 1198 * 1199 * @param sc The USpoofChecker 1200 * @param id1 The first of the two identifiers to be compared for 1201 * confusability. The identifiers are in UTF-16 format. 1202 * @param length1 The length of the first identifier, expressed in 1203 * 16 bit UTF-16 code units, or -1 if the string is 1204 * nul terminated. 1205 * @param id2 The second of the two identifiers to be compared for 1206 * confusability. The identifiers are in UTF-16 format. 1207 * @param length2 The length of the second identifier, expressed in 1208 * 16 bit UTF-16 code units, or -1 if the string is 1209 * nul terminated. 1210 * @param status The error code, set if an error occurred while attempting to 1211 * perform the check. 1212 * Confusability of the identifiers is not reported here, 1213 * but through this function's return value. 1214 * @return An integer value with bit(s) set corresponding to 1215 * the type of confusability found, as defined by 1216 * enum USpoofChecks. Zero is returned if the identifiers 1217 * are not confusable.
1218 * 1219 * @stable ICU 4.2 1220 */ 1221 U_CAPI int32_t U_EXPORT2 1222 uspoof_areConfusable(const USpoofChecker *sc, 1223 const UChar *id1, int32_t length1, 1224 const UChar *id2, int32_t length2, 1225 UErrorCode *status); 1226 1227 #ifndef U_HIDE_DRAFT_API 1228 /** 1229 * Check whether two specified strings are visually confusable when 1230 * displayed in a context with the given paragraph direction. 1231 * 1232 * If the strings are confusable, the return value will be nonzero, as long as 1233 * {@link USPOOF_CONFUSABLE} was enabled in uspoof_setChecks(). 1234 * 1235 * The bits in the return value correspond to flags for each of the classes of 1236 * confusables applicable to the two input strings. According to UTS 39 1237 * section 4, the possible flags are: 1238 * 1239 * <ul> 1240 * <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li> 1241 * <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li> 1242 * <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li> 1243 * </ul> 1244 * 1245 * If one or more of the above flags were not listed in uspoof_setChecks(), this 1246 * function will never report that class of confusable. The check 1247 * {@link USPOOF_CONFUSABLE} enables all three flags. 1248 * 1249 * 1250 * @param sc The USpoofChecker 1251 * @param direction The paragraph direction with which the identifiers are 1252 * displayed. Must be either UBIDI_LTR or UBIDI_RTL. 1253 * @param id1 The first of the two identifiers to be compared for 1254 * confusability. The strings are in UTF-16 format. 1255 * @param length1 the length of the first identifier, expressed in 1256 * 16 bit UTF-16 code units, or -1 if the string is 1257 * nul terminated. 1258 * @param id2 The second of the two identifiers to be compared for 1259 * confusability. The identifiers are in UTF-16 format. 1260 * @param length2 The length of the second identifier, expressed in 1261 * 16 bit UTF-16 code units, or -1 if the string is 1262 * nul terminated.
1263 * @param status The error code, set if an error occurred while attempting to 1264 * perform the check. 1265 * Confusability of the identifiers is not reported here, 1266 * but through this function's return value. 1267 * @return An integer value with bit(s) set corresponding to 1268 * the type of confusability found, as defined by 1269 * enum USpoofChecks. Zero is returned if the identifiers 1270 * are not confusable. 1271 * 1272 * @draft ICU 74 1273 */ 1274 U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusable(const USpoofChecker *sc, UBiDiDirection direction, 1275 const UChar *id1, int32_t length1, 1276 const UChar *id2, int32_t length2, 1277 UErrorCode *status); 1278 #endif /* U_HIDE_DRAFT_API */ 1279 1280 /** 1281 * A version of {@link uspoof_areConfusable} accepting strings in UTF-8 format. 1282 * 1283 * @param sc The USpoofChecker 1284 * @param id1 The first of the two identifiers to be compared for 1285 * confusability. The strings are in UTF-8 format. 1286 * @param length1 the length of the first identifier, in bytes, or -1 1287 * if the string is nul terminated. 1288 * @param id2 The second of the two identifiers to be compared for 1289 * confusability. The strings are in UTF-8 format. 1290 * @param length2 The length of the second string in bytes, or -1 1291 * if the string is nul terminated. 1292 * @param status The error code, set if an error occurred while attempting to 1293 * perform the check. 1294 * Confusability of the strings is not reported here, 1295 * but through this function's return value. 1296 * @return An integer value with bit(s) set corresponding to 1297 * the type of confusability found, as defined by 1298 * enum USpoofChecks. Zero is returned if the strings 1299 * are not confusable.
1300 * 1301 * @stable ICU 4.2 1302 * 1303 * @see uspoof_areConfusable 1304 */ 1305 U_CAPI int32_t U_EXPORT2 1306 uspoof_areConfusableUTF8(const USpoofChecker *sc, 1307 const char *id1, int32_t length1, 1308 const char *id2, int32_t length2, 1309 UErrorCode *status); 1310 1311 #ifndef U_HIDE_DRAFT_API 1312 /** 1313 * A version of {@link uspoof_areBidiConfusable} accepting strings in UTF-8 format. 1314 * 1315 * @param sc The USpoofChecker 1316 * @param direction The paragraph direction with which the identifiers are 1317 * displayed. Must be either UBIDI_LTR or UBIDI_RTL. 1318 * @param id1 The first of the two identifiers to be compared for 1319 * confusability. The strings are in UTF-8 format. 1320 * @param length1 the length of the first identifier, in bytes, or -1 1321 * if the string is nul terminated. 1322 * @param id2 The second of the two identifiers to be compared for 1323 * confusability. The strings are in UTF-8 format. 1324 * @param length2 The length of the second string in bytes, or -1 1325 * if the string is nul terminated. 1326 * @param status The error code, set if an error occurred while attempting to 1327 * perform the check. 1328 * Confusability of the strings is not reported here, 1329 * but through this function's return value. 1330 * @return An integer value with bit(s) set corresponding to 1331 * the type of confusability found, as defined by 1332 * enum USpoofChecks. Zero is returned if the strings 1333 * are not confusable. 1334 * 1335 * @draft ICU 74 1336 * 1337 * @see uspoof_areBidiConfusable 1338 */ 1339 U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUTF8(const USpoofChecker *sc, UBiDiDirection direction, 1340 const char *id1, int32_t length1, 1341 const char *id2, int32_t length2, 1342 UErrorCode *status); 1343 #endif /* U_HIDE_DRAFT_API */ 1344 1345 /** 1346 * Get the "skeleton" for an identifier. 1347 * Skeletons are a transformation of the input identifier; 1348 * Two identifiers are confusable if their skeletons are identical.
1349 * See Unicode Technical Standard #39 for additional information. 1350 * 1351 * Using skeletons directly makes it possible to quickly check 1352 * whether an identifier is confusable with any of some large 1353 * set of existing identifiers, by creating an efficiently 1354 * searchable collection of the skeletons. 1355 * 1356 * @param sc The USpoofChecker 1357 * @param type Deprecated in ICU 58. You may pass any number. 1358 * Originally, controlled which of the Unicode confusable data 1359 * tables to use. 1360 * @param id The input identifier whose skeleton will be computed. 1361 * @param length The length of the input identifier, expressed in 16 bit 1362 * UTF-16 code units, or -1 if the string is zero terminated. 1363 * @param dest The output buffer, to receive the skeleton string. 1364 * @param destCapacity The length of the output buffer, in 16 bit units. 1365 * The destCapacity may be zero, in which case the function will 1366 * return the actual length of the skeleton. 1367 * @param status The error code, set if an error occurred while attempting to 1368 * perform the check. 1369 * @return The length of the skeleton string. The returned length 1370 * is always that of the complete skeleton, even when the 1371 * supplied buffer is too small (or of zero length) 1372 * 1373 * @stable ICU 4.2 1374 * @see uspoof_areConfusable 1375 */ 1376 U_CAPI int32_t U_EXPORT2 1377 uspoof_getSkeleton(const USpoofChecker *sc, 1378 uint32_t type, 1379 const UChar *id, int32_t length, 1380 UChar *dest, int32_t destCapacity, 1381 UErrorCode *status); 1382 1383 #ifndef U_HIDE_DRAFT_API 1384 /** 1385 * Get the "bidiSkeleton" for an identifier and a direction. 1386 * Skeletons are a transformation of the input identifier; 1387 * Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical; 1388 * they are RTL-confusable if their RTL bidiSkeletons are identical. 
1389 * See Unicode Technical Standard #39 for additional information: 1390 * https://www.unicode.org/reports/tr39/#Confusable_Detection. 1391 * 1392 * Using skeletons directly makes it possible to quickly check 1393 * whether an identifier is confusable with any of some large 1394 * set of existing identifiers, by creating an efficiently 1395 * searchable collection of the skeletons. 1396 * 1397 * @param sc The USpoofChecker. 1398 * @param direction The context direction with which the identifier will be 1399 * displayed. Must be either UBIDI_LTR or UBIDI_RTL. 1400 * @param id The input identifier whose skeleton will be computed. 1401 * @param length The length of the input identifier, expressed in 16 bit 1402 * UTF-16 code units, or -1 if the string is zero terminated. 1403 * @param dest The output buffer, to receive the skeleton string. 1404 * @param destCapacity The length of the output buffer, in 16 bit units. 1405 * The destCapacity may be zero, in which case the function will 1406 * return the actual length of the skeleton. 1407 * @param status The error code, set if an error occurred while attempting to 1408 * perform the check. 1409 * @return The length of the skeleton string. The returned length 1410 * is always that of the complete skeleton, even when the 1411 * supplied buffer is too small (or of zero length) 1412 * 1413 * @draft ICU 74 1414 * @see uspoof_areBidiConfusable 1415 */ 1416 U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeleton(const USpoofChecker *sc, 1417 UBiDiDirection direction, 1418 const UChar *id, int32_t length, 1419 UChar *dest, int32_t destCapacity, UErrorCode *status); 1420 #endif /* U_HIDE_DRAFT_API */ 1421 1422 /** 1423 * Get the "skeleton" for an identifier. 1424 * Skeletons are a transformation of the input identifier; 1425 * Two identifiers are confusable if their skeletons are identical. 1426 * See Unicode Technical Standard #39 for additional information. 
1427 * 1428 * Using skeletons directly makes it possible to quickly check 1429 * whether an identifier is confusable with any of some large 1430 * set of existing identifiers, by creating an efficiently 1431 * searchable collection of the skeletons. 1432 * 1433 * @param sc The USpoofChecker 1434 * @param type Deprecated in ICU 58. You may pass any number. 1435 * Originally, controlled which of the Unicode confusable data 1436 * tables to use. 1437 * @param id The UTF-8 format identifier whose skeleton will be computed. 1438 * @param length The length of the input string, in bytes, 1439 * or -1 if the string is zero terminated. 1440 * @param dest The output buffer, to receive the skeleton string. 1441 * @param destCapacity The length of the output buffer, in bytes. 1442 * The destCapacity may be zero, in which case the function will 1443 * return the actual length of the skeleton. 1444 * @param status The error code, set if an error occurred while attempting to 1445 * perform the check. Possible Errors include U_INVALID_CHAR_FOUND 1446 * for invalid UTF-8 sequences, and 1447 * U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small 1448 * to hold the complete skeleton. 1449 * @return The length of the skeleton string, in bytes. The returned length 1450 * is always that of the complete skeleton, even when the 1451 * supplied buffer is too small (or of zero length) 1452 * 1453 * @stable ICU 4.2 1454 */ 1455 U_CAPI int32_t U_EXPORT2 1456 uspoof_getSkeletonUTF8(const USpoofChecker *sc, 1457 uint32_t type, 1458 const char *id, int32_t length, 1459 char *dest, int32_t destCapacity, 1460 UErrorCode *status); 1461 1462 #ifndef U_HIDE_DRAFT_API 1463 /** 1464 * Get the "bidiSkeleton" for an identifier and a direction. 1465 * Skeletons are a transformation of the input identifier; 1466 * Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical; 1467 * they are RTL-confusable if their RTL bidiSkeletons are identical. 
1468 * See Unicode Technical Standard #39 for additional information: 1469 * https://www.unicode.org/reports/tr39/#Confusable_Detection. 1470 * 1471 * Using skeletons directly makes it possible to quickly check 1472 * whether an identifier is confusable with any of some large 1473 * set of existing identifiers, by creating an efficiently 1474 * searchable collection of the skeletons. 1475 * 1476 * @param sc The USpoofChecker 1477 * @param direction The context direction with which the identifier will be 1478 * displayed. Must be either UBIDI_LTR or UBIDI_RTL. 1479 * @param id The UTF-8 format identifier whose skeleton will be computed. 1480 * @param length The length of the input string, in bytes, 1481 * or -1 if the string is zero terminated. 1482 * @param dest The output buffer, to receive the skeleton string. 1483 * @param destCapacity The length of the output buffer, in bytes. 1484 * The destCapacity may be zero, in which case the function will 1485 * return the actual length of the skeleton. 1486 * @param status The error code, set if an error occurred while attempting to 1487 * perform the check. Possible Errors include U_INVALID_CHAR_FOUND 1488 * for invalid UTF-8 sequences, and 1489 * U_BUFFER_OVERFLOW_ERROR if the destination buffer is too small 1490 * to hold the complete skeleton. 1491 * @return The length of the skeleton string, in bytes. 
The returned length 1492 * is always that of the complete skeleton, even when the 1493 * supplied buffer is too small (or of zero length) 1494 * 1495 * @draft ICU 74 1496 */ 1497 U_CAPI int32_t U_EXPORT2 uspoof_getBidiSkeletonUTF8(const USpoofChecker *sc, UBiDiDirection direction, 1498 const char *id, int32_t length, char *dest, 1499 int32_t destCapacity, UErrorCode *status); 1500 #endif /* U_HIDE_DRAFT_API */ 1501 1502 /** 1503 * Get the set of Candidate Characters for Inclusion in Identifiers, as defined 1504 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1505 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1506 * 1507 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1508 * be deleted by the caller. 1509 * 1510 * @param status The error code, set if a problem occurs while creating the set. 1511 * 1512 * @stable ICU 51 1513 */ 1514 U_CAPI const USet * U_EXPORT2 1515 uspoof_getInclusionSet(UErrorCode *status); 1516 1517 /** 1518 * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined 1519 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1520 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1521 * 1522 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1523 * be deleted by the caller. 1524 * 1525 * @param status The error code, set if a problem occurs while creating the set. 1526 * 1527 * @stable ICU 51 1528 */ 1529 U_CAPI const USet * U_EXPORT2 1530 uspoof_getRecommendedSet(UErrorCode *status); 1531 1532 /** 1533 * Serialize the data for a spoof detector into a chunk of memory. 1534 * The flattened spoof detection tables can later be used to efficiently 1535 * instantiate a new Spoof Detector. 
1536 * 1537 * The serialized spoof checker includes only the data compiled from the 1538 * Unicode data tables by uspoof_openFromSource(); it does not include 1539 * any other state or configuration that may have been set. 1540 * 1541 * @param sc the Spoof Detector whose data is to be serialized. 1542 * @param data a pointer to 32-bit-aligned memory to be filled with the data, 1543 * can be NULL if capacity==0 1544 * @param capacity the number of bytes available at data, 1545 * or 0 for preflighting 1546 * @param status an in/out ICU UErrorCode; possible errors include: 1547 * - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization 1548 * - U_ILLEGAL_ARGUMENT_ERROR the data or capacity parameters are bad 1549 * @return the number of bytes written or needed for the spoof data 1550 * 1551 * @see uspoof_openFromSerialized() 1552 * @stable ICU 4.2 1553 */ 1554 U_CAPI int32_t U_EXPORT2 1555 uspoof_serialize(USpoofChecker *sc, 1556 void *data, int32_t capacity, 1557 UErrorCode *status); 1558 1559 U_CDECL_END 1560 1561 #if U_SHOW_CPLUSPLUS_API 1562 1563 U_NAMESPACE_BEGIN 1564 1565 /** 1566 * \class LocalUSpoofCheckerPointer 1567 * "Smart pointer" class, closes a USpoofChecker via uspoof_close(). 1568 * For most methods see the LocalPointerBase base class. 1569 * 1570 * @see LocalPointerBase 1571 * @see LocalPointer 1572 * @stable ICU 4.4 1573 */ 1574 /** 1575 * \cond 1576 * Note: Doxygen is giving a bogus warning on this U_DEFINE_LOCAL_OPEN_POINTER. 1577 * For now, suppress with a Doxygen cond 1578 */ 1579 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckerPointer, USpoofChecker, uspoof_close); 1580 /** \endcond */ 1581 1582 /** 1583 * \class LocalUSpoofCheckResultPointer 1584 * "Smart pointer" class, closes a USpoofCheckResult via `uspoof_closeCheckResult()`. 1585 * For most methods see the LocalPointerBase base class.
1586 * 1587 * @see LocalPointerBase 1588 * @see LocalPointer 1589 * @stable ICU 58 1590 */ 1591 1592 /** 1593 * \cond 1594 * Note: Doxygen is giving a bogus warning on this U_DEFINE_LOCAL_OPEN_POINTER. 1595 * For now, suppress with a Doxygen cond 1596 */ 1597 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSpoofCheckResultPointer, USpoofCheckResult, uspoof_closeCheckResult); 1598 /** \endcond */ 1599 1600 U_NAMESPACE_END 1601 1602 /** 1603 * Limit the acceptable characters to those specified by a Unicode Set. 1604 * Any previously specified character limit 1605 * is replaced by the new settings. This includes limits on 1606 * characters that were set with the uspoof_setAllowedLocales() function. 1607 * 1608 * The USPOOF_CHAR_LIMIT test is automatically enabled for this 1609 * USpoofChecker by this function. 1610 * 1611 * @param sc The USpoofChecker 1612 * @param chars A Unicode Set containing the list of 1613 * characters that are permitted. Ownership of the set 1614 * remains with the caller. The incoming set is cloned by 1615 * this function, so there are no restrictions on modifying 1616 * or deleting the UnicodeSet after calling this function. 1617 * @param status The error code, set if this function encounters a problem. 1618 * @stable ICU 4.2 1619 */ 1620 U_CAPI void U_EXPORT2 1621 uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const icu::UnicodeSet *chars, UErrorCode *status); 1622 1623 1624 /** 1625 * Get a UnicodeSet for the characters permitted in an identifier. 1626 * This corresponds to the limits imposed by the Set Allowed Characters / 1627 * UnicodeSet functions. Limitations imposed by other checks will not be 1628 * reflected in the set returned by this function. 1629 * 1630 * The returned set will be frozen, meaning that it cannot be modified 1631 * by the caller. 1632 * 1633 * Ownership of the returned set remains with the Spoof Detector.
The 1634 * returned set will become invalid if the spoof detector is closed, 1635 * or if a new set of allowed characters is specified. 1636 * 1637 * 1638 * @param sc The USpoofChecker 1639 * @param status The error code, set if this function encounters a problem. 1640 * @return A UnicodeSet containing the characters that are permitted by 1641 * the USPOOF_CHAR_LIMIT test. 1642 * @stable ICU 4.2 1643 */ 1644 U_CAPI const icu::UnicodeSet * U_EXPORT2 1645 uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status); 1646 1647 /** 1648 * Check the specified string for possible security issues. 1649 * The text to be checked will typically be an identifier of some sort. 1650 * The set of checks to be performed is specified with uspoof_setChecks(). 1651 * 1652 * \note 1653 * Consider using the newer API, {@link uspoof_check2UnicodeString}, instead. 1654 * The newer API exposes additional information from the check procedure 1655 * and is otherwise identical to this method. 1656 * 1657 * @param sc The USpoofChecker 1658 * @param id An identifier to be checked for possible security issues. 1659 * @param position Deprecated in ICU 51. Always returns zero. 1660 * Originally, an out parameter for the index of the first 1661 * string position that failed a check. 1662 * This parameter may be nullptr. 1663 * @param status The error code, set if an error occurred while attempting to 1664 * perform the check. 1665 * Spoofing or security issues detected with the input string are 1666 * not reported here, but through the function's return value. 1667 * @return An integer value with bits set for any potential security 1668 * or spoofing issues detected. The bits are defined by 1669 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1670 * will be zero if the input string passes all of the 1671 * enabled checks.
1672 * @see uspoof_check2UnicodeString 1673 * @stable ICU 4.2 1674 */ 1675 U_CAPI int32_t U_EXPORT2 1676 uspoof_checkUnicodeString(const USpoofChecker *sc, 1677 const icu::UnicodeString &id, 1678 int32_t *position, 1679 UErrorCode *status); 1680 1681 /** 1682 * Check the specified string for possible security issues. 1683 * The text to be checked will typically be an identifier of some sort. 1684 * The set of checks to be performed is specified with uspoof_setChecks(). 1685 * 1686 * @param sc The USpoofChecker 1687 * @param id An identifier to be checked for possible security issues. 1688 * @param checkResult An instance of USpoofCheckResult to be filled with 1689 * details about the identifier. Can be nullptr. 1690 * @param status The error code, set if an error occurred while attempting to 1691 * perform the check. 1692 * Spoofing or security issues detected with the input string are 1693 * not reported here, but through the function's return value. 1694 * @return An integer value with bits set for any potential security 1695 * or spoofing issues detected. The bits are defined by 1696 * enum USpoofChecks. (returned_value & USPOOF_ALL_CHECKS) 1697 * will be zero if the input string passes all of the 1698 * enabled checks. Any information in this bitmask will be 1699 * consistent with the information saved in the optional 1700 * checkResult parameter. 1701 * @see uspoof_openCheckResult 1702 * @see uspoof_check2 1703 * @see uspoof_check2UTF8 1704 * @stable ICU 58 1705 */ 1706 U_CAPI int32_t U_EXPORT2 1707 uspoof_check2UnicodeString(const USpoofChecker *sc, 1708 const icu::UnicodeString &id, 1709 USpoofCheckResult* checkResult, 1710 UErrorCode *status); 1711 1712 /** 1713 * A version of {@link uspoof_areConfusable} accepting UnicodeStrings. 1714 * 1715 * @param sc The USpoofChecker 1716 * @param s1 The first of the two identifiers to be compared for 1717 * confusability. The strings are passed as UnicodeStrings.
1718 * @param s2 The second of the two identifiers to be compared for 1719 * confusability. The strings are passed as UnicodeStrings. 1720 * @param status The error code, set if an error occurred while attempting to 1721 * perform the check. 1722 * Confusability of the identifiers is not reported here, 1723 * but through this function's return value. 1724 * @return An integer value with bit(s) set corresponding to 1725 * the type of confusability found, as defined by 1726 * enum USpoofChecks. Zero is returned if the identifiers 1727 * are not confusable. 1728 * 1729 * @stable ICU 4.2 1730 * 1731 * @see uspoof_areConfusable 1732 */ 1733 U_CAPI int32_t U_EXPORT2 1734 uspoof_areConfusableUnicodeString(const USpoofChecker *sc, 1735 const icu::UnicodeString &s1, 1736 const icu::UnicodeString &s2, 1737 UErrorCode *status); 1738 1739 #ifndef U_HIDE_DRAFT_API 1740 /** 1741 * A version of {@link uspoof_areBidiConfusable} accepting UnicodeStrings. 1742 * 1743 * @param sc The USpoofChecker 1744 * @param direction The paragraph direction with which the identifiers are 1745 * displayed. Must be either UBIDI_LTR or UBIDI_RTL. 1746 * @param s1 The first of the two identifiers to be compared for 1747 * confusability. The strings are passed as UnicodeStrings. 1748 * @param s2 The second of the two identifiers to be compared for 1749 * confusability. The strings are passed as UnicodeStrings. 1750 * @param status The error code, set if an error occurred while attempting to 1751 * perform the check. 1752 * Confusability of the identifiers is not reported here, 1753 * but through this function's return value. 1754 * @return An integer value with bit(s) set corresponding to 1755 * the type of confusability found, as defined by 1756 * enum USpoofChecks. Zero is returned if the identifiers 1757 * are not confusable.
1758 * 1759 * @draft ICU 74 1760 * 1761 * @see uspoof_areBidiConfusable 1762 */ 1763 U_CAPI uint32_t U_EXPORT2 uspoof_areBidiConfusableUnicodeString(const USpoofChecker *sc, 1764 UBiDiDirection direction, 1765 const icu::UnicodeString &s1, 1766 const icu::UnicodeString &s2, 1767 UErrorCode *status); 1768 #endif /* U_HIDE_DRAFT_API */ 1769 1770 /** 1771 * Get the "skeleton" for an identifier. 1772 * Skeletons are a transformation of the input identifier; 1773 * Two identifiers are confusable if their skeletons are identical. 1774 * See Unicode Technical Standard #39 for additional information. 1775 * 1776 * Using skeletons directly makes it possible to quickly check 1777 * whether an identifier is confusable with any of some large 1778 * set of existing identifiers, by creating an efficiently 1779 * searchable collection of the skeletons. 1780 * 1781 * @param sc The USpoofChecker. 1782 * @param type Deprecated in ICU 58. You may pass any number. 1783 * Originally, controlled which of the Unicode confusable data 1784 * tables to use. 1785 * @param id The input identifier whose skeleton will be computed. 1786 * @param dest The output identifier, to receive the skeleton string. 1787 * @param status The error code, set if an error occurred while attempting to 1788 * perform the check. 1789 * @return A reference to the destination (skeleton) string. 1790 * 1791 * @stable ICU 4.2 1792 */ 1793 U_I18N_API icu::UnicodeString & U_EXPORT2 1794 uspoof_getSkeletonUnicodeString(const USpoofChecker *sc, 1795 uint32_t type, 1796 const icu::UnicodeString &id, 1797 icu::UnicodeString &dest, 1798 UErrorCode *status); 1799 1800 #ifndef U_HIDE_DRAFT_API 1801 /** 1802 * Get the "bidiSkeleton" for an identifier and a direction. 1803 * Skeletons are a transformation of the input identifier; 1804 * Two identifiers are LTR-confusable if their LTR bidiSkeletons are identical; 1805 * they are RTL-confusable if their RTL bidiSkeletons are identical. 
1806 * See Unicode Technical Standard #39 for additional information: 1807 * https://www.unicode.org/reports/tr39/#Confusable_Detection. 1808 * 1809 * Using skeletons directly makes it possible to quickly check 1810 * whether an identifier is confusable with any of some large 1811 * set of existing identifiers, by creating an efficiently 1812 * searchable collection of the skeletons. 1813 * 1814 * @param sc The USpoofChecker. 1815 * @param direction The context direction with which the identifier will be 1816 * displayed. Must be either UBIDI_LTR or UBIDI_RTL. 1817 * @param id The input identifier whose bidiSkeleton will be computed. 1818 * @param dest The output identifier, to receive the skeleton string. 1819 * @param status The error code, set if an error occurred while attempting to 1820 * perform the check. 1821 * @return A reference to the destination (skeleton) string. 1822 * 1823 * @draft ICU 74 1824 */ 1825 U_I18N_API icu::UnicodeString &U_EXPORT2 uspoof_getBidiSkeletonUnicodeString( 1826 const USpoofChecker *sc, UBiDiDirection direction, const icu::UnicodeString &id, 1827 icu::UnicodeString &dest, UErrorCode *status); 1828 #endif /* U_HIDE_DRAFT_API */ 1829 1830 /** 1831 * Get the set of Candidate Characters for Inclusion in Identifiers, as defined 1832 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1833 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1834 * 1835 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1836 * be deleted by the caller. 1837 * 1838 * @param status The error code, set if a problem occurs while creating the set.
1839 * 1840 * @stable ICU 51 1841 */ 1842 U_CAPI const icu::UnicodeSet * U_EXPORT2 1843 uspoof_getInclusionUnicodeSet(UErrorCode *status); 1844 1845 /** 1846 * Get the set of characters from Recommended Scripts for Inclusion in Identifiers, as defined 1847 * in http://unicode.org/Public/security/latest/xidmodifications.txt 1848 * and documented in http://www.unicode.org/reports/tr39/, Unicode Security Mechanisms. 1849 * 1850 * The returned set is frozen. Ownership of the set remains with the ICU library; it must not 1851 * be deleted by the caller. 1852 * 1853 * @param status The error code, set if a problem occurs while creating the set. 1854 * 1855 * @stable ICU 51 1856 */ 1857 U_CAPI const icu::UnicodeSet * U_EXPORT2 1858 uspoof_getRecommendedUnicodeSet(UErrorCode *status); 1859 1860 #endif /* U_SHOW_CPLUSPLUS_API */ 1861 1862 #endif /* UCONFIG_NO_NORMALIZATION */ 1863 1864 #endif /* USPOOF_H */
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|