|
||||
Warning, file /include/unicode/uregex.h was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).
0001 // © 2016 and later: Unicode, Inc. and others. 0002 // License & terms of use: http://www.unicode.org/copyright.html 0003 /* 0004 ********************************************************************** 0005 * Copyright (C) 2004-2016, International Business Machines 0006 * Corporation and others. All Rights Reserved. 0007 ********************************************************************** 0008 * file name: uregex.h 0009 * encoding: UTF-8 0010 * indentation:4 0011 * 0012 * created on: 2004mar09 0013 * created by: Andy Heninger 0014 * 0015 * ICU Regular Expressions, API for C 0016 */ 0017 0018 /** 0019 * \file 0020 * \brief C API: Regular Expressions 0021 * 0022 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> 0023 */ 0024 0025 #ifndef UREGEX_H 0026 #define UREGEX_H 0027 0028 #include "unicode/utext.h" 0029 #include "unicode/utypes.h" 0030 0031 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 0032 0033 #include "unicode/parseerr.h" 0034 0035 #if U_SHOW_CPLUSPLUS_API 0036 #include "unicode/localpointer.h" 0037 #endif // U_SHOW_CPLUSPLUS_API 0038 0039 struct URegularExpression; 0040 /** 0041 * Structure representing a compiled regular expression, plus the results 0042 * of a match operation. 0043 * @stable ICU 3.0 0044 */ 0045 typedef struct URegularExpression URegularExpression; 0046 0047 0048 /** 0049 * Constants for Regular Expression Match Modes. 0050 * @stable ICU 2.4 0051 */ 0052 typedef enum URegexpFlag{ 0053 0054 #ifndef U_HIDE_DRAFT_API 0055 /** Forces normalization of pattern and strings. 0056 Not implemented yet, just a placeholder, hence draft. 0057 @draft ICU 2.4 */ 0058 UREGEX_CANON_EQ = 128, 0059 #endif /* U_HIDE_DRAFT_API */ 0060 /** Enable case insensitive matching. @stable ICU 2.4 */ 0061 UREGEX_CASE_INSENSITIVE = 2, 0062 0063 /** Allow white space and comments within patterns @stable ICU 2.4 */ 0064 UREGEX_COMMENTS = 4, 0065 0066 /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. 
0067 * @stable ICU 2.4 */ 0068 UREGEX_DOTALL = 32, 0069 0070 /** If set, treat the entire pattern as a literal string. 0071 * Metacharacters or escape sequences in the input sequence will be given 0072 * no special meaning. 0073 * 0074 * The flag UREGEX_CASE_INSENSITIVE retains its impact 0075 * on matching when used in conjunction with this flag. 0076 * The other flags become superfluous. 0077 * 0078 * @stable ICU 4.0 0079 */ 0080 UREGEX_LITERAL = 16, 0081 0082 /** Control behavior of "$" and "^" 0083 * If set, recognize line terminators within string, 0084 * otherwise, match only at start and end of input string. 0085 * @stable ICU 2.4 */ 0086 UREGEX_MULTILINE = 8, 0087 0088 /** Unix-only line endings. 0089 * When this mode is enabled, only \\u000a is recognized as a line ending 0090 * in the behavior of ., ^, and $. 0091 * @stable ICU 4.0 0092 */ 0093 UREGEX_UNIX_LINES = 1, 0094 0095 /** Unicode word boundaries. 0096 * If set, \b uses the Unicode TR 29 definition of word boundaries. 0097 * Warning: Unicode word boundaries are quite different from 0098 * traditional regular expression word boundaries. See 0099 * http://unicode.org/reports/tr29/#Word_Boundaries 0100 * @stable ICU 2.8 0101 */ 0102 UREGEX_UWORD = 256, 0103 0104 /** Error on Unrecognized backslash escapes. 0105 * If set, fail with an error on patterns that contain 0106 * backslash-escaped ASCII letters without a known special 0107 * meaning. If this flag is not set, these 0108 * escaped letters represent themselves. 0109 * @stable ICU 4.0 0110 */ 0111 UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 0112 0113 } URegexpFlag; 0114 0115 /** 0116 * Open (compile) an ICU regular expression. Compiles the regular expression in 0117 * string form into an internal representation using the specified match mode flags. 0118 * The resulting regular expression handle can then be used to perform various 0119 * matching operations. 0120 * 0121 * 0122 * @param pattern The Regular Expression pattern to be compiled. 
0123 * @param patternLength The length of the pattern, or -1 if the pattern is 0124 * NUL terminated. 0125 * @param flags Flags that alter the default matching behavior for 0126 * the regular expression, UREGEX_CASE_INSENSITIVE, for 0127 * example. For default behavior, set this parameter to zero. 0128 * See <code>enum URegexpFlag</code>. All desired flags 0129 * are bitwise-ORed together. 0130 * @param pe Receives the position (line and column numbers) of any syntax 0131 * error within the source regular expression string. If this 0132 * information is not wanted, pass NULL for this parameter. 0133 * @param status Receives error detected by this function. 0134 * @stable ICU 3.0 0135 * 0136 */ 0137 U_CAPI URegularExpression * U_EXPORT2 0138 uregex_open( const UChar *pattern, 0139 int32_t patternLength, 0140 uint32_t flags, 0141 UParseError *pe, 0142 UErrorCode *status); 0143 0144 /** 0145 * Open (compile) an ICU regular expression. Compiles the regular expression in 0146 * string form into an internal representation using the specified match mode flags. 0147 * The resulting regular expression handle can then be used to perform various 0148 * matching operations. 0149 * <p> 0150 * The contents of the pattern UText will be extracted and saved. Ownership of the 0151 * UText struct itself remains with the caller. This is to match the behavior of 0152 * uregex_open(). 0153 * 0154 * @param pattern The Regular Expression pattern to be compiled. 0155 * @param flags Flags that alter the default matching behavior for 0156 * the regular expression, UREGEX_CASE_INSENSITIVE, for 0157 * example. For default behavior, set this parameter to zero. 0158 * See <code>enum URegexpFlag</code>. All desired flags 0159 * are bitwise-ORed together. 0160 * @param pe Receives the position (line and column numbers) of any syntax 0161 * error within the source regular expression string. If this 0162 * information is not wanted, pass NULL for this parameter. 
0163 * @param status Receives error detected by this function. 0164 * 0165 * @stable ICU 4.6 0166 */ 0167 U_CAPI URegularExpression * U_EXPORT2 0168 uregex_openUText(UText *pattern, 0169 uint32_t flags, 0170 UParseError *pe, 0171 UErrorCode *status); 0172 0173 #if !UCONFIG_NO_CONVERSION 0174 /** 0175 * Open (compile) an ICU regular expression. The resulting regular expression 0176 * handle can then be used to perform various matching operations. 0177 * <p> 0178 * This function is the same as uregex_open, except that the pattern 0179 * is supplied as an 8 bit char * string in the default code page. 0180 * 0181 * @param pattern The Regular Expression pattern to be compiled, 0182 * NUL terminated. 0183 * @param flags Flags that alter the default matching behavior for 0184 * the regular expression, UREGEX_CASE_INSENSITIVE, for 0185 * example. For default behavior, set this parameter to zero. 0186 * See <code>enum URegexpFlag</code>. All desired flags 0187 * are bitwise-ORed together. 0188 * @param pe Receives the position (line and column numbers) of any syntax 0189 * error within the source regular expression string. If this 0190 * information is not wanted, pass NULL for this parameter. 0191 * @param status Receives errors detected by this function. 0192 * @return The URegularExpression object representing the compiled 0193 * pattern. 0194 * 0195 * @stable ICU 3.0 0196 */ 0197 U_CAPI URegularExpression * U_EXPORT2 0198 uregex_openC( const char *pattern, 0199 uint32_t flags, 0200 UParseError *pe, 0201 UErrorCode *status); 0202 #endif 0203 0204 0205 0206 /** 0207 * Close the regular expression, recovering all resources (memory) it 0208 * was holding. 0209 * 0210 * @param regexp The regular expression to be closed. 
0211 * @stable ICU 3.0 0212 */ 0213 U_CAPI void U_EXPORT2 0214 uregex_close(URegularExpression *regexp); 0215 0216 #if U_SHOW_CPLUSPLUS_API 0217 0218 U_NAMESPACE_BEGIN 0219 0220 /** 0221 * \class LocalURegularExpressionPointer 0222 * "Smart pointer" class, closes a URegularExpression via uregex_close(). 0223 * For most methods see the LocalPointerBase base class. 0224 * 0225 * @see LocalPointerBase 0226 * @see LocalPointer 0227 * @stable ICU 4.4 0228 */ 0229 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close); 0230 0231 U_NAMESPACE_END 0232 0233 #endif 0234 0235 /** 0236 * Make a copy of a compiled regular expression. Cloning a regular 0237 * expression is faster than opening a second instance from the source 0238 * form of the expression, and requires less memory. 0239 * <p> 0240 * Note that the current input string and the position of any matched text 0241 * within it are not cloned; only the pattern itself and the 0242 * match mode flags are copied. 0243 * <p> 0244 * Cloning can be particularly useful to threaded applications that perform 0245 * multiple match operations in parallel. Each concurrent RE 0246 * operation requires its own instance of a URegularExpression. 0247 * 0248 * @param regexp The compiled regular expression to be cloned. 0249 * @param status Receives indication of any errors encountered 0250 * @return the cloned copy of the compiled regular expression. 0251 * @stable ICU 3.0 0252 */ 0253 U_CAPI URegularExpression * U_EXPORT2 0254 uregex_clone(const URegularExpression *regexp, UErrorCode *status); 0255 0256 /** 0257 * Returns a pointer to the source form of the pattern for this regular expression. 0258 * This function will work even if the pattern was originally specified as a UText. 0259 * 0260 * @param regexp The compiled regular expression. 0261 * @param patLength This output parameter will be set to the length of the 0262 * pattern string. 
A NULL pointer may be used here if the 0263 * pattern length is not needed, as would be the case if 0264 * the pattern is known in advance to be a NUL terminated 0265 * string. 0266 * @param status Receives errors detected by this function. 0267 * @return a pointer to the pattern string. The storage for the string is 0268 * owned by the regular expression object, and must not be 0269 * altered or deleted by the application. The returned string 0270 * will remain valid until the regular expression is closed. 0271 * @stable ICU 3.0 0272 */ 0273 U_CAPI const UChar * U_EXPORT2 0274 uregex_pattern(const URegularExpression *regexp, 0275 int32_t *patLength, 0276 UErrorCode *status); 0277 0278 /** 0279 * Returns the source text of the pattern for this regular expression. 0280 * This function will work even if the pattern was originally specified as a UChar string. 0281 * 0282 * @param regexp The compiled regular expression. 0283 * @param status Receives errors detected by this function. 0284 * @return the pattern text. The storage for the text is owned by the regular expression 0285 * object, and must not be altered or deleted. 0286 * 0287 * @stable ICU 4.6 0288 */ 0289 U_CAPI UText * U_EXPORT2 0290 uregex_patternUText(const URegularExpression *regexp, 0291 UErrorCode *status); 0292 0293 /** 0294 * Get the match mode flags that were specified when compiling this regular expression. 0295 * @param status Receives errors detected by this function. 0296 * @param regexp The compiled regular expression. 0297 * @return The match mode flags 0298 * @see URegexpFlag 0299 * @stable ICU 3.0 0300 */ 0301 U_CAPI int32_t U_EXPORT2 0302 uregex_flags(const URegularExpression *regexp, 0303 UErrorCode *status); 0304 0305 0306 /** 0307 * Set the subject text string upon which the regular expression will look for matches. 0308 * This function may be called any number of times, allowing the regular 0309 * expression pattern to be applied to different strings. 
0310 * <p> 0311 * Regular expression matching operations work directly on the application's 0312 * string data. No copy is made. The subject string data must not be 0313 * altered after calling this function until after all regular expression 0314 * operations involving this string data are completed. 0315 * <p> 0316 * Zero length strings are permitted. In this case, no subsequent match 0317 * operation will dereference the text string pointer. 0318 * 0319 * @param regexp The compiled regular expression. 0320 * @param text The subject text string. 0321 * @param textLength The length of the subject text, or -1 if the string 0322 * is NUL terminated. 0323 * @param status Receives errors detected by this function. 0324 * @stable ICU 3.0 0325 */ 0326 U_CAPI void U_EXPORT2 0327 uregex_setText(URegularExpression *regexp, 0328 const UChar *text, 0329 int32_t textLength, 0330 UErrorCode *status); 0331 0332 0333 /** 0334 * Set the subject text string upon which the regular expression will look for matches. 0335 * This function may be called any number of times, allowing the regular 0336 * expression pattern to be applied to different strings. 0337 * <p> 0338 * Regular expression matching operations work directly on the application's 0339 * string data; only a shallow clone is made. The subject string data must not be 0340 * altered after calling this function until after all regular expression 0341 * operations involving this string data are completed. 0342 * 0343 * @param regexp The compiled regular expression. 0344 * @param text The subject text string. 0345 * @param status Receives errors detected by this function. 0346 * 0347 * @stable ICU 4.6 0348 */ 0349 U_CAPI void U_EXPORT2 0350 uregex_setUText(URegularExpression *regexp, 0351 UText *text, 0352 UErrorCode *status); 0353 0354 /** 0355 * Get the subject text that is currently associated with this 0356 * regular expression object. If the input was supplied using uregex_setText(), 0357 * that pointer will be returned. 
Otherwise, the characters in the input will 0358 * be extracted to a buffer and returned. In either case, ownership remains 0359 * with the regular expression object. 0360 * 0361 * This function will work even if the input was originally specified as a UText. 0362 * 0363 * @param regexp The compiled regular expression. 0364 * @param textLength The length of the string is returned in this output parameter. 0365 * A NULL pointer may be used here if the 0366 * text length is not needed, as would be the case if 0367 * the text is known in advance to be a NUL terminated 0368 * string. 0369 * @param status Receives errors detected by this function. 0370 * @return Pointer to the subject text string currently associated with 0371 * this regular expression. 0372 * @stable ICU 3.0 0373 */ 0374 U_CAPI const UChar * U_EXPORT2 0375 uregex_getText(URegularExpression *regexp, 0376 int32_t *textLength, 0377 UErrorCode *status); 0378 0379 /** 0380 * Get the subject text that is currently associated with this 0381 * regular expression object. 0382 * 0383 * This function will work even if the input was originally specified as a UChar string. 0384 * 0385 * @param regexp The compiled regular expression. 0386 * @param dest A mutable UText in which to store the current input. 0387 * If NULL, a new UText will be created as an immutable shallow clone 0388 * of the actual input string. 0389 * @param status Receives errors detected by this function. 0390 * @return The subject text currently associated with this regular expression. 0391 * If a pre-allocated UText was provided, it will always be used and returned. 0392 * 0393 * @stable ICU 4.6 0394 */ 0395 U_CAPI UText * U_EXPORT2 0396 uregex_getUText(URegularExpression *regexp, 0397 UText *dest, 0398 UErrorCode *status); 0399 0400 /** 0401 * Set the subject text string upon which the regular expression is looking for matches 0402 * without changing any other aspect of the matching state. 
0403 * The new and previous text strings must have the same content. 0404 * 0405 * This function is intended for use in environments where ICU is operating on 0406 * strings that may move around in memory. It provides a mechanism for notifying 0407 * ICU that the string has been relocated, and providing a new UText to access the 0408 * string in its new position. 0409 * 0410 * Note that the regular expression implementation never copies the underlying text 0411 * of a string being matched, but always operates directly on the original text 0412 * provided by the user. Refreshing simply drops the references to the old text 0413 * and replaces them with references to the new. 0414 * 0415 * Caution: this function is normally used only by very specialized 0416 * system-level code. One example use case is with garbage collection 0417 * that moves the text in memory. 0418 * 0419 * @param regexp The compiled regular expression. 0420 * @param text The new (moved) text string. 0421 * @param status Receives errors detected by this function. 0422 * 0423 * @stable ICU 4.8 0424 */ 0425 U_CAPI void U_EXPORT2 0426 uregex_refreshUText(URegularExpression *regexp, 0427 UText *text, 0428 UErrorCode *status); 0429 0430 /** 0431 * Attempts to match the input string against the pattern. 0432 * To succeed, the match must extend to the end of the string, 0433 * or cover the complete match region. 0434 * 0435 * If startIndex >= zero the match operation starts at the specified 0436 * index and must extend to the end of the input string. Any region 0437 * that has been specified is reset. 0438 * 0439 * If startIndex == -1 the match must cover the input region, or the entire 0440 * input string if no region has been set. This directly corresponds to 0441 * Matcher.matches() in Java 0442 * 0443 * @param regexp The compiled regular expression. 0444 * @param startIndex The input string (native) index at which to begin matching, or -1 0445 * to match the input Region. 
0446 * @param status Receives errors detected by this function. 0447 * @return true if there is a match 0448 * @stable ICU 3.0 0449 */ 0450 U_CAPI UBool U_EXPORT2 0451 uregex_matches(URegularExpression *regexp, 0452 int32_t startIndex, 0453 UErrorCode *status); 0454 0455 /** 0456 * 64bit version of uregex_matches. 0457 * Attempts to match the input string against the pattern. 0458 * To succeed, the match must extend to the end of the string, 0459 * or cover the complete match region. 0460 * 0461 * If startIndex >= zero the match operation starts at the specified 0462 * index and must extend to the end of the input string. Any region 0463 * that has been specified is reset. 0464 * 0465 * If startIndex == -1 the match must cover the input region, or the entire 0466 * input string if no region has been set. This directly corresponds to 0467 * Matcher.matches() in Java 0468 * 0469 * @param regexp The compiled regular expression. 0470 * @param startIndex The input string (native) index at which to begin matching, or -1 0471 * to match the input Region. 0472 * @param status Receives errors detected by this function. 0473 * @return true if there is a match 0474 * @stable ICU 4.6 0475 */ 0476 U_CAPI UBool U_EXPORT2 0477 uregex_matches64(URegularExpression *regexp, 0478 int64_t startIndex, 0479 UErrorCode *status); 0480 0481 /** 0482 * Attempts to match the input string, starting from the specified index, against the pattern. 0483 * The match may be of any length, and is not required to extend to the end 0484 * of the input string. Contrast with uregex_matches(). 0485 * 0486 * <p>If startIndex is >= 0 any input region that was set for this 0487 * URegularExpression is reset before the operation begins. 0488 * 0489 * <p>If the specified starting index == -1 the match begins at the start of the input 0490 * region, or at the start of the full string if no region has been specified. 0491 * This corresponds directly with Matcher.lookingAt() in Java. 
0492 * 0493 * <p>If the match succeeds then more information can be obtained via the 0494 * <code>uregex_start()</code>, <code>uregex_end()</code>, 0495 * and <code>uregex_group()</code> functions.</p> 0496 * 0497 * @param regexp The compiled regular expression. 0498 * @param startIndex The input string (native) index at which to begin matching, or 0499 * -1 to match the Input Region 0500 * @param status A reference to a UErrorCode to receive any errors. 0501 * @return true if there is a match. 0502 * @stable ICU 3.0 0503 */ 0504 U_CAPI UBool U_EXPORT2 0505 uregex_lookingAt(URegularExpression *regexp, 0506 int32_t startIndex, 0507 UErrorCode *status); 0508 0509 /** 0510 * 64bit version of uregex_lookingAt. 0511 * Attempts to match the input string, starting from the specified index, against the pattern. 0512 * The match may be of any length, and is not required to extend to the end 0513 * of the input string. Contrast with uregex_matches(). 0514 * 0515 * <p>If startIndex is >= 0 any input region that was set for this 0516 * URegularExpression is reset before the operation begins. 0517 * 0518 * <p>If the specified starting index == -1 the match begins at the start of the input 0519 * region, or at the start of the full string if no region has been specified. 0520 * This corresponds directly with Matcher.lookingAt() in Java. 0521 * 0522 * <p>If the match succeeds then more information can be obtained via the 0523 * <code>uregex_start()</code>, <code>uregex_end()</code>, 0524 * and <code>uregex_group()</code> functions.</p> 0525 * 0526 * @param regexp The compiled regular expression. 0527 * @param startIndex The input string (native) index at which to begin matching, or 0528 * -1 to match the Input Region 0529 * @param status A reference to a UErrorCode to receive any errors. 0530 * @return true if there is a match. 
0531 * @stable ICU 4.6 0532 */ 0533 U_CAPI UBool U_EXPORT2 0534 uregex_lookingAt64(URegularExpression *regexp, 0535 int64_t startIndex, 0536 UErrorCode *status); 0537 0538 /** 0539 * Find the first matching substring of the input string that matches the pattern. 0540 * If startIndex is >= zero the search for a match begins at the specified index, 0541 * and any match region is reset. This corresponds directly with 0542 * Matcher.find(startIndex) in Java. 0543 * 0544 * If startIndex == -1 the search begins at the start of the input region, 0545 * or at the start of the full string if no region has been specified. 0546 * 0547 * If a match is found, <code>uregex_start(), uregex_end()</code>, and 0548 * <code>uregex_group()</code> will provide more information regarding the match. 0549 * 0550 * @param regexp The compiled regular expression. 0551 * @param startIndex The position (native) in the input string to begin the search, or 0552 * -1 to search within the Input Region. 0553 * @param status A reference to a UErrorCode to receive any errors. 0554 * @return true if a match is found. 0555 * @stable ICU 3.0 0556 */ 0557 U_CAPI UBool U_EXPORT2 0558 uregex_find(URegularExpression *regexp, 0559 int32_t startIndex, 0560 UErrorCode *status); 0561 0562 /** 0563 * 64bit version of uregex_find. 0564 * Find the first matching substring of the input string that matches the pattern. 0565 * If startIndex is >= zero the search for a match begins at the specified index, 0566 * and any match region is reset. This corresponds directly with 0567 * Matcher.find(startIndex) in Java. 0568 * 0569 * If startIndex == -1 the search begins at the start of the input region, 0570 * or at the start of the full string if no region has been specified. 0571 * 0572 * If a match is found, <code>uregex_start(), uregex_end()</code>, and 0573 * <code>uregex_group()</code> will provide more information regarding the match. 0574 * 0575 * @param regexp The compiled regular expression. 
0576 * @param startIndex The position (native) in the input string to begin the search, or 0577 * -1 to search within the Input Region. 0578 * @param status A reference to a UErrorCode to receive any errors. 0579 * @return true if a match is found. 0580 * @stable ICU 4.6 0581 */ 0582 U_CAPI UBool U_EXPORT2 0583 uregex_find64(URegularExpression *regexp, 0584 int64_t startIndex, 0585 UErrorCode *status); 0586 0587 /** 0588 * Find the next pattern match in the input string. Begin searching 0589 * the input at the location following the end of the previous match, 0590 * or at the start of the string (or region) if there is no 0591 * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and 0592 * <code>uregex_group()</code> will provide more information regarding the match. 0593 * 0594 * @param regexp The compiled regular expression. 0595 * @param status A reference to a UErrorCode to receive any errors. 0596 * @return true if a match is found. 0597 * @see uregex_reset 0598 * @stable ICU 3.0 0599 */ 0600 U_CAPI UBool U_EXPORT2 0601 uregex_findNext(URegularExpression *regexp, 0602 UErrorCode *status); 0603 0604 /** 0605 * Get the number of capturing groups in this regular expression's pattern. 0606 * @param regexp The compiled regular expression. 0607 * @param status A reference to a UErrorCode to receive any errors. 0608 * @return the number of capture groups 0609 * @stable ICU 3.0 0610 */ 0611 U_CAPI int32_t U_EXPORT2 0612 uregex_groupCount(URegularExpression *regexp, 0613 UErrorCode *status); 0614 0615 /** 0616 * Get the group number corresponding to a named capture group. 0617 * The returned number can be used with any function that accesses 0618 * capture groups by number. 0619 * 0620 * The function returns an error status if the specified name does not 0621 * appear in the pattern. 0622 * 0623 * @param regexp The compiled regular expression. 0624 * @param groupName The capture group name. 
0625 * @param nameLength The length of the name, or -1 if the name is a 0626 * nul-terminated string. 0627 * @param status A pointer to a UErrorCode to receive any errors. 0628 * 0629 * @stable ICU 55 0630 */ 0631 U_CAPI int32_t U_EXPORT2 0632 uregex_groupNumberFromName(URegularExpression *regexp, 0633 const UChar *groupName, 0634 int32_t nameLength, 0635 UErrorCode *status); 0636 0637 0638 /** 0639 * Get the group number corresponding to a named capture group. 0640 * The returned number can be used with any function that accesses 0641 * capture groups by number. 0642 * 0643 * The function returns an error status if the specified name does not 0644 * appear in the pattern. 0645 * 0646 * @param regexp The compiled regular expression. 0647 * @param groupName The capture group name, 0648 * platform invariant characters only. 0649 * @param nameLength The length of the name, or -1 if the name is 0650 * nul-terminated. 0651 * @param status A pointer to a UErrorCode to receive any errors. 0652 * 0653 * @stable ICU 55 0654 */ 0655 U_CAPI int32_t U_EXPORT2 0656 uregex_groupNumberFromCName(URegularExpression *regexp, 0657 const char *groupName, 0658 int32_t nameLength, 0659 UErrorCode *status); 0660 0661 /** Extract the string for the specified matching expression or subexpression. 0662 * Group #0 is the complete string of matched text. 0663 * Group #1 is the text matched by the first set of capturing parentheses. 0664 * 0665 * @param regexp The compiled regular expression. 0666 * @param groupNum The capture group to extract. Group 0 is the complete 0667 * match. The value of this parameter must be 0668 * less than or equal to the number of capture groups in 0669 * the pattern. 0670 * @param dest Buffer to receive the matching string data 0671 * @param destCapacity Capacity of the dest buffer. 0672 * @param status A reference to a UErrorCode to receive any errors. 0673 * @return Length of matching data, 0674 * or -1 if no applicable match. 
0675 * @stable ICU 3.0 0676 */ 0677 U_CAPI int32_t U_EXPORT2 0678 uregex_group(URegularExpression *regexp, 0679 int32_t groupNum, 0680 UChar *dest, 0681 int32_t destCapacity, 0682 UErrorCode *status); 0683 0684 /** Returns a shallow immutable clone of the entire input string with the current index set 0685 * to the beginning of the requested capture group. The capture group length is also 0686 * returned via groupLength. 0687 * Group #0 is the complete string of matched text. 0688 * Group #1 is the text matched by the first set of capturing parentheses. 0689 * 0690 * @param regexp The compiled regular expression. 0691 * @param groupNum The capture group to extract. Group 0 is the complete 0692 * match. The value of this parameter must be 0693 * less than or equal to the number of capture groups in 0694 * the pattern. 0695 * @param dest A mutable UText in which to store the current input. 0696 * If NULL, a new UText will be created as an immutable shallow clone 0697 * of the entire input string. 0698 * @param groupLength The group length of the desired capture group. Output parameter. 0699 * @param status A reference to a UErrorCode to receive any errors. 0700 * @return The subject text currently associated with this regular expression. 0701 * If a pre-allocated UText was provided, it will always be used and returned. 0702 0703 * 0704 * @stable ICU 4.6 0705 */ 0706 U_CAPI UText * U_EXPORT2 0707 uregex_groupUText(URegularExpression *regexp, 0708 int32_t groupNum, 0709 UText *dest, 0710 int64_t *groupLength, 0711 UErrorCode *status); 0712 0713 /** 0714 * Returns the index in the input string of the start of the text matched by the 0715 * specified capture group during the previous match operation. Return -1 if 0716 * the capture group was not part of the last match. 0717 * Group #0 refers to the complete range of matched text. 0718 * Group #1 refers to the text matched by the first set of capturing parentheses. 
0719 * 0720 * @param regexp The compiled regular expression. 0721 * @param groupNum The capture group number 0722 * @param status A reference to a UErrorCode to receive any errors. 0723 * @return the starting (native) position in the input of the text matched 0724 * by the specified group. 0725 * @stable ICU 3.0 0726 */ 0727 U_CAPI int32_t U_EXPORT2 0728 uregex_start(URegularExpression *regexp, 0729 int32_t groupNum, 0730 UErrorCode *status); 0731 0732 /** 0733 * 64bit version of uregex_start. 0734 * Returns the index in the input string of the start of the text matched by the 0735 * specified capture group during the previous match operation. Return -1 if 0736 * the capture group was not part of the last match. 0737 * Group #0 refers to the complete range of matched text. 0738 * Group #1 refers to the text matched by the first set of capturing parentheses. 0739 * 0740 * @param regexp The compiled regular expression. 0741 * @param groupNum The capture group number 0742 * @param status A reference to a UErrorCode to receive any errors. 0743 * @return the starting (native) position in the input of the text matched 0744 * by the specified group. 0745 * @stable ICU 4.6 0746 */ 0747 U_CAPI int64_t U_EXPORT2 0748 uregex_start64(URegularExpression *regexp, 0749 int32_t groupNum, 0750 UErrorCode *status); 0751 0752 /** 0753 * Returns the index in the input string of the position following the end 0754 * of the text matched by the specified capture group. 0755 * Return -1 if the capture group was not part of the last match. 0756 * Group #0 refers to the complete range of matched text. 0757 * Group #1 refers to the text matched by the first set of capturing parentheses. 0758 * 0759 * @param regexp The compiled regular expression. 0760 * @param groupNum The capture group number 0761 * @param status A reference to a UErrorCode to receive any errors. 0762 * @return the (native) index of the position following the last matched character. 
0763 * @stable ICU 3.0 0764 */ 0765 U_CAPI int32_t U_EXPORT2 0766 uregex_end(URegularExpression *regexp, 0767 int32_t groupNum, 0768 UErrorCode *status); 0769 0770 /** 0771 * 64bit version of uregex_end. 0772 * Returns the index in the input string of the position following the end 0773 * of the text matched by the specified capture group. 0774 * Return -1 if the capture group was not part of the last match. 0775 * Group #0 refers to the complete range of matched text. 0776 * Group #1 refers to the text matched by the first set of capturing parentheses. 0777 * 0778 * @param regexp The compiled regular expression. 0779 * @param groupNum The capture group number 0780 * @param status A reference to a UErrorCode to receive any errors. 0781 * @return the (native) index of the position following the last matched character. 0782 * @stable ICU 4.6 0783 */ 0784 U_CAPI int64_t U_EXPORT2 0785 uregex_end64(URegularExpression *regexp, 0786 int32_t groupNum, 0787 UErrorCode *status); 0788 0789 /** 0790 * Reset any saved state from the previous match. Has the effect of 0791 * causing uregex_findNext to begin at the specified index, and causing 0792 * uregex_start(), uregex_end() and uregex_group() to return an error 0793 * indicating that there is no match information available. Clears any 0794 * match region that may have been set. 0795 * 0796 * @param regexp The compiled regular expression. 0797 * @param index The position (native) in the text at which a 0798 * uregex_findNext() should begin searching. 0799 * @param status A reference to a UErrorCode to receive any errors. 0800 * @stable ICU 3.0 0801 */ 0802 U_CAPI void U_EXPORT2 0803 uregex_reset(URegularExpression *regexp, 0804 int32_t index, 0805 UErrorCode *status); 0806 0807 /** 0808 * 64bit version of uregex_reset. 0809 * Reset any saved state from the previous match. 
Has the effect of 0810 * causing uregex_findNext to begin at the specified index, and causing 0811 * uregex_start(), uregex_end() and uregex_group() to return an error 0812 * indicating that there is no match information available. Clears any 0813 * match region that may have been set. 0814 * 0815 * @param regexp The compiled regular expression. 0816 * @param index The position (native) in the text at which a 0817 * uregex_findNext() should begin searching. 0818 * @param status A reference to a UErrorCode to receive any errors. 0819 * @stable ICU 4.6 0820 */ 0821 U_CAPI void U_EXPORT2 0822 uregex_reset64(URegularExpression *regexp, 0823 int64_t index, 0824 UErrorCode *status); 0825 0826 /** 0827 * Sets the limits of the matching region for this URegularExpression. 0828 * The region is the part of the input string that will be considered when matching. 0829 * Invoking this method resets any saved state from the previous match, 0830 * then sets the region to start at the index specified by the start parameter 0831 * and end at the index specified by the end parameter. 0832 * 0833 * Depending on the transparency and anchoring being used (see useTransparentBounds 0834 * and useAnchoringBounds), certain constructs such as anchors may behave differently 0835 * at or around the boundaries of the region 0836 * 0837 * The function will fail if start is greater than limit, or if either index 0838 * is less than zero or greater than the length of the string being matched. 0839 * 0840 * @param regexp The compiled regular expression. 0841 * @param regionStart The (native) index to begin searches at. 0842 * @param regionLimit The (native) index to end searches at (exclusive). 0843 * @param status A pointer to a UErrorCode to receive any errors. 
0844 * @stable ICU 4.0 0845 */ 0846 U_CAPI void U_EXPORT2 0847 uregex_setRegion(URegularExpression *regexp, 0848 int32_t regionStart, 0849 int32_t regionLimit, 0850 UErrorCode *status); 0851 0852 /** 0853 * 64bit version of uregex_setRegion. 0854 * Sets the limits of the matching region for this URegularExpression. 0855 * The region is the part of the input string that will be considered when matching. 0856 * Invoking this method resets any saved state from the previous match, 0857 * then sets the region to start at the index specified by the start parameter 0858 * and end at the index specified by the end parameter. 0859 * 0860 * Depending on the transparency and anchoring being used (see useTransparentBounds 0861 * and useAnchoringBounds), certain constructs such as anchors may behave differently 0862 * at or around the boundaries of the region. 0863 * 0864 * The function will fail if start is greater than limit, or if either index 0865 * is less than zero or greater than the length of the string being matched. 0866 * 0867 * @param regexp The compiled regular expression. 0868 * @param regionStart The (native) index to begin searches at. 0869 * @param regionLimit The (native) index to end searches at (exclusive). 0870 * @param status A pointer to a UErrorCode to receive any errors. 0871 * @stable ICU 4.6 0872 */ 0873 U_CAPI void U_EXPORT2 0874 uregex_setRegion64(URegularExpression *regexp, 0875 int64_t regionStart, 0876 int64_t regionLimit, 0877 UErrorCode *status); 0878 0879 /** 0880 * Set the matching region and the starting index for subsequent matches 0881 * in a single operation. 0882 * This is useful because the usual function for setting the starting 0883 * index, uregex_reset(), also resets any region limits. 0884 * 0885 * @param regexp The compiled regular expression. 0886 * @param regionStart The (native) index to begin searches at. 0887 * @param regionLimit The (native) index to end searches at (exclusive).
0888 * @param startIndex The index in the input text at which the next 0889 * match operation should begin. 0890 * @param status A pointer to a UErrorCode to receive any errors. 0891 * @stable ICU 4.6 0892 */ 0893 U_CAPI void U_EXPORT2 0894 uregex_setRegionAndStart(URegularExpression *regexp, 0895 int64_t regionStart, 0896 int64_t regionLimit, 0897 int64_t startIndex, 0898 UErrorCode *status); 0899 0900 /** 0901 * Reports the start index of the matching region. Any matches found are limited to 0902 * the region bounded by regionStart (inclusive) and regionEnd (exclusive). 0903 * 0904 * @param regexp The compiled regular expression. 0905 * @param status A pointer to a UErrorCode to receive any errors. 0906 * @return The starting (native) index of this matcher's region. 0907 * @stable ICU 4.0 0908 */ 0909 U_CAPI int32_t U_EXPORT2 0910 uregex_regionStart(const URegularExpression *regexp, 0911 UErrorCode *status); 0912 0913 /** 0914 * 64bit version of uregex_regionStart. 0915 * Reports the start index of the matching region. Any matches found are limited to 0916 * the region bounded by regionStart (inclusive) and regionEnd (exclusive). 0917 * 0918 * @param regexp The compiled regular expression. 0919 * @param status A pointer to a UErrorCode to receive any errors. 0920 * @return The starting (native) index of this matcher's region. 0921 * @stable ICU 4.6 0922 */ 0923 U_CAPI int64_t U_EXPORT2 0924 uregex_regionStart64(const URegularExpression *regexp, 0925 UErrorCode *status); 0926 0927 /** 0928 * Reports the end index (exclusive) of the matching region for this URegularExpression. 0929 * Any matches found are limited to the region bounded by regionStart (inclusive) 0930 * and regionEnd (exclusive). 0931 * 0932 * @param regexp The compiled regular expression. 0933 * @param status A pointer to a UErrorCode to receive any errors. 0934 * @return The ending point (native) of this matcher's region.
0935 * @stable ICU 4.0 0936 */ 0937 U_CAPI int32_t U_EXPORT2 0938 uregex_regionEnd(const URegularExpression *regexp, 0939 UErrorCode *status); 0940 0941 /** 0942 * 64bit version of uregex_regionEnd. 0943 * Reports the end index (exclusive) of the matching region for this URegularExpression. 0944 * Any matches found are limited to the region bounded by regionStart (inclusive) 0945 * and regionEnd (exclusive). 0946 * 0947 * @param regexp The compiled regular expression. 0948 * @param status A pointer to a UErrorCode to receive any errors. 0949 * @return The ending point (native) of this matcher's region. 0950 * @stable ICU 4.6 0951 */ 0952 U_CAPI int64_t U_EXPORT2 0953 uregex_regionEnd64(const URegularExpression *regexp, 0954 UErrorCode *status); 0955 0956 /** 0957 * Queries the transparency of region bounds for this URegularExpression. 0958 * See useTransparentBounds for a description of transparent and opaque bounds. 0959 * By default, matching boundaries are opaque. 0960 * 0961 * @param regexp The compiled regular expression. 0962 * @param status A pointer to a UErrorCode to receive any errors. 0963 * @return true if this matcher is using transparent bounds, false if it is using opaque bounds. 0964 * @stable ICU 4.0 0965 */ 0966 U_CAPI UBool U_EXPORT2 0967 uregex_hasTransparentBounds(const URegularExpression *regexp, 0968 UErrorCode *status); 0969 0970 0971 /** 0972 * Sets the transparency of region bounds for this URegularExpression. 0973 * Invoking this function with an argument of true will set matches to use transparent bounds. 0974 * If the boolean argument is false, then opaque bounds will be used. 0975 * 0976 * Using transparent bounds, the boundaries of the matching region are transparent 0977 * to lookahead, lookbehind, and boundary matching constructs. Those constructs can 0978 * see text beyond the boundaries of the region while checking for a match.
0979 * 0980 * With opaque bounds, no text outside of the matching region is visible to lookahead, 0981 * lookbehind, and boundary matching constructs. 0982 * 0983 * By default, opaque bounds are used. 0984 * 0985 * @param regexp The compiled regular expression. 0986 * @param b true for transparent bounds; false for opaque bounds 0987 * @param status A pointer to a UErrorCode to receive any errors. 0988 * @stable ICU 4.0 0989 **/ 0990 U_CAPI void U_EXPORT2 0991 uregex_useTransparentBounds(URegularExpression *regexp, 0992 UBool b, 0993 UErrorCode *status); 0994 0995 0996 /** 0997 * Return true if this URegularExpression is using anchoring bounds. 0998 * By default, anchoring region bounds are used. 0999 * 1000 * @param regexp The compiled regular expression. 1001 * @param status A pointer to a UErrorCode to receive any errors. 1002 * @return true if this matcher is using anchoring bounds. 1003 * @stable ICU 4.0 1004 */ 1005 U_CAPI UBool U_EXPORT2 1006 uregex_hasAnchoringBounds(const URegularExpression *regexp, 1007 UErrorCode *status); 1008 1009 1010 /** 1011 * Set whether this URegularExpression is using Anchoring Bounds for its region. 1012 * With anchoring bounds, pattern anchors such as ^ and $ will match at the start 1013 * and end of the region. Without Anchoring Bounds, anchors will only match at 1014 * the positions they would in the complete text. 1015 * 1016 * Anchoring Bounds are the default for regions. 1017 * 1018 * @param regexp The compiled regular expression. 1019 * @param b true if to enable anchoring bounds; false to disable them. 1020 * @param status A pointer to a UErrorCode to receive any errors. 1021 * @stable ICU 4.0 1022 */ 1023 U_CAPI void U_EXPORT2 1024 uregex_useAnchoringBounds(URegularExpression *regexp, 1025 UBool b, 1026 UErrorCode *status); 1027 1028 /** 1029 * Return true if the most recent matching operation touched the 1030 * end of the text being processed. 
In this case, additional input text could 1031 * change the results of that match. 1032 * 1033 * @param regexp The compiled regular expression. 1034 * @param status A pointer to a UErrorCode to receive any errors. 1035 * @return true if the most recent match hit the end of input 1036 * @stable ICU 4.0 1037 */ 1038 U_CAPI UBool U_EXPORT2 1039 uregex_hitEnd(const URegularExpression *regexp, 1040 UErrorCode *status); 1041 1042 /** 1043 * Return true if the most recent match succeeded and additional input could cause 1044 * it to fail. If this function returns false and a match was found, then more input 1045 * might change the match but the match won't be lost. If a match was not found, 1046 * then requireEnd has no meaning. 1047 * 1048 * @param regexp The compiled regular expression. 1049 * @param status A pointer to a UErrorCode to receive any errors. 1050 * @return true if more input could cause the most recent match to no longer match. 1051 * @stable ICU 4.0 1052 */ 1053 U_CAPI UBool U_EXPORT2 1054 uregex_requireEnd(const URegularExpression *regexp, 1055 UErrorCode *status); 1056 1057 1058 1059 1060 1061 /** 1062 * Replaces every substring of the input that matches the pattern 1063 * with the given replacement string. This is a convenience function that 1064 * provides a complete find-and-replace-all operation. 1065 * 1066 * This method scans the input string looking for matches of the pattern. 1067 * Input that is not part of any match is copied unchanged to the 1068 * destination buffer. Matched regions are replaced in the output 1069 * buffer by the replacement string. The replacement string may contain 1070 * references to capture groups; these take the form of $1, $2, etc. 1071 * 1072 * @param regexp The compiled regular expression. 1073 * @param replacementText A string containing the replacement text. 1074 * @param replacementLength The length of the replacement string, or 1075 * -1 if it is NUL terminated.
1076 * @param destBuf A (UChar *) buffer that will receive the result. 1077 * @param destCapacity The capacity of the destination buffer. 1078 * @param status A reference to a UErrorCode to receive any errors. 1079 * @return The length of the string resulting from the find 1080 * and replace operation. In the event that the 1081 * destination capacity is inadequate, the return value 1082 * is still the full length of the untruncated string. 1083 * @stable ICU 3.0 1084 */ 1085 U_CAPI int32_t U_EXPORT2 1086 uregex_replaceAll(URegularExpression *regexp, 1087 const UChar *replacementText, 1088 int32_t replacementLength, 1089 UChar *destBuf, 1090 int32_t destCapacity, 1091 UErrorCode *status); 1092 1093 /** 1094 * Replaces every substring of the input that matches the pattern 1095 * with the given replacement string. This is a convenience function that 1096 * provides a complete find-and-replace-all operation. 1097 * 1098 * This method scans the input string looking for matches of the pattern. 1099 * Input that is not part of any match is copied unchanged to the 1100 * destination buffer. Matched regions are replaced in the output 1101 * buffer by the replacement string. The replacement string may contain 1102 * references to capture groups; these take the form of $1, $2, etc. 1103 * 1104 * @param regexp The compiled regular expression. 1105 * @param replacement A string containing the replacement text. 1106 * @param dest A mutable UText that will receive the result. 1107 * If NULL, a new UText will be created (which may not be mutable). 1108 * @param status A reference to a UErrorCode to receive any errors. 1109 * @return A UText containing the results of the find and replace. 1110 * If a pre-allocated UText was provided, it will always be used and returned. 
1111 * 1112 * @stable ICU 4.6 1113 */ 1114 U_CAPI UText * U_EXPORT2 1115 uregex_replaceAllUText(URegularExpression *regexp, 1116 UText *replacement, 1117 UText *dest, 1118 UErrorCode *status); 1119 1120 /** 1121 * Replaces the first substring of the input that matches the pattern 1122 * with the given replacement string. This is a convenience function that 1123 * provides a complete find-and-replace operation. 1124 * 1125 * This method scans the input string looking for a match of the pattern. 1126 * All input that is not part of the match is copied unchanged to the 1127 * destination buffer. The matched region is replaced in the output 1128 * buffer by the replacement string. The replacement string may contain 1129 * references to capture groups; these take the form of $1, $2, etc. 1130 * 1131 * @param regexp The compiled regular expression. 1132 * @param replacementText A string containing the replacement text. 1133 * @param replacementLength The length of the replacement string, or 1134 * -1 if it is NUL terminated. 1135 * @param destBuf A (UChar *) buffer that will receive the result. 1136 * @param destCapacity The capacity of the destination buffer. 1137 * @param status a reference to a UErrorCode to receive any errors. 1138 * @return The length of the string resulting from the find 1139 * and replace operation. In the event that the 1140 * destination capacity is inadequate, the return value 1141 * is still the full length of the untruncated string. 1142 * @stable ICU 3.0 1143 */ 1144 U_CAPI int32_t U_EXPORT2 1145 uregex_replaceFirst(URegularExpression *regexp, 1146 const UChar *replacementText, 1147 int32_t replacementLength, 1148 UChar *destBuf, 1149 int32_t destCapacity, 1150 UErrorCode *status); 1151 1152 /** 1153 * Replaces the first substring of the input that matches the pattern 1154 * with the given replacement string. This is a convenience function that 1155 * provides a complete find-and-replace operation. 
1156 * 1157 * This method scans the input string looking for a match of the pattern. 1158 * All input that is not part of the match is copied unchanged to the 1159 * destination buffer. The matched region is replaced in the output 1160 * buffer by the replacement string. The replacement string may contain 1161 * references to capture groups; these take the form of $1, $2, etc. 1162 * 1163 * @param regexp The compiled regular expression. 1164 * @param replacement A string containing the replacement text. 1165 * @param dest A mutable UText that will receive the result. 1166 * If NULL, a new UText will be created (which may not be mutable). 1167 * @param status A reference to a UErrorCode to receive any errors. 1168 * @return A UText containing the results of the find and replace. 1169 * If a pre-allocated UText was provided, it will always be used and returned. 1170 * 1171 * @stable ICU 4.6 1172 */ 1173 U_CAPI UText * U_EXPORT2 1174 uregex_replaceFirstUText(URegularExpression *regexp, 1175 UText *replacement, 1176 UText *dest, 1177 UErrorCode *status); 1178 1179 /** 1180 * Implements a replace operation intended to be used as part of an 1181 * incremental find-and-replace. 1182 * 1183 * <p>The input string, starting from the end of the previous match and ending at 1184 * the start of the current match, is appended to the destination string. Then the 1185 * replacement string is appended to the output string, 1186 * including handling any substitutions of captured text.</p> 1187 * 1188 * <p>A note on preflight computation of buffersize and error handling: 1189 * Calls to uregex_appendReplacement() and uregex_appendTail() are 1190 * designed to be chained, one after another, with the destination 1191 * buffer pointer and buffer capacity updated after each in preparation 1192 * for the next. If the destination buffer is exhausted partway through such a 1193 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned.
Normal 1194 * ICU conventions are for a function to perform no action if it is 1195 * called with an error status, but for this one case, uregex_appendReplacement() 1196 * will operate normally so that buffer size computations will complete 1197 * correctly.</p> 1198 * 1199 * <p>For simple, prepackaged, non-incremental find-and-replace 1200 * operations, see replaceFirst() or replaceAll().</p> 1201 * 1202 * @param regexp The regular expression object. 1203 * @param replacementText The string that will replace the matched portion of the 1204 * input string as it is copied to the destination buffer. 1205 * The replacement text may contain references ($1, for 1206 * example) to capture groups from the match. 1207 * @param replacementLength The length of the replacement text string, 1208 * or -1 if the string is NUL terminated. 1209 * @param destBuf The buffer into which the results of the 1210 * find-and-replace are placed. On return, this pointer 1211 * will be updated to refer to the beginning of the 1212 * unused portion of buffer, leaving it in position for 1213 * a subsequent call to this function. 1214 * @param destCapacity The size of the output buffer. On return, this 1215 * parameter will be updated to reflect the space remaining 1216 * unused in the output buffer. 1217 * @param status A reference to a UErrorCode to receive any errors. 1218 * @return The length of the result string. In the event that 1219 * destCapacity is inadequate, the full length of the 1220 * untruncated output string is returned. 1221 * 1222 * @stable ICU 3.0 1223 * 1224 */ 1225 U_CAPI int32_t U_EXPORT2 1226 uregex_appendReplacement(URegularExpression *regexp, 1227 const UChar *replacementText, 1228 int32_t replacementLength, 1229 UChar **destBuf, 1230 int32_t *destCapacity, 1231 UErrorCode *status); 1232 1233 /** 1234 * Implements a replace operation intended to be used as part of an 1235 * incremental find-and-replace.
1236 * 1237 * <p>The input string, starting from the end of the previous match and ending at 1238 * the start of the current match, is appended to the destination string. Then the 1239 * replacement string is appended to the output string, 1240 * including handling any substitutions of captured text.</p> 1241 * 1242 * <p>For simple, prepackaged, non-incremental find-and-replace 1243 * operations, see replaceFirst() or replaceAll().</p> 1244 * 1245 * @param regexp The regular expression object. 1246 * @param replacementText The string that will replace the matched portion of the 1247 * input string as it is copied to the destination buffer. 1248 * The replacement text may contain references ($1, for 1249 * example) to capture groups from the match. 1250 * @param dest A mutable UText that will receive the result. Must not be NULL. 1251 * @param status A reference to a UErrorCode to receive any errors. 1252 * 1253 * @stable ICU 4.6 1254 */ 1255 U_CAPI void U_EXPORT2 1256 uregex_appendReplacementUText(URegularExpression *regexp, 1257 UText *replacementText, 1258 UText *dest, 1259 UErrorCode *status); 1260 1261 /** 1262 * As the final step in a find-and-replace operation, append the remainder 1263 * of the input string, starting at the position following the last match, 1264 * to the destination string. <code>uregex_appendTail()</code> is intended 1265 * to be invoked after one or more invocations of the 1266 * <code>uregex_appendReplacement()</code> function. 1267 * 1268 * @param regexp The regular expression object. This is needed to 1269 * obtain the input string and the position 1270 * of the last match within it. 1271 * @param destBuf The buffer in which the results of the 1272 * find-and-replace are placed. On return, the pointer 1273 * will be updated to refer to the beginning of the 1274 * unused portion of buffer.
1275 * @param destCapacity The size of the output buffer. On return, this 1276 * value will be updated to reflect the space remaining 1277 * unused in the output buffer. 1278 * @param status A reference to a UErrorCode to receive any errors. 1279 * @return The length of the result string. In the event that 1280 * destCapacity is inadequate, the full length of the 1281 * untruncated output string is returned. 1282 * 1283 * @stable ICU 3.0 1284 */ 1285 U_CAPI int32_t U_EXPORT2 1286 uregex_appendTail(URegularExpression *regexp, 1287 UChar **destBuf, 1288 int32_t *destCapacity, 1289 UErrorCode *status); 1290 1291 /** 1292 * As the final step in a find-and-replace operation, append the remainder 1293 * of the input string, starting at the position following the last match, 1294 * to the destination string. <code>uregex_appendTailUText()</code> is intended 1295 * to be invoked after one or more invocations of the 1296 * <code>uregex_appendReplacementUText()</code> function. 1297 * 1298 * @param regexp The regular expression object. This is needed to 1299 * obtain the input string and the position 1300 * of the last match within it. 1301 * @param dest A mutable UText that will receive the result. Must not be NULL. 1302 * 1303 * @param status Error code 1304 * 1305 * @return The destination UText. 1306 * 1307 * @stable ICU 4.6 1308 */ 1309 U_CAPI UText * U_EXPORT2 1310 uregex_appendTailUText(URegularExpression *regexp, 1311 UText *dest, 1312 UErrorCode *status); 1313 1314 /** 1315 * Split a string into fields. Somewhat like split() from Perl. 1316 * The pattern matches identify delimiters that separate the input 1317 * into fields. The input data between the matches becomes the 1318 * fields themselves. 1319 * 1320 * Each of the fields is copied from the input string to the destination 1321 * buffer, and NUL terminated. The position of each field within 1322 * the destination buffer is returned in the destFields array.
1323 * 1324 * If the delimiter pattern includes capture groups, the captured text will 1325 * also appear in the destination array of output strings, interspersed 1326 * with the fields. This is similar to Perl, but differs from Java, 1327 * which ignores the presence of capture groups in the pattern. 1328 * 1329 * Trailing empty fields will always be returned, assuming sufficient 1330 * destination capacity. This differs from the default behavior for Java 1331 * and Perl where trailing empty fields are not returned. 1332 * 1333 * The number of strings produced by the split operation is returned. 1334 * This count includes the strings from capture groups in the delimiter pattern. 1335 * This behavior differs from Java, which ignores capture groups. 1336 * 1337 * @param regexp The compiled regular expression. 1338 * @param destBuf A (UChar *) buffer to receive the fields that 1339 * are extracted from the input string. These 1340 * field pointers will refer to positions within the 1341 * destination buffer supplied by the caller. Any 1342 * extra positions within the destFields array will be 1343 * set to NULL. 1344 * @param destCapacity The capacity of the destBuf. 1345 * @param requiredCapacity The actual capacity required of the destBuf. 1346 * If destCapacity is too small, requiredCapacity will return 1347 * the total capacity required to hold all of the output, and 1348 * a U_BUFFER_OVERFLOW_ERROR will be returned. 1349 * @param destFields An array to be filled with the position of each 1350 * of the extracted fields within destBuf. 1351 * @param destFieldsCapacity The number of elements in the destFields array. 1352 * If the number of fields found is less than destFieldsCapacity, 1353 * the extra destFields elements are set to zero. 
1354 * If destFieldsCapacity is too small, the trailing part of the 1355 * input, including any field delimiters, is treated as if it 1356 * were the last field - it is copied to the destBuf, and 1357 * its position in the destBuf is stored in the last element 1358 * of destFields. This behavior mimics that of Perl. It is not 1359 * an error condition, and no error status is returned when all destField 1360 * positions are used. 1361 * @param status A reference to a UErrorCode to receive any errors. 1362 * @return The number of fields into which the input string was split. 1363 * @stable ICU 3.0 1364 */ 1365 U_CAPI int32_t U_EXPORT2 1366 uregex_split( URegularExpression *regexp, 1367 UChar *destBuf, 1368 int32_t destCapacity, 1369 int32_t *requiredCapacity, 1370 UChar *destFields[], 1371 int32_t destFieldsCapacity, 1372 UErrorCode *status); 1373 1374 /** 1375 * Split a string into fields. Somewhat like split() from Perl. 1376 * The pattern matches identify delimiters that separate the input 1377 * into fields. The input data between the matches becomes the 1378 * fields themselves. 1379 * <p> 1380 * The behavior of this function is not very closely aligned with uregex_split(); 1381 * instead, it is based on (and implemented directly on top of) the C++ split method. 1382 * 1383 * @param regexp The compiled regular expression. 1384 * @param destFields An array of mutable UText structs to receive the results of the split. 1385 * If a field is NULL, a new UText is allocated to contain the results for 1386 * that field. This new UText is not guaranteed to be mutable. 1387 * @param destFieldsCapacity The number of elements in the destination array. 1388 * If the number of fields found is less than destFieldsCapacity, the 1389 * extra strings in the destination array are not altered.
1390 * If the number of destination strings is less than the number 1391 * of fields, the trailing part of the input string, including any 1392 * field delimiters, is placed in the last destination string. 1393 * This behavior mimics that of Perl. It is not an error condition, and no 1394 * error status is returned when all destField positions are used. 1395 * @param status A reference to a UErrorCode to receive any errors. 1396 * @return The number of fields into which the input string was split. 1397 * 1398 * @stable ICU 4.6 1399 */ 1400 U_CAPI int32_t U_EXPORT2 1401 uregex_splitUText(URegularExpression *regexp, 1402 UText *destFields[], 1403 int32_t destFieldsCapacity, 1404 UErrorCode *status); 1405 1406 /** 1407 * Set a processing time limit for match operations with this URegularExpression. 1408 * 1409 * Some patterns, when matching certain strings, can run in exponential time. 1410 * For practical purposes, the match operation may appear to be in an 1411 * infinite loop. 1412 * When a limit is set a match operation will fail with an error if the 1413 * limit is exceeded. 1414 * <p> 1415 * The units of the limit are steps of the match engine. 1416 * Correspondence with actual processor time will depend on the speed 1417 * of the processor and the details of the specific pattern, but will 1418 * typically be on the order of milliseconds. 1419 * <p> 1420 * By default, the matching time is not limited. 1421 * <p> 1422 * 1423 * @param regexp The compiled regular expression. 1424 * @param limit The limit value, or 0 for no limit. 1425 * @param status A reference to a UErrorCode to receive any errors. 1426 * @stable ICU 4.0 1427 */ 1428 U_CAPI void U_EXPORT2 1429 uregex_setTimeLimit(URegularExpression *regexp, 1430 int32_t limit, 1431 UErrorCode *status); 1432 1433 /** 1434 * Get the time limit for matches with this URegularExpression. 1435 * A return value of zero indicates that there is no limit. 1436 * 1437 * @param regexp The compiled regular expression.
1438 * @param status A reference to a UErrorCode to receive any errors. 1439 * @return the maximum allowed time for a match, in units of processing steps. 1440 * @stable ICU 4.0 1441 */ 1442 U_CAPI int32_t U_EXPORT2 1443 uregex_getTimeLimit(const URegularExpression *regexp, 1444 UErrorCode *status); 1445 1446 /** 1447 * Set the amount of heap storage available for use by the match backtracking stack. 1448 * <p> 1449 * ICU uses a backtracking regular expression engine, with the backtrack stack 1450 * maintained on the heap. This function sets the limit to the amount of memory 1451 * that can be used for this purpose. A backtracking stack overflow will 1452 * result in an error from the match operation that caused it. 1453 * <p> 1454 * A limit is desirable because a malicious or poorly designed pattern can use 1455 * excessive memory, potentially crashing the process. A limit is enabled 1456 * by default. 1457 * <p> 1458 * @param regexp The compiled regular expression. 1459 * @param limit The maximum size, in bytes, of the matching backtrack stack. 1460 * A value of zero means no limit. 1461 * The limit must be greater than or equal to zero. 1462 * @param status A reference to a UErrorCode to receive any errors. 1463 * 1464 * @stable ICU 4.0 1465 */ 1466 U_CAPI void U_EXPORT2 1467 uregex_setStackLimit(URegularExpression *regexp, 1468 int32_t limit, 1469 UErrorCode *status); 1470 1471 /** 1472 * Get the size of the heap storage available for use by the back tracking stack. 1473 * 1474 * @return the maximum backtracking stack size, in bytes, or zero if the 1475 * stack size is unlimited. 1476 * @stable ICU 4.0 1477 */ 1478 U_CAPI int32_t U_EXPORT2 1479 uregex_getStackLimit(const URegularExpression *regexp, 1480 UErrorCode *status); 1481 1482 1483 /** 1484 * Function pointer for a regular expression matching callback function. 1485 * When set, a callback function will be called periodically during matching 1486 * operations. 
If the callback function returns false, the matching 1487 * operation will be terminated early. 1488 * 1489 * Note: the callback function must not call other functions on this 1490 * URegularExpression. 1491 * 1492 * @param context context pointer. The callback function will be invoked 1493 * with the context specified at the time that 1494 * uregex_setMatchCallback() is called. 1495 * @param steps the accumulated processing time, in match steps, 1496 * for this matching operation. 1497 * @return true to continue the matching operation. 1498 * false to terminate the matching operation. 1499 * @stable ICU 4.0 1500 */ 1501 U_CDECL_BEGIN 1502 typedef UBool U_CALLCONV URegexMatchCallback ( 1503 const void *context, 1504 int32_t steps); 1505 U_CDECL_END 1506 1507 /** 1508 * Set a callback function for this URegularExpression. 1509 * During matching operations the function will be called periodically, 1510 * giving the application the opportunity to terminate a long-running 1511 * match. 1512 * 1513 * @param regexp The compiled regular expression. 1514 * @param callback A pointer to the user-supplied callback function. 1515 * @param context User context pointer. The value supplied at the 1516 * time the callback function is set will be saved 1517 * and passed to the callback each time that it is called. 1518 * @param status A reference to a UErrorCode to receive any errors. 1519 * @stable ICU 4.0 1520 */ 1521 U_CAPI void U_EXPORT2 1522 uregex_setMatchCallback(URegularExpression *regexp, 1523 URegexMatchCallback *callback, 1524 const void *context, 1525 UErrorCode *status); 1526 1527 1528 /** 1529 * Get the callback function for this URegularExpression. 1530 * 1531 * @param regexp The compiled regular expression. 1532 * @param callback Out parameter, receives a pointer to the user-supplied 1533 * callback function. 1534 * @param context Out parameter, receives the user context pointer that 1535 * was set when uregex_setMatchCallback() was called.
1536 * @param status A reference to a UErrorCode to receive any errors. 1537 * @stable ICU 4.0 1538 */ 1539 U_CAPI void U_EXPORT2 1540 uregex_getMatchCallback(const URegularExpression *regexp, 1541 URegexMatchCallback **callback, 1542 const void **context, 1543 UErrorCode *status); 1544 1545 /** 1546 * Function pointer for a regular expression find callback function. 1547 * 1548 * When set, a callback function will be called during a find operation 1549 * and for operations that depend on find, such as findNext, split and some replace 1550 * operations like replaceFirst. 1551 * The callback will usually be called after each attempt at a match, but this is not a 1552 * guarantee that the callback will be invoked at each character. For finds where the 1553 * match engine is invoked at each character, this may be close to true, but less likely 1554 * for more optimized loops where the pattern is known to only start, and the match 1555 * engine invoked, at certain characters. 1556 * When invoked, this callback will specify the index at which a match operation is about 1557 * to be attempted, giving the application the opportunity to terminate a long-running 1558 * find operation. 1559 * 1560 * If the call back function returns false, the find operation will be terminated early. 1561 * 1562 * Note: the callback function must not call other functions on this 1563 * URegularExpression 1564 * 1565 * @param context context pointer. The callback function will be invoked 1566 * with the context specified at the time that 1567 * uregex_setFindProgressCallback() is called. 1568 * @param matchIndex the next index at which a match attempt will be attempted for this 1569 * find operation. If this callback interrupts the search, this is the 1570 * index at which a find/findNext operation may be re-initiated. 1571 * @return true to continue the matching operation. 1572 * false to terminate the matching operation. 
1573 * @stable ICU 4.6 1574 */ 1575 U_CDECL_BEGIN 1576 typedef UBool U_CALLCONV URegexFindProgressCallback ( 1577 const void *context, 1578 int64_t matchIndex); 1579 U_CDECL_END 1580 1581 1582 /** 1583 * Set the find progress callback function for this URegularExpression. 1584 * 1585 * @param regexp The compiled regular expression. 1586 * @param callback A pointer to the user-supplied callback function. 1587 * @param context User context pointer. The value supplied at the 1588 * time the callback function is set will be saved 1589 * and passed to the callback each time that it is called. 1590 * @param status A reference to a UErrorCode to receive any errors. 1591 * @stable ICU 4.6 1592 */ 1593 U_CAPI void U_EXPORT2 1594 uregex_setFindProgressCallback(URegularExpression *regexp, 1595 URegexFindProgressCallback *callback, 1596 const void *context, 1597 UErrorCode *status); 1598 1599 /** 1600 * Get the find progress callback function for this URegularExpression. 1601 * 1602 * @param regexp The compiled regular expression. 1603 * @param callback Out parameter, receives a pointer to the user-supplied 1604 * callback function. 1605 * @param context Out parameter, receives the user context pointer that 1606 * was set when uregex_setFindProgressCallback() was called. 1607 * @param status A reference to a UErrorCode to receive any errors. 1608 * @stable ICU 4.6 1609 */ 1610 U_CAPI void U_EXPORT2 1611 uregex_getFindProgressCallback(const URegularExpression *regexp, 1612 URegexFindProgressCallback **callback, 1613 const void **context, 1614 UErrorCode *status); 1615 1616 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 1617 #endif /* UREGEX_H */
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |