|
||||
Warning, file /include/unicode/search.h was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).
0001 // © 2016 and later: Unicode, Inc. and others. 0002 // License & terms of use: http://www.unicode.org/copyright.html 0003 /* 0004 ********************************************************************** 0005 * Copyright (C) 2001-2011 IBM and others. All rights reserved. 0006 ********************************************************************** 0007 * Date Name Description 0008 * 03/22/2000 helena Creation. 0009 ********************************************************************** 0010 */ 0011 0012 #ifndef SEARCH_H 0013 #define SEARCH_H 0014 0015 #include "unicode/utypes.h" 0016 0017 #if U_SHOW_CPLUSPLUS_API 0018 0019 /** 0020 * \file 0021 * \brief C++ API: SearchIterator object. 0022 */ 0023 0024 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 0025 0026 #include "unicode/uobject.h" 0027 #include "unicode/unistr.h" 0028 #include "unicode/chariter.h" 0029 #include "unicode/brkiter.h" 0030 #include "unicode/usearch.h" 0031 0032 /** 0033 * @stable ICU 2.0 0034 */ 0035 struct USearch; 0036 /** 0037 * @stable ICU 2.0 0038 */ 0039 typedef struct USearch USearch; 0040 0041 U_NAMESPACE_BEGIN 0042 0043 /** 0044 * 0045 * <tt>SearchIterator</tt> is an abstract base class that provides 0046 * methods to search for a pattern within a text string. Instances of 0047 * <tt>SearchIterator</tt> maintain a current position and scans over the 0048 * target text, returning the indices the pattern is matched and the length 0049 * of each match. 0050 * <p> 0051 * <tt>SearchIterator</tt> defines a protocol for text searching. 0052 * Subclasses provide concrete implementations of various search algorithms. 0053 * For example, <tt>StringSearch</tt> implements language-sensitive pattern 0054 * matching based on the comparison rules defined in a 0055 * <tt>RuleBasedCollator</tt> object. 0056 * <p> 0057 * Other options for searching includes using a BreakIterator to restrict 0058 * the points at which matches are detected. 0059 * <p> 0060 * <tt>SearchIterator</tt> provides an API that is similar to that of 0061 * other text iteration classes such as <tt>BreakIterator</tt>. Using 0062 * this class, it is easy to scan through text looking for all occurrences of 0063 * a given pattern. The following example uses a <tt>StringSearch</tt> 0064 * object to find all instances of "fox" in the target string. Any other 0065 * subclass of <tt>SearchIterator</tt> can be used in an identical 0066 * manner. 0067 * <pre><code> 0068 * UnicodeString target("The quick brown fox jumped over the lazy fox"); 0069 * UnicodeString pattern("fox"); 0070 * 0071 * SearchIterator *iter = new StringSearch(pattern, target); 0072 * UErrorCode error = U_ZERO_ERROR; 0073 * for (int pos = iter->first(error); pos != USEARCH_DONE; 0074 * pos = iter->next(error)) { 0075 * printf("Found match at %d pos, length is %d\n", pos, iter.getMatchedLength()); 0076 * } 0077 * </code></pre> 0078 * 0079 * @see StringSearch 0080 * @see RuleBasedCollator 0081 */ 0082 class U_I18N_API SearchIterator : public UObject { 0083 0084 public: 0085 0086 // public constructors and destructors ------------------------------- 0087 0088 /** 0089 * Copy constructor that creates a SearchIterator instance with the same 0090 * behavior, and iterating over the same text. 0091 * @param other the SearchIterator instance to be copied. 0092 * @stable ICU 2.0 0093 */ 0094 SearchIterator(const SearchIterator &other); 0095 0096 /** 0097 * Destructor. Cleans up the search iterator data struct. 0098 * @stable ICU 2.0 0099 */ 0100 virtual ~SearchIterator(); 0101 0102 // public get and set methods ---------------------------------------- 0103 0104 /** 0105 * Sets the index to point to the given position, and clears any state 0106 * that's affected. 0107 * <p> 0108 * This method takes the argument index and sets the position in the text 0109 * string accordingly without checking if the index is pointing to a 0110 * valid starting point to begin searching. 0111 * @param position within the text to be set. If position is less 0112 * than or greater than the text range for searching, 0113 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 0114 * @param status for errors if it occurs 0115 * @stable ICU 2.0 0116 */ 0117 virtual void setOffset(int32_t position, UErrorCode &status) = 0; 0118 0119 /** 0120 * Return the current index in the text being searched. 0121 * If the iteration has gone past the end of the text 0122 * (or past the beginning for a backwards search), USEARCH_DONE 0123 * is returned. 0124 * @return current index in the text being searched. 0125 * @stable ICU 2.0 0126 */ 0127 virtual int32_t getOffset(void) const = 0; 0128 0129 /** 0130 * Sets the text searching attributes located in the enum 0131 * USearchAttribute with values from the enum USearchAttributeValue. 0132 * USEARCH_DEFAULT can be used for all attributes for resetting. 0133 * @param attribute text attribute (enum USearchAttribute) to be set 0134 * @param value text attribute value 0135 * @param status for errors if it occurs 0136 * @stable ICU 2.0 0137 */ 0138 void setAttribute(USearchAttribute attribute, 0139 USearchAttributeValue value, 0140 UErrorCode &status); 0141 0142 /** 0143 * Gets the text searching attributes 0144 * @param attribute text attribute (enum USearchAttribute) to be retrieve 0145 * @return text attribute value 0146 * @stable ICU 2.0 0147 */ 0148 USearchAttributeValue getAttribute(USearchAttribute attribute) const; 0149 0150 /** 0151 * Returns the index to the match in the text string that was searched. 0152 * This call returns a valid result only after a successful call to 0153 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. 0154 * Just after construction, or after a searching method returns 0155 * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>. 0156 * <p> 0157 * Use getMatchedLength to get the matched string length. 0158 * @return index of a substring within the text string that is being 0159 * searched. 0160 * @see #first 0161 * @see #next 0162 * @see #previous 0163 * @see #last 0164 * @stable ICU 2.0 0165 */ 0166 int32_t getMatchedStart(void) const; 0167 0168 /** 0169 * Returns the length of text in the string which matches the search 0170 * pattern. This call returns a valid result only after a successful call 0171 * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. 0172 * Just after construction, or after a searching method returns 0173 * <tt>USEARCH_DONE</tt>, this method will return 0. 0174 * @return The length of the match in the target text, or 0 if there 0175 * is no match currently. 0176 * @see #first 0177 * @see #next 0178 * @see #previous 0179 * @see #last 0180 * @stable ICU 2.0 0181 */ 0182 int32_t getMatchedLength(void) const; 0183 0184 /** 0185 * Returns the text that was matched by the most recent call to 0186 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. 0187 * If the iterator is not pointing at a valid match (e.g. just after 0188 * construction or after <tt>USEARCH_DONE</tt> has been returned, 0189 * returns an empty string. 0190 * @param result stores the matched string or an empty string if a match 0191 * is not found. 0192 * @see #first 0193 * @see #next 0194 * @see #previous 0195 * @see #last 0196 * @stable ICU 2.0 0197 */ 0198 void getMatchedText(UnicodeString &result) const; 0199 0200 /** 0201 * Set the BreakIterator that will be used to restrict the points 0202 * at which matches are detected. The user is responsible for deleting 0203 * the breakiterator. 0204 * @param breakiter A BreakIterator that will be used to restrict the 0205 * points at which matches are detected. If a match is 0206 * found, but the match's start or end index is not a 0207 * boundary as determined by the <tt>BreakIterator</tt>, 0208 * the match will be rejected and another will be searched 0209 * for. If this parameter is <tt>nullptr</tt>, no break 0210 * detection is attempted. 0211 * @param status for errors if it occurs 0212 * @see BreakIterator 0213 * @stable ICU 2.0 0214 */ 0215 void setBreakIterator(BreakIterator *breakiter, UErrorCode &status); 0216 0217 /** 0218 * Returns the BreakIterator that is used to restrict the points at 0219 * which matches are detected. This will be the same object that was 0220 * passed to the constructor or to <tt>setBreakIterator</tt>. 0221 * Note that <tt>nullptr</tt> is a legal value; it means that break 0222 * detection should not be attempted. 0223 * @return BreakIterator used to restrict matchings. 0224 * @see #setBreakIterator 0225 * @stable ICU 2.0 0226 */ 0227 const BreakIterator * getBreakIterator(void) const; 0228 0229 /** 0230 * Set the string text to be searched. Text iteration will hence begin at 0231 * the start of the text string. This method is useful if you want to 0232 * re-use an iterator to search for the same pattern within a different 0233 * body of text. The user is responsible for deleting the text. 0234 * @param text string to be searched. 0235 * @param status for errors. If the text length is 0, 0236 * an U_ILLEGAL_ARGUMENT_ERROR is returned. 0237 * @stable ICU 2.0 0238 */ 0239 virtual void setText(const UnicodeString &text, UErrorCode &status); 0240 0241 /** 0242 * Set the string text to be searched. Text iteration will hence begin at 0243 * the start of the text string. This method is useful if you want to 0244 * re-use an iterator to search for the same pattern within a different 0245 * body of text. 0246 * <p> 0247 * Note: No parsing of the text within the <tt>CharacterIterator</tt> 0248 * will be done during searching for this version. The block of text 0249 * in <tt>CharacterIterator</tt> will be used as it is. 0250 * The user is responsible for deleting the text. 0251 * @param text string iterator to be searched. 0252 * @param status for errors if any. If the text length is 0 then an 0253 * U_ILLEGAL_ARGUMENT_ERROR is returned. 0254 * @stable ICU 2.0 0255 */ 0256 virtual void setText(CharacterIterator &text, UErrorCode &status); 0257 0258 /** 0259 * Return the string text to be searched. 0260 * @return text string to be searched. 0261 * @stable ICU 2.0 0262 */ 0263 const UnicodeString & getText(void) const; 0264 0265 // operator overloading ---------------------------------------------- 0266 0267 /** 0268 * Equality operator. 0269 * @param that SearchIterator instance to be compared. 0270 * @return true if both BreakIterators are of the same class, have the 0271 * same behavior, terates over the same text and have the same 0272 * attributes. false otherwise. 0273 * @stable ICU 2.0 0274 */ 0275 virtual bool operator==(const SearchIterator &that) const; 0276 0277 /** 0278 * Not-equal operator. 0279 * @param that SearchIterator instance to be compared. 0280 * @return false if operator== returns true, and vice versa. 0281 * @stable ICU 2.0 0282 */ 0283 bool operator!=(const SearchIterator &that) const; 0284 0285 // public methods ---------------------------------------------------- 0286 0287 /** 0288 * Returns a copy of SearchIterator with the same behavior, and 0289 * iterating over the same text, as this one. Note that all data will be 0290 * replicated, except for the text string to be searched. 0291 * @return cloned object 0292 * @stable ICU 2.0 0293 */ 0294 virtual SearchIterator* safeClone(void) const = 0; 0295 0296 /** 0297 * Returns the first index at which the string text matches the search 0298 * pattern. The iterator is adjusted so that its current index (as 0299 * returned by <tt>getOffset</tt>) is the match position if one 0300 * was found. 0301 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 0302 * the iterator will be adjusted to the index USEARCH_DONE 0303 * @param status for errors if it occurs 0304 * @return The character index of the first match, or 0305 * <tt>USEARCH_DONE</tt> if there are no matches. 0306 * @see #getOffset 0307 * @stable ICU 2.0 0308 */ 0309 int32_t first(UErrorCode &status); 0310 0311 /** 0312 * Returns the first index equal or greater than <tt>position</tt> at which the 0313 * string text matches the search pattern. The iterator is adjusted so 0314 * that its current index (as returned by <tt>getOffset</tt>) is the 0315 * match position if one was found. 0316 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the 0317 * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. 0318 * @param position where search if to start from. If position is less 0319 * than or greater than the text range for searching, 0320 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 0321 * @param status for errors if it occurs 0322 * @return The character index of the first match following 0323 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no 0324 * matches. 0325 * @see #getOffset 0326 * @stable ICU 2.0 0327 */ 0328 int32_t following(int32_t position, UErrorCode &status); 0329 0330 /** 0331 * Returns the last index in the target text at which it matches the 0332 * search pattern. The iterator is adjusted so that its current index 0333 * (as returned by <tt>getOffset</tt>) is the match position if one was 0334 * found. 0335 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 0336 * the iterator will be adjusted to the index USEARCH_DONE. 0337 * @param status for errors if it occurs 0338 * @return The index of the first match, or <tt>USEARCH_DONE</tt> if 0339 * there are no matches. 0340 * @see #getOffset 0341 * @stable ICU 2.0 0342 */ 0343 int32_t last(UErrorCode &status); 0344 0345 /** 0346 * Returns the first index less than <tt>position</tt> at which the string 0347 * text matches the search pattern. The iterator is adjusted so that its 0348 * current index (as returned by <tt>getOffset</tt>) is the match 0349 * position if one was found. If a match is not found, 0350 * <tt>USEARCH_DONE</tt> will be returned and the iterator will be 0351 * adjusted to the index USEARCH_DONE 0352 * <p> 0353 * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the 0354 * result match is always less than <tt>position</tt>. 0355 * When <tt>USERARCH_OVERLAP</tt> is on, the result match may span across 0356 * <tt>position</tt>. 0357 * 0358 * @param position where search is to start from. If position is less 0359 * than or greater than the text range for searching, 0360 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned 0361 * @param status for errors if it occurs 0362 * @return The character index of the first match preceding 0363 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are 0364 * no matches. 0365 * @see #getOffset 0366 * @stable ICU 2.0 0367 */ 0368 int32_t preceding(int32_t position, UErrorCode &status); 0369 0370 /** 0371 * Returns the index of the next point at which the text matches the 0372 * search pattern, starting from the current position 0373 * The iterator is adjusted so that its current index (as returned by 0374 * <tt>getOffset</tt>) is the match position if one was found. 0375 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 0376 * the iterator will be adjusted to a position after the end of the text 0377 * string. 0378 * @param status for errors if it occurs 0379 * @return The index of the next match after the current position, 0380 * or <tt>USEARCH_DONE</tt> if there are no more matches. 0381 * @see #getOffset 0382 * @stable ICU 2.0 0383 */ 0384 int32_t next(UErrorCode &status); 0385 0386 /** 0387 * Returns the index of the previous point at which the string text 0388 * matches the search pattern, starting at the current position. 0389 * The iterator is adjusted so that its current index (as returned by 0390 * <tt>getOffset</tt>) is the match position if one was found. 0391 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and 0392 * the iterator will be adjusted to the index USEARCH_DONE 0393 * @param status for errors if it occurs 0394 * @return The index of the previous match before the current position, 0395 * or <tt>USEARCH_DONE</tt> if there are no more matches. 0396 * @see #getOffset 0397 * @stable ICU 2.0 0398 */ 0399 int32_t previous(UErrorCode &status); 0400 0401 /** 0402 * Resets the iteration. 0403 * Search will begin at the start of the text string if a forward 0404 * iteration is initiated before a backwards iteration. Otherwise if a 0405 * backwards iteration is initiated before a forwards iteration, the 0406 * search will begin at the end of the text string. 0407 * @stable ICU 2.0 0408 */ 0409 virtual void reset(); 0410 0411 protected: 0412 // protected data members --------------------------------------------- 0413 0414 /** 0415 * C search data struct 0416 * @stable ICU 2.0 0417 */ 0418 USearch *m_search_; 0419 0420 /** 0421 * Break iterator. 0422 * Currently the C++ breakiterator does not have getRules etc to reproduce 0423 * another in C. Hence we keep the original around and do the verification 0424 * at the end of the match. The user is responsible for deleting this 0425 * break iterator. 0426 * @stable ICU 2.0 0427 */ 0428 BreakIterator *m_breakiterator_; 0429 0430 /** 0431 * Unicode string version of the search text 0432 * @stable ICU 2.0 0433 */ 0434 UnicodeString m_text_; 0435 0436 // protected constructors and destructors ----------------------------- 0437 0438 /** 0439 * Default constructor. 0440 * Initializes data to the default values. 0441 * @stable ICU 2.0 0442 */ 0443 SearchIterator(); 0444 0445 /** 0446 * Constructor for use by subclasses. 0447 * @param text The target text to be searched. 0448 * @param breakiter A {@link BreakIterator} that is used to restrict the 0449 * points at which matches are detected. If 0450 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 0451 * match, but the match's start or end index is not a 0452 * boundary as determined by the <tt>BreakIterator</tt>, 0453 * the match is rejected and <tt>handleNext</tt> or 0454 * <tt>handlePrev</tt> is called again. If this parameter 0455 * is <tt>nullptr</tt>, no break detection is attempted. 0456 * @see #handleNext 0457 * @see #handlePrev 0458 * @stable ICU 2.0 0459 */ 0460 SearchIterator(const UnicodeString &text, 0461 BreakIterator *breakiter = nullptr); 0462 0463 /** 0464 * Constructor for use by subclasses. 0465 * <p> 0466 * Note: No parsing of the text within the <tt>CharacterIterator</tt> 0467 * will be done during searching for this version. The block of text 0468 * in <tt>CharacterIterator</tt> will be used as it is. 0469 * @param text The target text to be searched. 0470 * @param breakiter A {@link BreakIterator} that is used to restrict the 0471 * points at which matches are detected. If 0472 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 0473 * match, but the match's start or end index is not a 0474 * boundary as determined by the <tt>BreakIterator</tt>, 0475 * the match is rejected and <tt>handleNext</tt> or 0476 * <tt>handlePrev</tt> is called again. If this parameter 0477 * is <tt>nullptr</tt>, no break detection is attempted. 0478 * @see #handleNext 0479 * @see #handlePrev 0480 * @stable ICU 2.0 0481 */ 0482 SearchIterator(CharacterIterator &text, BreakIterator *breakiter = nullptr); 0483 0484 // protected methods -------------------------------------------------- 0485 0486 /** 0487 * Assignment operator. Sets this iterator to have the same behavior, 0488 * and iterate over the same text, as the one passed in. 0489 * @param that instance to be copied. 0490 * @stable ICU 2.0 0491 */ 0492 SearchIterator & operator=(const SearchIterator &that); 0493 0494 /** 0495 * Abstract method which subclasses override to provide the mechanism 0496 * for finding the next match in the target text. This allows different 0497 * subclasses to provide different search algorithms. 0498 * <p> 0499 * If a match is found, the implementation should return the index at 0500 * which the match starts and should call 0501 * <tt>setMatchLength</tt> with the number of characters 0502 * in the target text that make up the match. If no match is found, the 0503 * method should return USEARCH_DONE. 0504 * <p> 0505 * @param position The index in the target text at which the search 0506 * should start. 0507 * @param status for error codes if it occurs. 0508 * @return index at which the match starts, else if match is not found 0509 * USEARCH_DONE is returned 0510 * @see #setMatchLength 0511 * @stable ICU 2.0 0512 */ 0513 virtual int32_t handleNext(int32_t position, UErrorCode &status) 0514 = 0; 0515 0516 /** 0517 * Abstract method which subclasses override to provide the mechanism for 0518 * finding the previous match in the target text. This allows different 0519 * subclasses to provide different search algorithms. 0520 * <p> 0521 * If a match is found, the implementation should return the index at 0522 * which the match starts and should call 0523 * <tt>setMatchLength</tt> with the number of characters 0524 * in the target text that make up the match. If no match is found, the 0525 * method should return USEARCH_DONE. 0526 * <p> 0527 * @param position The index in the target text at which the search 0528 * should start. 0529 * @param status for error codes if it occurs. 0530 * @return index at which the match starts, else if match is not found 0531 * USEARCH_DONE is returned 0532 * @see #setMatchLength 0533 * @stable ICU 2.0 0534 */ 0535 virtual int32_t handlePrev(int32_t position, UErrorCode &status) 0536 = 0; 0537 0538 /** 0539 * Sets the length of the currently matched string in the text string to 0540 * be searched. 0541 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> 0542 * methods should call this when they find a match in the target text. 0543 * @param length length of the matched text. 0544 * @see #handleNext 0545 * @see #handlePrev 0546 * @stable ICU 2.0 0547 */ 0548 virtual void setMatchLength(int32_t length); 0549 0550 /** 0551 * Sets the offset of the currently matched string in the text string to 0552 * be searched. 0553 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> 0554 * methods should call this when they find a match in the target text. 0555 * @param position start offset of the matched text. 0556 * @see #handleNext 0557 * @see #handlePrev 0558 * @stable ICU 2.0 0559 */ 0560 virtual void setMatchStart(int32_t position); 0561 0562 /** 0563 * sets match not found 0564 * @stable ICU 2.0 0565 */ 0566 void setMatchNotFound(); 0567 }; 0568 0569 inline bool SearchIterator::operator!=(const SearchIterator &that) const 0570 { 0571 return !operator==(that); 0572 } 0573 U_NAMESPACE_END 0574 0575 #endif /* #if !UCONFIG_NO_COLLATION */ 0576 0577 #endif /* U_SHOW_CPLUSPLUS_API */ 0578 0579 #endif 0580
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |