Back to home page

EIC code displayed by LXR

 
 

    


Warning, file /include/unicode/search.h was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).

0001 // © 2016 and later: Unicode, Inc. and others.
0002 // License & terms of use: http://www.unicode.org/copyright.html
0003 /*
0004 **********************************************************************
0005 *   Copyright (C) 2001-2011 IBM and others. All rights reserved.
0006 **********************************************************************
0007 *   Date        Name        Description
0008 *  03/22/2000   helena      Creation.
0009 **********************************************************************
0010 */
0011 
0012 #ifndef SEARCH_H
0013 #define SEARCH_H
0014 
0015 #include "unicode/utypes.h"
0016 
0017 #if U_SHOW_CPLUSPLUS_API
0018 
0019 /**
0020  * \file 
0021  * \brief C++ API: SearchIterator object.
0022  */
0023  
0024 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
0025 
0026 #include "unicode/uobject.h"
0027 #include "unicode/unistr.h"
0028 #include "unicode/chariter.h"
0029 #include "unicode/brkiter.h"
0030 #include "unicode/usearch.h"
0031 
0032 /** Forward declaration of the C search data struct (no definition in this header).
0033 * @stable ICU 2.0
0034 */
0035 struct USearch;
0036 /** Lets the struct be referred to as plain <tt>USearch</tt>, without the struct keyword.
0037 * @stable ICU 2.0
0038 */
0039 typedef struct USearch USearch;
0040 
0041 U_NAMESPACE_BEGIN
0042 
0043 /**
0044  *
0045  * <tt>SearchIterator</tt> is an abstract base class that provides 
0046  * methods to search for a pattern within a text string. Instances of
0047  * <tt>SearchIterator</tt> maintain a current position and scan over the 
0048  * target text, returning the indices at which the pattern is matched and
0049  * the length of each match.
0050  * <p>
0051  * <tt>SearchIterator</tt> defines a protocol for text searching. 
0052  * Subclasses provide concrete implementations of various search algorithms. 
0053  * For example, <tt>StringSearch</tt> implements language-sensitive pattern 
0054  * matching based on the comparison rules defined in a 
0055  * <tt>RuleBasedCollator</tt> object. 
0056  * <p> 
0057  * Other options for searching include using a BreakIterator to restrict 
0058  * the points at which matches are detected.
0059  * <p>
0060  * <tt>SearchIterator</tt> provides an API that is similar to that of
0061  * other text iteration classes such as <tt>BreakIterator</tt>. Using 
0062  * this class, it is easy to scan through text looking for all occurrences of 
0063  * a given pattern. The following example uses a <tt>StringSearch</tt> 
0064  * object to find all instances of "fox" in the target string. Any other 
0065  * subclass of <tt>SearchIterator</tt> can be used in an identical 
0066  * manner.
0067  * <pre><code>
0068  * UnicodeString target("The quick brown fox jumped over the lazy fox");
0069  * UnicodeString pattern("fox");
0070  *
0071  * SearchIterator *iter  = new StringSearch(pattern, target);
0072  * UErrorCode      error = U_ZERO_ERROR;
0073  * for (int pos = iter->first(error); pos != USEARCH_DONE; 
0074  *                               pos = iter->next(error)) {
0075  *     printf("Found match at %d pos, length is %d\n", pos, iter->getMatchedLength());
0076  * }
0077  * </code></pre>
0078  *
0079  * @see StringSearch
0080  * @see RuleBasedCollator
0081  */
0082 class U_I18N_API SearchIterator : public UObject {
0083 
0084 public:
0085 
0086     // public constructors and destructors -------------------------------
0087 
0088     /** 
0089     * Copy constructor that creates a SearchIterator instance with the same 
0090     * behavior, and iterating over the same text. 
0091     * @param other the SearchIterator instance to be copied.
0092     * @stable ICU 2.0
0093     */
0094     SearchIterator(const SearchIterator &other);
0095 
0096     /**
0097      * Destructor. Cleans up the search iterator data struct.
0098      * @stable ICU 2.0
0099      */
0100     virtual ~SearchIterator();
0101 
0102     // public get and set methods ----------------------------------------
0103 
0104     /**
0105      * Sets the index to point to the given position, and clears any state 
0106      * that's affected.
0107      * <p>
0108      * This method takes the argument index and sets the position in the text 
0109      * string accordingly without checking if the index is pointing to a 
0110      * valid starting point to begin searching. 
0111      * @param position within the text to be set. If position is outside
0112      *             the text range for searching (before its start or past its
0113      *             end), a U_INDEX_OUTOFBOUNDS_ERROR is reported through status
0114      * @param status for errors if any occur
0115      * @stable ICU 2.0
0116      */
0117     virtual void setOffset(int32_t position, UErrorCode &status) = 0;
0118 
0119     /**
0120      * Return the current index in the text being searched.
0121      * If the iteration has gone past the end of the text
0122      * (or past the beginning for a backwards search), USEARCH_DONE
0123      * is returned.
0124      * @return current index in the text being searched.
0125      * @stable ICU 2.0
0126      */
0127     virtual int32_t getOffset(void) const = 0;
0128 
0129     /**
0130     * Sets the text searching attributes located in the enum 
0131     * USearchAttribute with values from the enum USearchAttributeValue.
0132     * USEARCH_DEFAULT can be used for all attributes for resetting.
0133     * @param attribute text attribute (enum USearchAttribute) to be set
0134     * @param value text attribute value
0135     * @param status for errors if any occur
0136     * @stable ICU 2.0
0137     */
0138     void setAttribute(USearchAttribute       attribute,
0139                       USearchAttributeValue  value,
0140                       UErrorCode            &status);
0141 
0142     /**    
0143     * Gets the text searching attributes
0144     * @param attribute text attribute (enum USearchAttribute) to be retrieved
0145     * @return text attribute value
0146     * @stable ICU 2.0
0147     */
0148     USearchAttributeValue getAttribute(USearchAttribute  attribute) const;
0149     
0150     /**
0151     * Returns the index to the match in the text string that was searched.
0152     * This call returns a valid result only after a successful call to 
0153     * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
0154     * Just after construction, or after a searching method returns 
0155     * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
0156     * <p>
0157     * Use getMatchedLength to get the matched string length.
0158     * @return index of a substring within the text string that is being 
0159     *         searched.
0160     * @see #first
0161     * @see #next
0162     * @see #previous
0163     * @see #last
0164     * @stable ICU 2.0
0165     */
0166     int32_t getMatchedStart(void) const;
0167 
0168     /**
0169      * Returns the length of text in the string which matches the search 
0170      * pattern. This call returns a valid result only after a successful call 
0171      * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
0172      * Just after construction, or after a searching method returns 
0173      * <tt>USEARCH_DONE</tt>, this method will return 0.
0174      * @return The length of the match in the target text, or 0 if there
0175      *         is no match currently.
0176      * @see #first
0177      * @see #next
0178      * @see #previous
0179      * @see #last
0180      * @stable ICU 2.0
0181      */
0182     int32_t getMatchedLength(void) const;
0183     
0184     /**
0185      * Returns the text that was matched by the most recent call to 
0186      * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
0187      * If the iterator is not pointing at a valid match (e.g. just after 
0188      * construction or after <tt>USEARCH_DONE</tt> has been returned), 
0189      * returns an empty string. 
0190      * @param result stores the matched string or an empty string if a match
0191      *        is not found.
0192      * @see #first
0193      * @see #next
0194      * @see #previous
0195      * @see #last
0196      * @stable ICU 2.0
0197      */
0198     void getMatchedText(UnicodeString &result) const;
0199     
0200     /**
0201      * Set the BreakIterator that will be used to restrict the points
0202      * at which matches are detected. The user is responsible for deleting 
0203      * the breakiterator.
0204      * @param breakiter A BreakIterator that will be used to restrict the 
0205      *                points at which matches are detected. If a match is 
0206      *                found, but the match's start or end index is not a 
0207      *                boundary as determined by the <tt>BreakIterator</tt>, 
0208      *                the match will be rejected and another will be searched 
0209      *                for. If this parameter is <tt>nullptr</tt>, no break
0210      *                detection is attempted.
0211      * @param status for errors if any occur
0212      * @see BreakIterator
0213      * @stable ICU 2.0
0214      */
0215     void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
0216     
0217     /**
0218      * Returns the BreakIterator that is used to restrict the points at 
0219      * which matches are detected.  This will be the same object that was 
0220      * passed to the constructor or to <tt>setBreakIterator</tt>.
0221      * Note that <tt>nullptr</tt> is a legal value; it means that break
0222      * detection should not be attempted.
0223      * @return BreakIterator used to restrict matchings.
0224      * @see #setBreakIterator
0225      * @stable ICU 2.0
0226      */
0227     const BreakIterator * getBreakIterator(void) const;
0228 
0229     /**
0230      * Set the string text to be searched. Text iteration will hence begin at 
0231      * the start of the text string. This method is useful if you want to 
0232      * re-use an iterator to search for the same pattern within a different 
0233      * body of text. The user is responsible for deleting the text.
0234      * @param text string to be searched.
0235      * @param status for errors. If the text length is 0, 
0236      *        a U_ILLEGAL_ARGUMENT_ERROR is returned.
0237      * @stable ICU 2.0
0238      */
0239     virtual void setText(const UnicodeString &text, UErrorCode &status);    
0240 
0241     /**
0242      * Set the string text to be searched. Text iteration will hence begin at 
0243      * the start of the text string. This method is useful if you want to 
0244      * re-use an iterator to search for the same pattern within a different 
0245      * body of text.
0246      * <p>
0247      * Note: No parsing of the text within the <tt>CharacterIterator</tt> 
0248      * will be done during searching for this version. The block of text 
0249      * in <tt>CharacterIterator</tt> will be used as it is.
0250      * The user is responsible for deleting the text.
0251      * @param text string iterator to be searched.
0252      * @param status for errors if any. If the text length is 0 then a 
0253      *        U_ILLEGAL_ARGUMENT_ERROR is returned.
0254      * @stable ICU 2.0
0255      */
0256     virtual void setText(CharacterIterator &text, UErrorCode &status);
0257     
0258     /**
0259      * Return the string text to be searched.
0260      * @return text string to be searched.
0261      * @stable ICU 2.0
0262      */
0263     const UnicodeString & getText(void) const;
0264 
0265     // operator overloading ----------------------------------------------
0266 
0267     /**
0268      * Equality operator. 
0269      * @param that SearchIterator instance to be compared.
0270      * @return true if both SearchIterators are of the same class, have the 
0271      *         same behavior, iterate over the same text and have the same
0272      *         attributes. false otherwise.
0273      * @stable ICU 2.0
0274      */
0275     virtual bool operator==(const SearchIterator &that) const;
0276 
0277     /**
0278      * Not-equal operator. 
0279      * @param that SearchIterator instance to be compared.
0280      * @return false if operator== returns true, and vice versa.
0281      * @stable ICU 2.0
0282      */
0283     bool operator!=(const SearchIterator &that) const;
0284 
0285     // public methods ----------------------------------------------------
0286 
0287     /**
0288      * Returns a copy of SearchIterator with the same behavior, and 
0289      * iterating over the same text, as this one. Note that all data will be
0290      * replicated, except for the text string to be searched.
0291      * @return cloned object
0292      * @stable ICU 2.0
0293      */
0294     virtual SearchIterator* safeClone(void) const = 0;
0295 
0296     /**
0297      * Returns the first index at which the string text matches the search 
0298      * pattern. The iterator is adjusted so that its current index (as 
0299      * returned by <tt>getOffset</tt>) is the match position if one 
0300      * was found.
0301      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
0302      * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
0303      * @param  status for errors if any occur
0304      * @return The character index of the first match, or 
0305      *         <tt>USEARCH_DONE</tt> if there are no matches.
0306      * @see #getOffset
0307      * @stable ICU 2.0
0308      */
0309     int32_t first(UErrorCode &status);
0310 
0311     /**
0312      * Returns the first index equal or greater than <tt>position</tt> at which the 
0313      * string text matches the search pattern. The iterator is adjusted so 
0314      * that its current index (as returned by <tt>getOffset</tt>) is the 
0315      * match position if one was found.
0316      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
0317      * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
0318      * @param  position where search is to start from. If position is outside
0319      *             the text range for searching (before its start or past its
0320      *             end), a U_INDEX_OUTOFBOUNDS_ERROR will be returned
0321      * @param  status for errors if any occur
0322      * @return The character index of the first match following 
0323      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no 
0324      *         matches.
0325      * @see #getOffset
0326      * @stable ICU 2.0
0327      */
0328     int32_t following(int32_t position, UErrorCode &status);
0329     
0330     /**
0331      * Returns the last index in the target text at which it matches the 
0332      * search pattern. The iterator is adjusted so that its current index 
0333      * (as returned by <tt>getOffset</tt>) is the match position if one was 
0334      * found.
0335      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
0336      * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
0337      * @param  status for errors if any occur
0338      * @return The index of the last match, or <tt>USEARCH_DONE</tt> if 
0339      *         there are no matches.
0340      * @see #getOffset
0341      * @stable ICU 2.0
0342      */
0343     int32_t last(UErrorCode &status);
0344 
0345     /**
0346      * Returns the first index less than <tt>position</tt> at which the string 
0347      * text matches the search pattern. The iterator is adjusted so that its 
0348      * current index (as returned by <tt>getOffset</tt>) is the match 
0349      * position if one was found. If a match is not found, 
0350      * <tt>USEARCH_DONE</tt> will be returned and the iterator will be 
0351      * adjusted to the index <tt>USEARCH_DONE</tt>.
0352      * <p>
0353      * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
0354      * result match is always less than <tt>position</tt>.
0355      * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across
0356      * <tt>position</tt>.
0357      *
0358      * @param  position where search is to start from. If position is outside
0359      *             the text range for searching (before its start or past its
0360      *             end), a U_INDEX_OUTOFBOUNDS_ERROR will be returned
0361      * @param  status for errors if any occur
0362      * @return The character index of the first match preceding 
0363      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are 
0364      *         no matches.
0365      * @see #getOffset
0366      * @stable ICU 2.0
0367      */
0368     int32_t preceding(int32_t position, UErrorCode &status);
0369 
0370     /**
0371      * Returns the index of the next point at which the text matches the
0372      * search pattern, starting from the current position.
0373      * The iterator is adjusted so that its current index (as returned by 
0374      * <tt>getOffset</tt>) is the match position if one was found.
0375      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
0376      * the iterator will be adjusted to a position after the end of the text 
0377      * string.
0378      * @param  status for errors if any occur
0379      * @return The index of the next match after the current position,
0380      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
0381      * @see #getOffset
0382      * @stable ICU 2.0
0383      */
0384      int32_t next(UErrorCode &status);
0385 
0386     /**
0387      * Returns the index of the previous point at which the string text 
0388      * matches the search pattern, starting at the current position.
0389      * The iterator is adjusted so that its current index (as returned by 
0390      * <tt>getOffset</tt>) is the match position if one was found.
0391      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
0392      * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
0393      * @param  status for errors if any occur
0394      * @return The index of the previous match before the current position,
0395      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
0396      * @see #getOffset
0397      * @stable ICU 2.0
0398      */
0399     int32_t previous(UErrorCode &status);
0400 
0401     /** 
0402     * Resets the iteration.
0403     * Search will begin at the start of the text string if a forward 
0404     * iteration is initiated before a backwards iteration. Otherwise if a 
0405     * backwards iteration is initiated before a forwards iteration, the 
0406     * search will begin at the end of the text string.    
0407     * @stable ICU 2.0
0408     */
0409     virtual void reset();
0410 
0411 protected:
0412     // protected data members ---------------------------------------------
0413 
0414     /**
0415     * C search data struct
0416     * @stable ICU 2.0
0417     */
0418     USearch *m_search_;
0419 
0420     /**
0421     * Break iterator.
0422     * Currently the C++ breakiterator does not have getRules etc to reproduce
0423     * another in C. Hence we keep the original around and do the verification
0424     * at the end of the match. The user is responsible for deleting this
0425     * break iterator.
0426     * @stable ICU 2.0
0427     */
0428     BreakIterator *m_breakiterator_;
0429     
0430     /**
0431     * Unicode string version of the search text
0432     * @stable ICU 2.0
0433     */
0434     UnicodeString  m_text_;
0435 
0436     // protected constructors and destructors -----------------------------
0437 
0438     /**
0439     * Default constructor.
0440     * Initializes data to the default values.
0441     * @stable ICU 2.0
0442     */
0443     SearchIterator();
0444 
0445     /**
0446      * Constructor for use by subclasses.
0447      * @param text The target text to be searched.
0448      * @param breakiter A {@link BreakIterator} that is used to restrict the 
0449      *                points at which matches are detected. If 
0450      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 
0451      *                match, but the match's start or end index is not a 
0452      *                boundary as determined by the <tt>BreakIterator</tt>, 
0453      *                the match is rejected and <tt>handleNext</tt> or 
0454      *                <tt>handlePrev</tt> is called again. If this parameter 
0455      *                is <tt>nullptr</tt>, no break detection is attempted.
0456      * @see #handleNext
0457      * @see #handlePrev
0458      * @stable ICU 2.0
0459      */
0460     SearchIterator(const UnicodeString &text, 
0461                          BreakIterator *breakiter = nullptr);
0462 
0463     /**
0464      * Constructor for use by subclasses.
0465      * <p>
0466      * Note: No parsing of the text within the <tt>CharacterIterator</tt> 
0467      * will be done during searching for this version. The block of text 
0468      * in <tt>CharacterIterator</tt> will be used as it is.
0469      * @param text The target text to be searched.
0470      * @param breakiter A {@link BreakIterator} that is used to restrict the 
0471      *                points at which matches are detected. If 
0472      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a 
0473      *                match, but the match's start or end index is not a 
0474      *                boundary as determined by the <tt>BreakIterator</tt>, 
0475      *                the match is rejected and <tt>handleNext</tt> or 
0476      *                <tt>handlePrev</tt> is called again. If this parameter 
0477      *                is <tt>nullptr</tt>, no break detection is attempted.
0478      * @see #handleNext
0479      * @see #handlePrev
0480      * @stable ICU 2.0
0481      */
0482     SearchIterator(CharacterIterator &text, BreakIterator *breakiter = nullptr);
0483 
0484     // protected methods --------------------------------------------------
0485 
0486     /**
0487      * Assignment operator. Sets this iterator to have the same behavior,
0488      * and iterate over the same text, as the one passed in.
0489      * @param that instance to be copied.
0490      * @stable ICU 2.0
0491      */
0492     SearchIterator & operator=(const SearchIterator &that);
0493 
0494     /**
0495      * Abstract method which subclasses override to provide the mechanism
0496      * for finding the next match in the target text. This allows different
0497      * subclasses to provide different search algorithms.
0498      * <p>
0499      * If a match is found, the implementation should return the index at
0500      * which the match starts and should call 
0501      * <tt>setMatchLength</tt> with the number of characters 
0502      * in the target text that make up the match. If no match is found, the 
0503      * method should return USEARCH_DONE.
0504      * <p>
0505      * @param position The index in the target text at which the search 
0506      *                 should start.
0507      * @param status for error codes if any occur.
0508      * @return index at which the match starts, else if match is not found 
0509      *         USEARCH_DONE is returned
0510      * @see #setMatchLength
0511      * @stable ICU 2.0
0512      */
0513     virtual int32_t handleNext(int32_t position, UErrorCode &status) 
0514                                                                          = 0;
0515 
0516     /**
0517      * Abstract method which subclasses override to provide the mechanism for
0518      * finding the previous match in the target text. This allows different
0519      * subclasses to provide different search algorithms.
0520      * <p>
0521      * If a match is found, the implementation should return the index at
0522      * which the match starts and should call 
0523      * <tt>setMatchLength</tt> with the number of characters 
0524      * in the target text that make up the match. If no match is found, the 
0525      * method should return USEARCH_DONE.
0526      * <p>
0527      * @param position The index in the target text at which the search 
0528      *                 should start.
0529      * @param status for error codes if any occur.
0530      * @return index at which the match starts, else if match is not found 
0531      *         USEARCH_DONE is returned
0532      * @see #setMatchLength
0533      * @stable ICU 2.0
0534      */
0535      virtual int32_t handlePrev(int32_t position, UErrorCode &status) 
0536                                                                          = 0;
0537 
0538     /**
0539      * Sets the length of the currently matched string in the text string to
0540      * be searched.
0541      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
0542      * methods should call this when they find a match in the target text.
0543      * @param length length of the matched text.
0544      * @see #handleNext
0545      * @see #handlePrev
0546      * @stable ICU 2.0
0547      */
0548     virtual void setMatchLength(int32_t length);
0549 
0550     /**
0551      * Sets the offset of the currently matched string in the text string to
0552      * be searched.
0553      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
0554      * methods should call this when they find a match in the target text.
0555      * @param position start offset of the matched text.
0556      * @see #handleNext
0557      * @see #handlePrev
0558      * @stable ICU 2.0
0559      */
0560     virtual void setMatchStart(int32_t position);
0561 
0562     /**
0563     * Sets match not found.
0564     * @stable ICU 2.0
0565     */
0566     void setMatchNotFound();
0567 };
0568 
0569 inline bool SearchIterator::operator!=(const SearchIterator &that) const
0570 {   // Inequality is the exact negation of the virtual operator==, so subclasses only override ==.
0571     return !(this->operator==(that));
0572 }
0573 U_NAMESPACE_END
0574 
0575 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */
0576 
0577 #endif /* U_SHOW_CPLUSPLUS_API */
0578 
0579 #endif /* SEARCH_H */
0580