|
||||
Warning, file /include/unicode/edits.h was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).
0001 // © 2016 and later: Unicode, Inc. and others. 0002 // License & terms of use: http://www.unicode.org/copyright.html 0003 0004 // edits.h 0005 // created: 2016dec30 Markus W. Scherer 0006 0007 #ifndef __EDITS_H__ 0008 #define __EDITS_H__ 0009 0010 #include "unicode/utypes.h" 0011 0012 #if U_SHOW_CPLUSPLUS_API 0013 0014 #include "unicode/uobject.h" 0015 0016 /** 0017 * \file 0018 * \brief C++ API: C++ class Edits for low-level string transformations on styled text. 0019 */ 0020 0021 U_NAMESPACE_BEGIN 0022 0023 class UnicodeString; 0024 0025 /** 0026 * Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions 0027 * in linear progression. Does not support moving/reordering of text. 0028 * 0029 * There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to 0030 * instances of this class using {@link #addReplace(int32_t, int32_t)} (for change edits) and 0031 * {@link #addUnchanged(int32_t)} (for no-change edits). Change edits are retained with full granularity, 0032 * whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one 0033 * mapping between code points in the source and destination strings. 0034 * 0035 * After all edits have been added, instances of this class should be considered immutable, and an 0036 * {@link Edits::Iterator} can be used for queries. 0037 * 0038 * There are four flavors of Edits::Iterator: 0039 * 0040 * <ul> 0041 * <li>{@link #getFineIterator()} retains full granularity of change edits. 0042 * <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling 0043 * next() on the iterator, skips over no-change edits (unchanged regions). 0044 * <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change 0045 * edits are automatically merged during the construction phase.) 0046 * <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when 0047 * calling next() on the iterator, skips over no-change edits (unchanged regions). 0048 * </ul> 0049 * 0050 * For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the 0051 * following fine edits: 0052 * <ul> 0053 * <li>abc ⇨ abc (no-change) 0054 * <li>ß ⇨ ss (change) 0055 * <li>D ⇨ d (change) 0056 * <li>e ⇨ e (no-change) 0057 * <li>F ⇨ f (change) 0058 * </ul> 0059 * and the following coarse edits (note how adjacent change edits get merged together): 0060 * <ul> 0061 * <li>abc ⇨ abc (no-change) 0062 * <li>ßD ⇨ ssd (change) 0063 * <li>e ⇨ e (no-change) 0064 * <li>F ⇨ f (change) 0065 * </ul> 0066 * 0067 * The "fine changes" and "coarse changes" iterators will step through only the change edits when their 0068 * `Edits::Iterator::next()` methods are called. They are identical to the non-change iterators when 0069 * their `Edits::Iterator::findSourceIndex()` or `Edits::Iterator::findDestinationIndex()` 0070 * methods are used to walk through the string. 0071 * 0072 * For examples of how to use this class, see the test `TestCaseMapEditsIteratorDocs` in 0073 * UCharacterCaseTest.java. 0074 * 0075 * An Edits object tracks a separate UErrorCode, but ICU string transformation functions 0076 * (e.g., case mapping functions) merge any such errors into their API's UErrorCode. 0077 * 0078 * @stable ICU 59 0079 */ 0080 class U_COMMON_API Edits final : public UMemory { 0081 public: 0082 /** 0083 * Constructs an empty object. 0084 * @stable ICU 59 0085 */ 0086 Edits() : 0087 array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0), 0088 errorCode_(U_ZERO_ERROR) {} 0089 /** 0090 * Copy constructor. 0091 * @param other source edits 0092 * @stable ICU 60 0093 */ 0094 Edits(const Edits &other) : 0095 array(stackArray), capacity(STACK_CAPACITY), length(other.length), 0096 delta(other.delta), numChanges(other.numChanges), 0097 errorCode_(other.errorCode_) { 0098 copyArray(other); 0099 } 0100 /** 0101 * Move constructor, might leave src empty. 0102 * This object will have the same contents that the source object had. 0103 * @param src source edits 0104 * @stable ICU 60 0105 */ 0106 Edits(Edits &&src) noexcept : 0107 array(stackArray), capacity(STACK_CAPACITY), length(src.length), 0108 delta(src.delta), numChanges(src.numChanges), 0109 errorCode_(src.errorCode_) { 0110 moveArray(src); 0111 } 0112 0113 /** 0114 * Destructor. 0115 * @stable ICU 59 0116 */ 0117 ~Edits(); 0118 0119 /** 0120 * Assignment operator. 0121 * @param other source edits 0122 * @return *this 0123 * @stable ICU 60 0124 */ 0125 Edits &operator=(const Edits &other); 0126 0127 /** 0128 * Move assignment operator, might leave src empty. 0129 * This object will have the same contents that the source object had. 0130 * The behavior is undefined if *this and src are the same object. 0131 * @param src source edits 0132 * @return *this 0133 * @stable ICU 60 0134 */ 0135 Edits &operator=(Edits &&src) noexcept; 0136 0137 /** 0138 * Resets the data but may not release memory. 0139 * @stable ICU 59 0140 */ 0141 void reset() noexcept; 0142 0143 /** 0144 * Adds a no-change edit: a record for an unchanged segment of text. 0145 * Normally called from inside ICU string transformation functions, not user code. 0146 * @stable ICU 59 0147 */ 0148 void addUnchanged(int32_t unchangedLength); 0149 /** 0150 * Adds a change edit: a record for a text replacement/insertion/deletion. 0151 * Normally called from inside ICU string transformation functions, not user code. 0152 * @stable ICU 59 0153 */ 0154 void addReplace(int32_t oldLength, int32_t newLength); 0155 /** 0156 * Sets the UErrorCode if an error occurred while recording edits. 0157 * Preserves older error codes in the outErrorCode. 0158 * Normally called from inside ICU string transformation functions, not user code. 0159 * @param outErrorCode Set to an error code if it does not contain one already 0160 * and an error occurred while recording edits. 0161 * Otherwise unchanged. 0162 * @return true if U_FAILURE(outErrorCode) 0163 * @stable ICU 59 0164 */ 0165 UBool copyErrorTo(UErrorCode &outErrorCode) const; 0166 0167 /** 0168 * How much longer is the new text compared with the old text? 0169 * @return new length minus old length 0170 * @stable ICU 59 0171 */ 0172 int32_t lengthDelta() const { return delta; } 0173 /** 0174 * @return true if there are any change edits 0175 * @stable ICU 59 0176 */ 0177 UBool hasChanges() const { return numChanges != 0; } 0178 0179 /** 0180 * @return the number of change edits 0181 * @stable ICU 60 0182 */ 0183 int32_t numberOfChanges() const { return numChanges; } 0184 0185 /** 0186 * Access to the list of edits. 0187 * 0188 * At any moment in time, an instance of this class points to a single edit: a "window" into a span 0189 * of the source string and the corresponding span of the destination string. The source string span 0190 * starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string 0191 * span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars. 0192 * 0193 * The iterator can be moved between edits using the `next()`, `findSourceIndex(int32_t, UErrorCode &)`, 0194 * and `findDestinationIndex(int32_t, UErrorCode &)` methods. 0195 * Calling any of these methods mutates the iterator to make it point to the corresponding edit. 0196 * 0197 * For more information, see the documentation for {@link Edits}. 0198 * 0199 * @see getCoarseIterator 0200 * @see getFineIterator 0201 * @stable ICU 59 0202 */ 0203 struct U_COMMON_API Iterator final : public UMemory { 0204 /** 0205 * Default constructor, empty iterator. 0206 * @stable ICU 60 0207 */ 0208 Iterator() : 0209 array(nullptr), index(0), length(0), 0210 remaining(0), onlyChanges_(false), coarse(false), 0211 dir(0), changed(false), oldLength_(0), newLength_(0), 0212 srcIndex(0), replIndex(0), destIndex(0) {} 0213 /** 0214 * Copy constructor. 0215 * @stable ICU 59 0216 */ 0217 Iterator(const Iterator &other) = default; 0218 /** 0219 * Assignment operator. 0220 * @stable ICU 59 0221 */ 0222 Iterator &operator=(const Iterator &other) = default; 0223 0224 /** 0225 * Advances the iterator to the next edit. 0226 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 0227 * or else the function returns immediately. Check for U_FAILURE() 0228 * on output or use with function chaining. (See User Guide for details.) 0229 * @return true if there is another edit 0230 * @stable ICU 59 0231 */ 0232 UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); } 0233 0234 /** 0235 * Moves the iterator to the edit that contains the source index. 0236 * The source index may be found in a no-change edit 0237 * even if normal iteration would skip no-change edits. 0238 * Normal iteration can continue from a found edit. 0239 * 0240 * The iterator state before this search logically does not matter. 0241 * (It may affect the performance of the search.) 0242 * 0243 * The iterator state after this search is undefined 0244 * if the source index is out of bounds for the source string. 0245 * 0246 * @param i source index 0247 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 0248 * or else the function returns immediately. Check for U_FAILURE() 0249 * on output or use with function chaining. (See User Guide for details.) 0250 * @return true if the edit for the source index was found 0251 * @stable ICU 59 0252 */ 0253 UBool findSourceIndex(int32_t i, UErrorCode &errorCode) { 0254 return findIndex(i, true, errorCode) == 0; 0255 } 0256 0257 /** 0258 * Moves the iterator to the edit that contains the destination index. 0259 * The destination index may be found in a no-change edit 0260 * even if normal iteration would skip no-change edits. 0261 * Normal iteration can continue from a found edit. 0262 * 0263 * The iterator state before this search logically does not matter. 0264 * (It may affect the performance of the search.) 0265 * 0266 * The iterator state after this search is undefined 0267 * if the source index is out of bounds for the source string. 0268 * 0269 * @param i destination index 0270 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 0271 * or else the function returns immediately. Check for U_FAILURE() 0272 * on output or use with function chaining. (See User Guide for details.) 0273 * @return true if the edit for the destination index was found 0274 * @stable ICU 60 0275 */ 0276 UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) { 0277 return findIndex(i, false, errorCode) == 0; 0278 } 0279 0280 /** 0281 * Computes the destination index corresponding to the given source index. 0282 * If the source index is inside a change edit (not at its start), 0283 * then the destination index at the end of that edit is returned, 0284 * since there is no information about index mapping inside a change edit. 0285 * 0286 * (This means that indexes to the start and middle of an edit, 0287 * for example around a grapheme cluster, are mapped to indexes 0288 * encompassing the entire edit. 0289 * The alternative, mapping an interior index to the start, 0290 * would map such an interval to an empty one.) 0291 * 0292 * This operation will usually but not always modify this object. 0293 * The iterator state after this search is undefined. 0294 * 0295 * @param i source index 0296 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 0297 * or else the function returns immediately. Check for U_FAILURE() 0298 * on output or use with function chaining. (See User Guide for details.) 0299 * @return destination index; undefined if i is not 0..string length 0300 * @stable ICU 60 0301 */ 0302 int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode); 0303 0304 /** 0305 * Computes the source index corresponding to the given destination index. 0306 * If the destination index is inside a change edit (not at its start), 0307 * then the source index at the end of that edit is returned, 0308 * since there is no information about index mapping inside a change edit. 0309 * 0310 * (This means that indexes to the start and middle of an edit, 0311 * for example around a grapheme cluster, are mapped to indexes 0312 * encompassing the entire edit. 0313 * The alternative, mapping an interior index to the start, 0314 * would map such an interval to an empty one.) 0315 * 0316 * This operation will usually but not always modify this object. 0317 * The iterator state after this search is undefined. 0318 * 0319 * @param i destination index 0320 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 0321 * or else the function returns immediately. Check for U_FAILURE() 0322 * on output or use with function chaining. (See User Guide for details.) 0323 * @return source index; undefined if i is not 0..string length 0324 * @stable ICU 60 0325 */ 0326 int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode); 0327 0328 /** 0329 * Returns whether the edit currently represented by the iterator is a change edit. 0330 * 0331 * @return true if this edit replaces oldLength() units with newLength() different ones. 0332 * false if oldLength units remain unchanged. 0333 * @stable ICU 59 0334 */ 0335 UBool hasChange() const { return changed; } 0336 0337 /** 0338 * The length of the current span in the source string, which starts at {@link #sourceIndex}. 0339 * 0340 * @return the number of units in the original string which are replaced or remain unchanged. 0341 * @stable ICU 59 0342 */ 0343 int32_t oldLength() const { return oldLength_; } 0344 0345 /** 0346 * The length of the current span in the destination string, which starts at 0347 * {@link #destinationIndex}, or in the replacement string, which starts at 0348 * {@link #replacementIndex}. 0349 * 0350 * @return the number of units in the modified string, if hasChange() is true. 0351 * Same as oldLength if hasChange() is false. 0352 * @stable ICU 59 0353 */ 0354 int32_t newLength() const { return newLength_; } 0355 0356 /** 0357 * The start index of the current span in the source string; the span has length 0358 * {@link #oldLength}. 0359 * 0360 * @return the current index into the source string 0361 * @stable ICU 59 0362 */ 0363 int32_t sourceIndex() const { return srcIndex; } 0364 0365 /** 0366 * The start index of the current span in the replacement string; the span has length 0367 * {@link #newLength}. Well-defined only if the current edit is a change edit. 0368 * 0369 * The *replacement string* is the concatenation of all substrings of the destination 0370 * string corresponding to change edits. 0371 * 0372 * This method is intended to be used together with operations that write only replacement 0373 * characters (e.g. operations specifying the \ref U_OMIT_UNCHANGED_TEXT option). 0374 * The source string can then be modified in-place. 0375 * 0376 * @return the current index into the replacement-characters-only string, 0377 * not counting unchanged spans 0378 * @stable ICU 59 0379 */ 0380 int32_t replacementIndex() const { 0381 // TODO: Throw an exception if we aren't in a change edit? 0382 return replIndex; 0383 } 0384 0385 /** 0386 * The start index of the current span in the destination string; the span has length 0387 * {@link #newLength}. 0388 * 0389 * @return the current index into the full destination string 0390 * @stable ICU 59 0391 */ 0392 int32_t destinationIndex() const { return destIndex; } 0393 0394 #ifndef U_HIDE_INTERNAL_API 0395 /** 0396 * A string representation of the current edit represented by the iterator for debugging. You 0397 * should not depend on the contents of the return string. 0398 * @internal 0399 */ 0400 UnicodeString& toString(UnicodeString& appendTo) const; 0401 #endif // U_HIDE_INTERNAL_API 0402 0403 private: 0404 friend class Edits; 0405 0406 Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs); 0407 0408 int32_t readLength(int32_t head); 0409 void updateNextIndexes(); 0410 void updatePreviousIndexes(); 0411 UBool noNext(); 0412 UBool next(UBool onlyChanges, UErrorCode &errorCode); 0413 UBool previous(UErrorCode &errorCode); 0414 /** @return -1: error or i<0; 0: found; 1: i>=string length */ 0415 int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode); 0416 0417 const uint16_t *array; 0418 int32_t index, length; 0419 // 0 if we are not within compressed equal-length changes. 0420 // Otherwise the number of remaining changes, including the current one. 0421 int32_t remaining; 0422 UBool onlyChanges_, coarse; 0423 0424 int8_t dir; // iteration direction: back(<0), initial(0), forward(>0) 0425 UBool changed; 0426 int32_t oldLength_, newLength_; 0427 int32_t srcIndex, replIndex, destIndex; 0428 }; 0429 0430 /** 0431 * Returns an Iterator for coarse-grained change edits 0432 * (adjacent change edits are treated as one). 0433 * Can be used to perform simple string updates. 0434 * Skips no-change edits. 0435 * @return an Iterator that merges adjacent changes. 0436 * @stable ICU 59 0437 */ 0438 Iterator getCoarseChangesIterator() const { 0439 return Iterator(array, length, true, true); 0440 } 0441 0442 /** 0443 * Returns an Iterator for coarse-grained change and no-change edits 0444 * (adjacent change edits are treated as one). 0445 * Can be used to perform simple string updates. 0446 * Adjacent change edits are treated as one edit. 0447 * @return an Iterator that merges adjacent changes. 0448 * @stable ICU 59 0449 */ 0450 Iterator getCoarseIterator() const { 0451 return Iterator(array, length, false, true); 0452 } 0453 0454 /** 0455 * Returns an Iterator for fine-grained change edits 0456 * (full granularity of change edits is retained). 0457 * Can be used for modifying styled text. 0458 * Skips no-change edits. 0459 * @return an Iterator that separates adjacent changes. 0460 * @stable ICU 59 0461 */ 0462 Iterator getFineChangesIterator() const { 0463 return Iterator(array, length, true, false); 0464 } 0465 0466 /** 0467 * Returns an Iterator for fine-grained change and no-change edits 0468 * (full granularity of change edits is retained). 0469 * Can be used for modifying styled text. 0470 * @return an Iterator that separates adjacent changes. 0471 * @stable ICU 59 0472 */ 0473 Iterator getFineIterator() const { 0474 return Iterator(array, length, false, false); 0475 } 0476 0477 /** 0478 * Merges the two input Edits and appends the result to this object. 0479 * 0480 * Consider two string transformations (for example, normalization and case mapping) 0481 * where each records Edits in addition to writing an output string.<br> 0482 * Edits ab reflect how substrings of input string a 0483 * map to substrings of intermediate string b.<br> 0484 * Edits bc reflect how substrings of intermediate string b 0485 * map to substrings of output string c.<br> 0486 * This function merges ab and bc such that the additional edits 0487 * recorded in this object reflect how substrings of input string a 0488 * map to substrings of output string c. 0489 * 0490 * If unrelated Edits are passed in where the output string of the first 0491 * has a different length than the input string of the second, 0492 * then a U_ILLEGAL_ARGUMENT_ERROR is reported. 0493 * 0494 * @param ab reflects how substrings of input string a 0495 * map to substrings of intermediate string b. 0496 * @param bc reflects how substrings of intermediate string b 0497 * map to substrings of output string c. 0498 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 0499 * or else the function returns immediately. Check for U_FAILURE() 0500 * on output or use with function chaining. (See User Guide for details.) 0501 * @return *this, with the merged edits appended 0502 * @stable ICU 60 0503 */ 0504 Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode); 0505 0506 private: 0507 void releaseArray() noexcept; 0508 Edits ©Array(const Edits &other); 0509 Edits &moveArray(Edits &src) noexcept; 0510 0511 void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; } 0512 int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; } 0513 0514 void append(int32_t r); 0515 UBool growArray(); 0516 0517 static const int32_t STACK_CAPACITY = 100; 0518 uint16_t *array; 0519 int32_t capacity; 0520 int32_t length; 0521 int32_t delta; 0522 int32_t numChanges; 0523 UErrorCode errorCode_; 0524 uint16_t stackArray[STACK_CAPACITY]; 0525 }; 0526 0527 U_NAMESPACE_END 0528 0529 #endif /* U_SHOW_CPLUSPLUS_API */ 0530 0531 #endif // __EDITS_H__
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |