include/opencascade/NCollection_UtfIterator.hxx

0001 // Created on: 2013-01-28
0002 // Created by: Kirill GAVRILOV
0003 // Copyright (c) 2013-2014 OPEN CASCADE SAS
0004 //
0005 // This file is part of Open CASCADE Technology software library.
0006 //
0007 // This library is free software; you can redistribute it and/or modify it under
0008 // the terms of the GNU Lesser General Public License version 2.1 as published
0009 // by the Free Software Foundation, with special exception defined in the file
0010 // OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
0011 // distribution for complete text of the license and disclaimer of any warranty.
0012 //
0013 // Alternatively, this file may be used under the terms of Open CASCADE
0014 // commercial license or contractual agreement.
0015
0016 #ifndef NCollection_UtfIterator_HeaderFile
0017 #define NCollection_UtfIterator_HeaderFile
0018
0019 #include <Standard_Handle.hxx>
0020
0021 //! Template class for Unicode strings support.
0022 //!
0023 //! It defines an iterator and provide correct way to read multi-byte text (UTF-8 and UTF-16)
0024 //! and convert it from one to another.
0025 //! The current value of iterator is returned as UTF-32 Unicode symbol.
0026 //!
0027 //! Here and below term "Unicode symbol" is used as
0028 //! synonym of "Unicode code point".
0029 template<typename Type>
0030 class NCollection_UtfIterator
0031 {
0032
0033 public:
0034
0035   //! Constructor.
0036   //! @param theString buffer to iterate
0037   NCollection_UtfIterator (const Type* theString)
0038   : myPosition(theString),
0039     myPosNext(theString),
0040     myCharIndex(0),
0041     myCharUtf32(0)
0042   {
0043     if (theString != NULL)
0044     {
0045       ++(*this);
0046       myCharIndex = 0;
0047     }
0048   }
0049
0050   //! Initialize iterator within specified NULL-terminated string.
0051   void Init (const Type* theString)
0052   {
0053     myPosition  = theString;
0054     myPosNext   = theString;
0055     myCharUtf32 = 0;
0056     if (theString != NULL)
0057     {
0058       ++(*this);
0059     }
0060     myCharIndex = 0;
0061   }
0062
0063   //! Pre-increment operator. Reads the next unicode symbol.
0064   //! Notice - no protection against overrun!
0065   NCollection_UtfIterator& operator++()
0066   {
0067     myPosition = myPosNext;
0068     ++myCharIndex;
0069     readNext (static_cast<const typename CharTypeChooser<Type>::type*>(0));
0070     return *this;
0071   }
0072
0073   //! Post-increment operator.
0074   //! Notice - no protection against overrun!
0075   NCollection_UtfIterator operator++ (int )
0076   {
0077     NCollection_UtfIterator aCopy = *this;
0078     ++*this;
0079     return aCopy;
0080   }
0081
0082   //! Equality operator.
0083   bool operator== (const NCollection_UtfIterator& theRight) const
0084   {
0085     return myPosition == theRight.myPosition;
0086   }
0087
0088   //! Return true if Unicode symbol is within valid range.
0089   bool IsValid() const
0090   {
0091     return myCharUtf32 <= UTF32_MAX_LEGAL;
0092   }
0093
0094   //! Dereference operator.
0095   //! @return the UTF-32 codepoint of the symbol currently pointed by iterator.
0096   Standard_Utf32Char operator*() const
0097   {
0098     return myCharUtf32;
0099   }
0100
0101   //! Buffer-fetching getter.
0102   const Type* BufferHere() const { return myPosition; }
0103
0104   //! Buffer-fetching getter. Dangerous! Iterator should be reinitialized on buffer change.
0105   Type* ChangeBufferHere() { return (Type* )myPosition; }
0106
0107   //! Buffer-fetching getter.
0108   const Type* BufferNext() const { return myPosNext; }
0109
0110   //! @return the index displacement from iterator initialization
0111   //!         (first symbol has index 0)
0112   Standard_Integer Index() const
0113   {
0114     return myCharIndex;
0115   }
0116
0117   //! @return the advance in bytes to store current symbol in UTF-8.
0118   //! 0 means an invalid symbol;
0119   //! 1-4 bytes are valid range.
0120   Standard_Integer AdvanceBytesUtf8() const;
0121
0122   //! @return the advance in bytes to store current symbol in UTF-16.
0123   //! 0 means an invalid symbol;
0124   //! 2 bytes is a general case;
0125   //! 4 bytes for surrogate pair.
0126   Standard_Integer AdvanceBytesUtf16() const;
0127
0128   //! @return the advance in bytes to store current symbol in UTF-16.
0129   //! 0 means an invalid symbol;
0130   //! 1 16-bit code unit is a general case;
0131   //! 2 16-bit code units for surrogate pair.
0132   Standard_Integer AdvanceCodeUnitsUtf16() const;
0133
0134   //! @return the advance in bytes to store current symbol in UTF-32.
0135   //! Always 4 bytes (method for consistency).
0136   Standard_Integer AdvanceBytesUtf32() const
0137   {
0138     return Standard_Integer(sizeof(Standard_Utf32Char));
0139   }
0140
0141   //! Fill the UTF-8 buffer within current Unicode symbol.
0142   //! Use method AdvanceUtf8() to allocate buffer with enough size.
0143   //! @param theBuffer buffer to fill
0144   //! @return new buffer position (for next char)
0145   Standard_Utf8Char*  GetUtf8 (Standard_Utf8Char*  theBuffer) const;
0146   Standard_Utf8UChar* GetUtf8 (Standard_Utf8UChar* theBuffer) const;
0147
0148   //! Fill the UTF-16 buffer within current Unicode symbol.
0149   //! Use method AdvanceUtf16() to allocate buffer with enough size.
0150   //! @param theBuffer buffer to fill
0151   //! @return new buffer position (for next char)
0152   Standard_Utf16Char* GetUtf16 (Standard_Utf16Char* theBuffer) const;
0153
0154   //! Fill the UTF-32 buffer within current Unicode symbol.
0155   //! Use method AdvanceUtf32() to allocate buffer with enough size.
0156   //! @param theBuffer buffer to fill
0157   //! @return new buffer position (for next char)
0158   Standard_Utf32Char* GetUtf32 (Standard_Utf32Char* theBuffer) const;
0159
0160   //! @return the advance in TypeWrite chars needed to store current symbol
0161   template<typename TypeWrite>
0162   inline Standard_Integer AdvanceBytesUtf() const
0163   {
0164     return advanceBytes(static_cast<const typename CharTypeChooser<TypeWrite>::type*>(0));
0165   }
0166
0167   //! Fill the UTF-** buffer within current Unicode symbol.
0168   //! Use method AdvanceUtf**() to allocate buffer with enough size.
0169   //! @param theBuffer buffer to fill
0170   //! @return new buffer position (for next char)
0171   template<typename TypeWrite>
0172   inline TypeWrite* GetUtf (TypeWrite* theBuffer) const
0173   {
0174     return (TypeWrite*)(getUtf (reinterpret_cast<typename CharTypeChooser<TypeWrite>::type*>(theBuffer)));
0175   }
0176
0177 private:
0178
0179   //! Helper template class dispatching its argument class
0180   //! to the equivalent (by size) character (Unicode code unit) type.
0181   //! The code unit type is defined as nested typedef "type".
0182   //!
0183   //! In practice this is relevant for wchar_t type:
0184   //! typename CharTypeChooser<wchar_t>::type resolves to
0185   //! Standard_Utf16Char on Windows and to Standard_Utf32Char on Linux.
0186   template <typename TypeChar>
0187   class CharTypeChooser :
0188     public   std::conditional< sizeof(TypeChar) == 1, Standard_Utf8Char,
0189     typename std::conditional< sizeof(TypeChar) == 2, Standard_Utf16Char,
0190     typename std::conditional< sizeof(TypeChar) == 4, Standard_Utf32Char, void >::type >::type >
0191   {
0192   };
0193
0194   //! Helper function for reading a single Unicode symbol from the UTF-8 string.
0195   //! Updates internal state appropriately.
0196   void readUTF8();
0197
0198   //! Helper function for reading a single Unicode symbol from the UTF-16 string.
0199   //! Updates internal state appropriately.
0200   void readUTF16();
0201
0202   //! Helper overload methods to dispatch reading function depending on code unit size
0203   void readNext (const Standard_Utf8Char*)  { readUTF8(); }
0204   void readNext (const Standard_Utf16Char*) { readUTF16(); }
0205   void readNext (const Standard_Utf32Char*) { myCharUtf32 = *myPosNext++; }
0206
0207   //! Helper overload methods to dispatch advance function depending on code unit size
0208   Standard_Integer advanceBytes (const Standard_Utf8Char*)  const { return AdvanceBytesUtf8(); }
0209   Standard_Integer advanceBytes (const Standard_Utf16Char*) const { return AdvanceBytesUtf16(); }
0210   Standard_Integer advanceBytes (const Standard_Utf32Char*) const { return AdvanceBytesUtf32(); }
0211
0212   //! Helper overload methods to dispatch getter function depending on code unit size
0213   Standard_Utf8Char*  getUtf (Standard_Utf8Char*  theBuffer) const { return GetUtf8 (theBuffer); }
0214   Standard_Utf16Char* getUtf (Standard_Utf16Char* theBuffer) const { return GetUtf16(theBuffer); }
0215   Standard_Utf32Char* getUtf (Standard_Utf32Char* theBuffer) const { return GetUtf32(theBuffer); }
0216
0217 private: //! @name unicode magic numbers
0218
0219   static const unsigned char      UTF8_BYTES_MINUS_ONE[256];
0220   static const Standard_Utf32Char offsetsFromUTF8[6];
0221   static const unsigned char      UTF8_FIRST_BYTE_MARK[7];
0222   static const Standard_Utf32Char UTF8_BYTE_MASK;
0223   static const Standard_Utf32Char UTF8_BYTE_MARK;
0224   static const Standard_Utf32Char UTF16_SURROGATE_HIGH_START;
0225   static const Standard_Utf32Char UTF16_SURROGATE_HIGH_END;
0226   static const Standard_Utf32Char UTF16_SURROGATE_LOW_START;
0227   static const Standard_Utf32Char UTF16_SURROGATE_LOW_END;
0228   static const Standard_Utf32Char UTF16_SURROGATE_HIGH_SHIFT;
0229   static const Standard_Utf32Char UTF16_SURROGATE_LOW_BASE;
0230   static const Standard_Utf32Char UTF16_SURROGATE_LOW_MASK;
0231   static const Standard_Utf32Char UTF32_MAX_BMP;
0232   static const Standard_Utf32Char UTF32_MAX_LEGAL;
0233
0234 private: //! @name private fields
0235
0236   const Type*        myPosition;  //!< buffer position of the first element in the current symbol
0237   const Type*        myPosNext;   //!< buffer position of the first element in the next symbol
0238   Standard_Integer   myCharIndex; //!< index displacement from iterator initialization
0239   Standard_Utf32Char myCharUtf32; //!< Unicode symbol stored at the current buffer position
0240
0241 };
0242
0243 typedef NCollection_UtfIterator<Standard_Utf8Char>  NCollection_Utf8Iter;
0244 typedef NCollection_UtfIterator<Standard_Utf16Char> NCollection_Utf16Iter;
0245 typedef NCollection_UtfIterator<Standard_Utf32Char> NCollection_Utf32Iter;
0246 typedef NCollection_UtfIterator<Standard_WideChar>  NCollection_UtfWideIter;
0247
0248 // template implementation
0249 #include "NCollection_UtfIterator.lxx"
0250
0251 #endif // NCollection_UtfIterator_HeaderFile