include/opencascade/NCollection_UtfIterator.lxx

0001 // Created on: 2013-01-28
0002 // Created by: Kirill GAVRILOV
0003 // Copyright (c) 2013-2014 OPEN CASCADE SAS
0004 //
0005 // This file is part of Open CASCADE Technology software library.
0006 //
0007 // This library is free software; you can redistribute it and/or modify it under
0008 // the terms of the GNU Lesser General Public License version 2.1 as published
0009 // by the Free Software Foundation, with special exception defined in the file
0010 // OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
0011 // distribution for complete text of the license and disclaimer of any warranty.
0012 //
0013 // Alternatively, this file may be used under the terms of Open CASCADE
0014 // commercial license or contractual agreement.
0015
0016 // Portions of code are copyrighted by Unicode, Inc.
0017 //
0018 // Copyright (c) 2001-2004 Unicode, Inc.
0019 //
0020 // Disclaimer
0021 //
0022 // This source code is provided as is by Unicode, Inc. No claims are
0023 // made as to fitness for any particular purpose. No warranties of any
0024 // kind are expressed or implied. The recipient agrees to determine
0025 // applicability of information provided. If this file has been
0026 // purchased on magnetic or optical media from Unicode, Inc., the
0027 // sole remedy for any claim will be exchange of defective media
0028 // within 90 days of receipt.
0029 //
0030 // Limitations on Rights to Redistribute This Code
0031 //
0032 // Unicode, Inc. hereby grants the right to freely use the information
0033 // supplied in this file in the creation of products supporting the
0034 // Unicode Standard, and to make copies of this file in any form
0035 // for internal or external distribution as long as this notice
0036 // remains attached.
0037
0038 //! The first character in a UTF-8 sequence indicates how many bytes
0039 //! to read (among other things).
0040 template<typename Type>
0041 const unsigned char NCollection_UtfIterator<Type>::UTF8_BYTES_MINUS_ONE[256] =
0042 {
0043   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0044   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0045   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0046   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0047   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0048   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0049   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0050   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
0051 };
0052
0053 //! Magic values subtracted from a buffer value during UTF-8 conversion.
0054 //! This table contains as many values as there might be trailing bytes
0055 //! in a UTF-8 sequence.
0056 template<typename Type>
0057 const Standard_Utf32Char NCollection_UtfIterator<Type>::offsetsFromUTF8[6] =
0058 {
0059   0x00000000UL, 0x00003080UL, 0x000E2080UL,
0060   0x03C82080UL, 0xFA082080UL, 0x82082080UL
0061 };
0062
0063 //! The first character in a UTF-8 sequence indicates how many bytes to read.
0064 template<typename Type>
0065 const unsigned char NCollection_UtfIterator<Type>::UTF8_FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
0066
0067 // =======================================================================
0068 // function : readUTF8
0069 // purpose  : Get a UTF-8 character; leave the tracking pointer at the start of the next character.
0070 //            Not protected against invalid UTF-8.
0071 // =======================================================================
0072 template<typename Type>
0073 inline void NCollection_UtfIterator<Type>::readUTF8()
0074 {
0075   // unsigned arithmetic used
0076   Standard_Utf8UChar* aPos = (Standard_Utf8UChar* )myPosNext;
0077   const unsigned char aBytesToRead = UTF8_BYTES_MINUS_ONE[*aPos];
0078   myCharUtf32 = 0;
0079   switch (aBytesToRead)
0080   {
0081     case 5: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8
0082       Standard_FALLTHROUGH
0083     case 4: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8
0084       Standard_FALLTHROUGH
0085     case 3: myCharUtf32 += *aPos++; myCharUtf32 <<= 6;
0086       Standard_FALLTHROUGH
0087     case 2: myCharUtf32 += *aPos++; myCharUtf32 <<= 6;
0088       Standard_FALLTHROUGH
0089     case 1: myCharUtf32 += *aPos++; myCharUtf32 <<= 6;
0090       Standard_FALLTHROUGH
0091     case 0: myCharUtf32 += *aPos++;
0092   }
0093   myCharUtf32 -= offsetsFromUTF8[aBytesToRead];
0094   myPosNext = (Type* )aPos;
0095 }
0096
0097 // magic numbers
0098 template<typename Type> const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF8_BYTE_MASK = 0xBF;
0099 template<typename Type> const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF8_BYTE_MARK = 0x80;
0100 template<typename Type> const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_START = 0xD800;
0101 template<typename Type> const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_END   = 0xDBFF;
0102 template<typename Type> const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_START  = 0xDC00;
0103 template<typename Type> const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_END    = 0xDFFF;
0104 template<typename Type> const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_SHIFT = 10;
0105 template<typename Type> const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_BASE   = 0x0010000UL;
0106 template<typename Type> const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_MASK   = 0x3FFUL;
0107 template<typename Type> const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF32_MAX_BMP   = 0x0000FFFFUL;
0108 template<typename Type> const Standard_Utf32Char NCollection_UtfIterator<Type>::UTF32_MAX_LEGAL = 0x0010FFFFUL;
0109
0110 // =======================================================================
0111 // function : readUTF16
0112 // purpose  :
0113 // =======================================================================
0114 template<typename Type> inline
0115 void NCollection_UtfIterator<Type>::readUTF16()
0116 {
0117   Standard_Utf32Char aChar = *myPosNext++;
0118   // if we have the first half of the surrogate pair
0119   if (aChar >= UTF16_SURROGATE_HIGH_START
0120    && aChar <= UTF16_SURROGATE_HIGH_END)
0121   {
0122     const Standard_Utf32Char aChar2 = *myPosNext;
0123     // complete the surrogate pair
0124     if (aChar2 >= UTF16_SURROGATE_LOW_START
0125      && aChar2 <= UTF16_SURROGATE_LOW_END)
0126     {
0127       aChar = ((aChar - UTF16_SURROGATE_HIGH_START) << UTF16_SURROGATE_HIGH_SHIFT)
0128             + (aChar2 - UTF16_SURROGATE_LOW_START)   + UTF16_SURROGATE_LOW_BASE;
0129       ++myPosNext;
0130     }
0131   }
0132   myCharUtf32 = aChar;
0133 }
0134
0135 // =======================================================================
0136 // function : AdvanceBytesUtf8
0137 // purpose  :
0138 // =======================================================================
0139 template<typename Type> inline
0140 Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf8() const
0141 {
0142   if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
0143    && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
0144    {
0145     // UTF-16 surrogate values are illegal in UTF-32
0146     return 0;
0147   }
0148   else if (myCharUtf32 < Standard_Utf32Char(0x80))
0149   {
0150     return 1;
0151   }
0152   else if (myCharUtf32 < Standard_Utf32Char(0x800))
0153   {
0154     return 2;
0155   }
0156   else if (myCharUtf32 < Standard_Utf32Char(0x10000))
0157   {
0158     return 3;
0159   }
0160   else if (myCharUtf32 <= UTF32_MAX_LEGAL)
0161   {
0162     return 4;
0163   }
0164   else
0165   {
0166     // illegal
0167     return 0;
0168   }
0169 }
0170
0171 // =======================================================================
0172 // function : GetUtf8
0173 // purpose  :
0174 // =======================================================================
0175 template<typename Type> inline
0176 Standard_Utf8Char* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8Char* theBuffer) const
0177 {
0178   // unsigned arithmetic used
0179   return (Standard_Utf8Char* )GetUtf8 ((Standard_Utf8UChar* )theBuffer);
0180 }
0181
0182 // =======================================================================
0183 // function : GetUtf8
0184 // purpose  :
0185 // =======================================================================
0186 template<typename Type> inline
0187 Standard_Utf8UChar* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8UChar* theBuffer) const
0188 {
0189   Standard_Utf32Char aChar = myCharUtf32;
0190   if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
0191    && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
0192   {
0193     // UTF-16 surrogate values are illegal in UTF-32
0194     return theBuffer;
0195   }
0196   else if (myCharUtf32 < Standard_Utf32Char(0x80))
0197   {
0198     *theBuffer++ = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[1]);
0199     return theBuffer;
0200   }
0201   else if (myCharUtf32 < Standard_Utf32Char(0x800))
0202   {
0203     *++theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
0204     *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[2]);
0205     return theBuffer + 2;
0206   }
0207   else if (myCharUtf32 < Standard_Utf32Char(0x10000))
0208   {
0209     theBuffer += 3;
0210     *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
0211     *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
0212     *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[3]);
0213     return theBuffer + 3;
0214   }
0215   else if (myCharUtf32 <= UTF32_MAX_LEGAL)
0216   {
0217     theBuffer += 4;
0218     *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
0219     *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
0220     *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
0221     *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[4]);
0222     return theBuffer + 4;
0223   }
0224   else
0225   {
0226     // illegal
0227     return theBuffer;
0228   }
0229 }
0230
0231 // =======================================================================
0232 // function : AdvanceBytesUtf16
0233 // purpose  :
0234 // =======================================================================
0235 template<typename Type> inline
0236 Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf16() const
0237 {
0238   return AdvanceCodeUnitsUtf16() * sizeof(Standard_Utf16Char);
0239 }
0240
0241 // =======================================================================
0242 // function : AdvanceCodeUnitsUtf16
0243 // purpose  :
0244 // =======================================================================
0245 template<typename Type> inline
0246 Standard_Integer NCollection_UtfIterator<Type>::AdvanceCodeUnitsUtf16() const
0247 {
0248   if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF
0249   {
0250     // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values
0251     if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
0252      && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
0253     {
0254       return 0;
0255     }
0256     else
0257     {
0258       return 1;
0259     }
0260   }
0261   else if (myCharUtf32 > UTF32_MAX_LEGAL)
0262   {
0263     // illegal
0264     return 0;
0265   }
0266   else
0267   {
0268     // target is a character in range 0xFFFF - 0x10FFFF
0269     // surrogate pair
0270     return 2;
0271   }
0272 }
0273
0274 // =======================================================================
0275 // function : GetUtf16
0276 // purpose  :
0277 // =======================================================================
0278 template<typename Type> inline
0279 Standard_Utf16Char* NCollection_UtfIterator<Type>::GetUtf16 (Standard_Utf16Char* theBuffer) const
0280 {
0281   if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF
0282   {
0283     // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values
0284     if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
0285      && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
0286     {
0287       return theBuffer;
0288     }
0289     else
0290     {
0291       *theBuffer++ = Standard_Utf16Char(myCharUtf32);
0292       return theBuffer;
0293     }
0294   }
0295   else if (myCharUtf32 > UTF32_MAX_LEGAL)
0296   {
0297     // illegal
0298     return theBuffer;
0299   }
0300   else
0301   {
0302     // surrogate pair
0303     Standard_Utf32Char aChar = myCharUtf32 - UTF16_SURROGATE_LOW_BASE;
0304     *theBuffer++ = Standard_Utf16Char((aChar >> UTF16_SURROGATE_HIGH_SHIFT) + UTF16_SURROGATE_HIGH_START);
0305     *theBuffer++ = Standard_Utf16Char((aChar &  UTF16_SURROGATE_LOW_MASK)   + UTF16_SURROGATE_LOW_START);
0306     return theBuffer;
0307   }
0308 }
0309
0310 // =======================================================================
0311 // function : GetUtf32
0312 // purpose  :
0313 // =======================================================================
0314 template<typename Type> inline
0315 Standard_Utf32Char* NCollection_UtfIterator<Type>::GetUtf32 (Standard_Utf32Char* theBuffer) const
0316 {
0317   *theBuffer++ = myCharUtf32;
0318   return theBuffer;
0319 }