Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:39:17

0001 //
0002 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
0003 //
0004 // Distributed under the Boost Software License, Version 1.0.
0005 // https://www.boost.org/LICENSE_1_0.txt
0006 
0007 #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
0008 #define BOOST_LOCALE_UTF_HPP_INCLUDED
0009 
0010 #include <boost/locale/config.hpp>
0011 #include <cstdint>
0012 
0013 namespace boost { namespace locale {
0014     /// \brief Namespace that holds basic operations on UTF encoded sequences
0015     ///
0016     /// All functions defined in this namespace do not require linking with Boost.Locale library
0017     namespace utf {
0018         /// \brief The integral type that can hold a Unicode code point
0019         using code_point = uint32_t;
0020 
0021         /// \brief Special constant that defines illegal code point
0022         constexpr code_point illegal = 0xFFFFFFFFu;
0023         /// \brief Special constant that defines incomplete code point
0024         constexpr code_point incomplete = 0xFFFFFFFEu;
0025 
0026         /// Either a length/size or an error (illegal/incomplete)
0027         using len_or_error = code_point;
0028 
0029         /// \brief the function checks if \a v is a valid code point
0030         inline bool is_valid_codepoint(code_point v)
0031         {
0032             if(v > 0x10FFFF)
0033                 return false;
0034             if(0xD800 <= v && v <= 0xDFFF) // surrogates
0035                 return false;
0036             return true;
0037         }
0038 
0039 #ifdef BOOST_LOCALE_DOXYGEN
0040 
0041         /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
0042         template<typename CharType, int size = sizeof(CharType)>
0043         struct utf_traits {
0044             /// The type of the character
0045             typedef CharType char_type;
0046 
0047             /// Read one code point from the range [p,e) and return it.
0048             ///
0049             /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
0050             /// - If illegal sequence detected returns \ref illegal
0051             ///
0052             /// Requirements
0053             ///
0054             /// - Iterator is valid input iterator
0055             ///
0056             /// Postconditions
0057             ///
0058             /// - p points to the last consumed character
0059             template<typename Iterator>
0060             static code_point decode(Iterator& p, Iterator e);
0061 
0062             /// Maximal width of valid sequence in the code units:
0063             ///
0064             /// - UTF-8  - 4
0065             /// - UTF-16 - 2
0066             /// - UTF-32 - 1
0067             static constexpr int max_width;
0068 
0069             /// The width of specific code point in the code units.
0070             ///
0071             /// Requirement: value is a valid Unicode code point
0072             /// Returns value in range [1..max_width]
0073             static int width(code_point value);
0074 
0075             /// Get the size of the trail part of variable length encoded sequence.
0076             ///
0077             /// Returns -1 if C is not valid lead character
0078             static int trail_length(char_type c);
0079             /// Returns true if c is trail code unit, always false for UTF-32
0080             static bool is_trail(char_type c);
0081             /// Returns true if c is lead code unit, always true of UTF-32
0082             static bool is_lead(char_type c);
0083 
0084             /// Convert valid Unicode code point \a value to the UTF sequence.
0085             ///
0086             /// Requirements:
0087             ///
0088             /// - \a value is valid code point
0089             /// - \a out is an output iterator should be able to accept at least width(value) units
0090             ///
0091             /// Returns the iterator past the last written code unit.
0092             template<typename Iterator>
0093             static Iterator encode(code_point value, Iterator out);
0094 
0095             /// Decodes valid UTF sequence that is pointed by p into code point.
0096             ///
0097             /// If the sequence is invalid or points to end the behavior is undefined
0098             template<typename Iterator>
0099             static code_point decode_valid(Iterator& p);
0100         };
0101 
0102 #else
0103 
0104         template<typename CharType, int size = sizeof(CharType)>
0105         struct utf_traits;
0106 
0107         template<typename CharType>
0108         struct utf_traits<CharType, 1> {
0109             typedef CharType char_type;
0110 
0111             static int trail_length(char_type ci)
0112             {
0113                 unsigned char c = ci;
0114                 if(c < 128)
0115                     return 0;
0116                 if(BOOST_UNLIKELY(c < 194))
0117                     return -1;
0118                 if(c < 224)
0119                     return 1;
0120                 if(c < 240)
0121                     return 2;
0122                 if(BOOST_LIKELY(c <= 244))
0123                     return 3;
0124                 return -1;
0125             }
0126 
0127             static constexpr int max_width = 4;
0128 
0129             static int width(code_point value)
0130             {
0131                 if(value <= 0x7F)
0132                     return 1;
0133                 else if(value <= 0x7FF)
0134                     return 2;
0135                 else if(BOOST_LIKELY(value <= 0xFFFF))
0136                     return 3;
0137                 else
0138                     return 4;
0139             }
0140 
0141             static bool is_trail(char_type ci)
0142             {
0143                 unsigned char c = ci;
0144                 return (c & 0xC0) == 0x80;
0145             }
0146 
0147             static bool is_lead(char_type ci) { return !is_trail(ci); }
0148 
0149             template<typename Iterator>
0150             static code_point decode(Iterator& p, Iterator e)
0151             {
0152                 if(BOOST_UNLIKELY(p == e))
0153                     return incomplete;
0154 
0155                 unsigned char lead = *p++;
0156 
0157                 // First byte is fully validated here
0158                 int trail_size = trail_length(lead);
0159 
0160                 if(BOOST_UNLIKELY(trail_size < 0))
0161                     return illegal;
0162 
0163                 // Ok as only ASCII may be of size = 0
0164                 // also optimize for ASCII text
0165                 if(trail_size == 0)
0166                     return lead;
0167 
0168                 code_point c = lead & ((1 << (6 - trail_size)) - 1);
0169 
0170                 // Read the rest
0171                 unsigned char tmp;
0172                 switch(trail_size) {
0173                     case 3:
0174                         if(BOOST_UNLIKELY(p == e))
0175                             return incomplete;
0176                         tmp = *p++;
0177                         if(!is_trail(tmp))
0178                             return illegal;
0179                         c = (c << 6) | (tmp & 0x3F);
0180                         BOOST_FALLTHROUGH;
0181                     case 2:
0182                         if(BOOST_UNLIKELY(p == e))
0183                             return incomplete;
0184                         tmp = *p++;
0185                         if(!is_trail(tmp))
0186                             return illegal;
0187                         c = (c << 6) | (tmp & 0x3F);
0188                         BOOST_FALLTHROUGH;
0189                     case 1:
0190                         if(BOOST_UNLIKELY(p == e))
0191                             return incomplete;
0192                         tmp = *p++;
0193                         if(!is_trail(tmp))
0194                             return illegal;
0195                         c = (c << 6) | (tmp & 0x3F);
0196                 }
0197 
0198                 // Check code point validity: no surrogates and
0199                 // valid range
0200                 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
0201                     return illegal;
0202 
0203                 // make sure it is the most compact representation
0204                 if(BOOST_UNLIKELY(width(c) != trail_size + 1))
0205                     return illegal;
0206 
0207                 return c;
0208             }
0209 
0210             template<typename Iterator>
0211             static code_point decode_valid(Iterator& p)
0212             {
0213                 unsigned char lead = *p++;
0214                 if(lead < 192)
0215                     return lead;
0216 
0217                 int trail_size;
0218 
0219                 if(lead < 224)
0220                     trail_size = 1;
0221                 else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
0222                     trail_size = 2;
0223                 else
0224                     trail_size = 3;
0225 
0226                 code_point c = lead & ((1 << (6 - trail_size)) - 1);
0227 
0228                 switch(trail_size) {
0229                     case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH;
0230                     case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH;
0231                     case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
0232                 }
0233 
0234                 return c;
0235             }
0236 
0237             template<typename Iterator>
0238             static Iterator encode(code_point value, Iterator out)
0239             {
0240                 if(value <= 0x7F)
0241                     *out++ = static_cast<char_type>(value);
0242                 else if(value <= 0x7FF) {
0243                     *out++ = static_cast<char_type>((value >> 6) | 0xC0);
0244                     *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
0245                 } else if(BOOST_LIKELY(value <= 0xFFFF)) {
0246                     *out++ = static_cast<char_type>((value >> 12) | 0xE0);
0247                     *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
0248                     *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
0249                 } else {
0250                     *out++ = static_cast<char_type>((value >> 18) | 0xF0);
0251                     *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
0252                     *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
0253                     *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
0254                 }
0255                 return out;
0256             }
0257         }; // utf8
0258 
0259         template<typename CharType>
0260         struct utf_traits<CharType, 2> {
0261             typedef CharType char_type;
0262 
0263             // See RFC 2781
0264             static bool is_first_surrogate(uint16_t x) { return 0xD800 <= x && x <= 0xDBFF; }
0265             static bool is_second_surrogate(uint16_t x) { return 0xDC00 <= x && x <= 0xDFFF; }
0266             static code_point combine_surrogate(uint16_t w1, uint16_t w2)
0267             {
0268                 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
0269             }
0270             static int trail_length(char_type c)
0271             {
0272                 if(is_first_surrogate(c))
0273                     return 1;
0274                 if(is_second_surrogate(c))
0275                     return -1;
0276                 return 0;
0277             }
0278 
0279             /// Returns true if c is trail code unit, always false for UTF-32
0280             static bool is_trail(char_type c) { return is_second_surrogate(c); }
0281             /// Returns true if c is lead code unit, always true of UTF-32
0282             static bool is_lead(char_type c) { return !is_second_surrogate(c); }
0283 
0284             template<typename It>
0285             static code_point decode(It& current, It last)
0286             {
0287                 if(BOOST_UNLIKELY(current == last))
0288                     return incomplete;
0289                 uint16_t w1 = *current++;
0290                 if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
0291                     return w1;
0292                 if(w1 > 0xDBFF)
0293                     return illegal;
0294                 if(current == last)
0295                     return incomplete;
0296                 uint16_t w2 = *current++;
0297                 if(w2 < 0xDC00 || 0xDFFF < w2)
0298                     return illegal;
0299                 return combine_surrogate(w1, w2);
0300             }
0301             template<typename It>
0302             static code_point decode_valid(It& current)
0303             {
0304                 uint16_t w1 = *current++;
0305                 if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
0306                     return w1;
0307                 uint16_t w2 = *current++;
0308                 return combine_surrogate(w1, w2);
0309             }
0310 
0311             static constexpr int max_width = 2;
0312             static int width(code_point u) { return u >= 0x10000 ? 2 : 1; }
0313             template<typename It>
0314             static It encode(code_point u, It out)
0315             {
0316                 if(BOOST_LIKELY(u <= 0xFFFF))
0317                     *out++ = static_cast<char_type>(u);
0318                 else {
0319                     u -= 0x10000;
0320                     *out++ = static_cast<char_type>(0xD800 | (u >> 10));
0321                     *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
0322                 }
0323                 return out;
0324             }
0325         }; // utf16;
0326 
0327         template<typename CharType>
0328         struct utf_traits<CharType, 4> {
0329             typedef CharType char_type;
0330             static int trail_length(char_type c)
0331             {
0332                 if(is_valid_codepoint(c))
0333                     return 0;
0334                 return -1;
0335             }
0336             static bool is_trail(char_type /*c*/) { return false; }
0337             static bool is_lead(char_type /*c*/) { return true; }
0338 
0339             template<typename It>
0340             static code_point decode_valid(It& current)
0341             {
0342                 return *current++;
0343             }
0344 
0345             template<typename It>
0346             static code_point decode(It& current, It last)
0347             {
0348                 if(BOOST_UNLIKELY(current == last))
0349                     return boost::locale::utf::incomplete;
0350                 code_point c = *current++;
0351                 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
0352                     return boost::locale::utf::illegal;
0353                 return c;
0354             }
0355             static constexpr int max_width = 1;
0356             static int width(code_point /*u*/) { return 1; }
0357             template<typename It>
0358             static It encode(code_point u, It out)
0359             {
0360                 *out++ = static_cast<char_type>(u);
0361                 return out;
0362             }
0363 
0364         }; // utf32
0365 
0366 #endif
0367 
0368     } // namespace utf
0369 }}    // namespace boost::locale
0370 
0371 #endif