Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:42:45

0001 //
0002 // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
0003 // Copyright (c) 2020 Alexander Grund
0004 //
0005 // Distributed under the Boost Software License, Version 1.0.
0006 // https://www.boost.org/LICENSE_1_0.txt
0007 
0008 #ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED
0009 #define BOOST_NOWIDE_UTF_HPP_INCLUDED
0010 
0011 #include <boost/nowide/config.hpp>
0012 #include <cstdint>
0013 
0014 namespace boost {
0015 namespace nowide {
0016     ///
0017     /// \brief Namespace that holds basic operations on UTF encoded sequences
0018     ///
0019     /// All functions defined in this namespace do not require linking with Boost.Nowide library.
0020     /// Extracted from Boost.Locale
0021     ///
0022     namespace utf {
0023 
///
/// \brief The integral type that can hold a Unicode code point
///
/// Besides real code points it also carries the two sentinel values
/// \ref illegal and \ref incomplete returned by the decoders.
/// Spelled std::uint32_t because <cstdint> only guarantees the name inside namespace std.
///
using code_point = std::uint32_t;

///
/// \brief Special constant that marks an illegal (invalid) sequence / code point
///
/// The value is above 0x10FFFF, the largest value accepted as a code point,
/// so it cannot collide with a successfully decoded code point.
///
constexpr code_point illegal = 0xFFFFFFFFu;

///
/// \brief Special constant that marks an incomplete sequence (input ended too early)
///
/// The value is above 0x10FFFF as well and differs from \ref illegal.
///
constexpr code_point incomplete = 0xFFFFFFFEu;
0038 
0039         ///
0040         /// \brief the function checks if \a v is a valid code point
0041         ///
0042         inline bool is_valid_codepoint(code_point v)
0043         {
0044             if(v > 0x10FFFF)
0045                 return false;
0046             if(0xD800 <= v && v <= 0xDFFF) // surrogates
0047                 return false;
0048             return true;
0049         }
0050 
0051 #ifdef BOOST_NOWIDE_DOXYGEN
0052         ///
0053         /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
0054         ///
0055         template<typename CharType, int size = sizeof(CharType)>
0056         struct utf_traits
0057         {
0058             ///
0059             /// The type of the character
0060             ///
0061             using char_type = CharType;
0062             ///
0063             /// Read one code point from the range [p,e) and return it.
0064             ///
0065             /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
0066             /// - If illegal sequence detected returns \ref illegal
0067             ///
0068             /// Requirements
0069             ///
0070             /// - Iterator is valid input iterator
0071             ///
0072             /// Postconditions
0073             ///
0074             /// - p points to the last consumed character
0075             ///
0076             template<typename Iterator>
0077             static code_point decode(Iterator& p, Iterator e);
0078 
0079             ///
0080             /// Maximal width of valid sequence in the code units:
0081             ///
0082             /// - UTF-8  - 4
0083             /// - UTF-16 - 2
0084             /// - UTF-32 - 1
0085             ///
0086             static const int max_width;
0087             ///
0088             /// The width of specific code point in the code units.
0089             ///
0090             /// Requirement: value is a valid Unicode code point
0091             /// Returns value in range [1..max_width]
0092             ///
0093             static int width(code_point value);
0094 
0095             ///
0096             /// Get the size of the trail part of variable length encoded sequence.
0097             ///
0098             /// Returns -1 if C is not valid lead character
0099             ///
0100             static int trail_length(char_type c);
0101             ///
0102             /// Returns true if c is trail code unit, always false for UTF-32
0103             ///
0104             static bool is_trail(char_type c);
0105             ///
0106             /// Returns true if c is lead code unit, always true of UTF-32
0107             ///
0108             static bool is_lead(char_type c);
0109 
0110             ///
0111             /// Convert valid Unicode code point \a value to the UTF sequence.
0112             ///
0113             /// Requirements:
0114             ///
0115             /// - \a value is valid code point
0116             /// - \a out is an output iterator should be able to accept at least width(value) units
0117             ///
0118             /// Returns the iterator past the last written code unit.
0119             ///
0120             template<typename Iterator>
0121             static Iterator encode(code_point value, Iterator out);
0122             ///
0123             /// Decodes valid UTF sequence that is pointed by p into code point.
0124             ///
0125             /// If the sequence is invalid or points to end the behavior is undefined
0126             ///
0127             template<typename Iterator>
0128             static code_point decode_valid(Iterator& p);
0129         };
0130 
0131 #else
0132 
0133         template<typename CharType, int size = sizeof(CharType)>
0134         struct utf_traits;
0135 
0136         template<typename CharType>
0137         struct utf_traits<CharType, 1>
0138         {
0139             using char_type = CharType;
0140 
0141             static int trail_length(char_type ci)
0142             {
0143                 unsigned char c = ci;
0144                 if(c < 128)
0145                     return 0;
0146                 if(BOOST_UNLIKELY(c < 194))
0147                     return -1;
0148                 if(c < 224)
0149                     return 1;
0150                 if(c < 240)
0151                     return 2;
0152                 if(BOOST_LIKELY(c <= 244))
0153                     return 3;
0154                 return -1;
0155             }
0156 
0157             static const int max_width = 4;
0158 
0159             static int width(code_point value)
0160             {
0161                 if(value <= 0x7F)
0162                 {
0163                     return 1;
0164                 } else if(value <= 0x7FF)
0165                 {
0166                     return 2;
0167                 } else if(BOOST_LIKELY(value <= 0xFFFF))
0168                 {
0169                     return 3;
0170                 } else
0171                 {
0172                     return 4;
0173                 }
0174             }
0175 
0176             static bool is_trail(char_type ci)
0177             {
0178                 unsigned char c = ci;
0179                 return (c & 0xC0) == 0x80;
0180             }
0181 
0182             static bool is_lead(char_type ci)
0183             {
0184                 return !is_trail(ci);
0185             }
0186 
0187             template<typename Iterator>
0188             static code_point decode(Iterator& p, Iterator e)
0189             {
0190                 if(BOOST_UNLIKELY(p == e))
0191                     return incomplete;
0192 
0193                 unsigned char lead = *p++;
0194 
0195                 // First byte is fully validated here
0196                 int trail_size = trail_length(lead);
0197 
0198                 if(BOOST_UNLIKELY(trail_size < 0))
0199                     return illegal;
0200 
0201                 // OK as only ASCII may be of size = 0
0202                 // also optimize for ASCII text
0203                 if(trail_size == 0)
0204                     return lead;
0205 
0206                 code_point c = lead & ((1 << (6 - trail_size)) - 1);
0207 
0208                 // Read the rest
0209                 unsigned char tmp;
0210                 switch(trail_size)
0211                 {
0212                 case 3:
0213                     if(BOOST_UNLIKELY(p == e))
0214                         return incomplete;
0215                     tmp = *p++;
0216                     if(!is_trail(tmp))
0217                         return illegal;
0218                     c = (c << 6) | (tmp & 0x3F);
0219                     BOOST_NOWIDE_FALLTHROUGH;
0220                 case 2:
0221                     if(BOOST_UNLIKELY(p == e))
0222                         return incomplete;
0223                     tmp = *p++;
0224                     if(!is_trail(tmp))
0225                         return illegal;
0226                     c = (c << 6) | (tmp & 0x3F);
0227                     BOOST_NOWIDE_FALLTHROUGH;
0228                 case 1:
0229                     if(BOOST_UNLIKELY(p == e))
0230                         return incomplete;
0231                     tmp = *p++;
0232                     if(!is_trail(tmp))
0233                         return illegal;
0234                     c = (c << 6) | (tmp & 0x3F);
0235                 }
0236 
0237                 // Check code point validity:
0238                 // - no surrogates and valid range
0239                 // - most compact representation
0240                 if(BOOST_UNLIKELY(!is_valid_codepoint(c)) || BOOST_UNLIKELY(width(c) != trail_size + 1))
0241                 {
0242                     p -= trail_size;
0243                     return illegal;
0244                 }
0245 
0246                 return c;
0247             }
0248 
0249             template<typename Iterator>
0250             static code_point decode_valid(Iterator& p)
0251             {
0252                 unsigned char lead = *p++;
0253                 if(lead < 192)
0254                     return lead;
0255 
0256                 int trail_size;
0257 
0258                 if(lead < 224)
0259                     trail_size = 1;
0260                 else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
0261                     trail_size = 2;
0262                 else
0263                     trail_size = 3;
0264 
0265                 code_point c = lead & ((1 << (6 - trail_size)) - 1);
0266 
0267                 switch(trail_size)
0268                 {
0269                 case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
0270                 case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
0271                 case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
0272                 }
0273 
0274                 return c;
0275             }
0276 
0277             template<typename Iterator>
0278             static Iterator encode(code_point value, Iterator out)
0279             {
0280                 if(value <= 0x7F)
0281                 {
0282                     *out++ = static_cast<char_type>(value);
0283                 } else if(value <= 0x7FF)
0284                 {
0285                     *out++ = static_cast<char_type>((value >> 6) | 0xC0);
0286                     *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
0287                 } else if(BOOST_LIKELY(value <= 0xFFFF))
0288                 {
0289                     *out++ = static_cast<char_type>((value >> 12) | 0xE0);
0290                     *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
0291                     *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
0292                 } else
0293                 {
0294                     *out++ = static_cast<char_type>((value >> 18) | 0xF0);
0295                     *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
0296                     *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
0297                     *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
0298                 }
0299                 return out;
0300             }
0301         }; // utf8
0302 
0303         template<typename CharType>
0304         struct utf_traits<CharType, 2>
0305         {
0306             using char_type = CharType;
0307 
0308             // See RFC 2781
0309             static bool is_single_codepoint(uint16_t x)
0310             {
0311                 // Ranges [U+0000, 0+D7FF], [U+E000, U+FFFF] are numerically equal in UTF-16
0312                 return x <= 0xD7FF || x >= 0xE000;
0313             }
0314             static bool is_first_surrogate(uint16_t x)
0315             {
0316                 // Range [U+D800, 0+DBFF]: High surrogate
0317                 return 0xD800 <= x && x <= 0xDBFF;
0318             }
0319             static bool is_second_surrogate(uint16_t x)
0320             {
0321                 // Range [U+DC00, 0+DFFF]: Low surrogate
0322                 return 0xDC00 <= x && x <= 0xDFFF;
0323             }
0324             static code_point combine_surrogate(uint16_t w1, uint16_t w2)
0325             {
0326                 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
0327             }
0328             static int trail_length(char_type c)
0329             {
0330                 if(is_first_surrogate(c))
0331                     return 1;
0332                 if(is_second_surrogate(c))
0333                     return -1;
0334                 return 0;
0335             }
0336             /// Return true if c is trail code unit, always false for UTF-32
0337             static bool is_trail(char_type c)
0338             {
0339                 return is_second_surrogate(c);
0340             }
0341             /// Return true if c is lead code unit, always true of UTF-32
0342             static bool is_lead(char_type c)
0343             {
0344                 return !is_second_surrogate(c);
0345             }
0346 
0347             template<typename It>
0348             static code_point decode(It& current, It last)
0349             {
0350                 if(BOOST_UNLIKELY(current == last))
0351                     return incomplete;
0352                 uint16_t w1 = *current++;
0353                 if(BOOST_LIKELY(is_single_codepoint(w1)))
0354                 {
0355                     return w1;
0356                 }
0357                 // Now it's either a high or a low surrogate, the latter is invalid
0358                 if(w1 >= 0xDC00)
0359                     return illegal;
0360                 if(current == last)
0361                     return incomplete;
0362                 uint16_t w2 = *current++;
0363                 if(!is_second_surrogate(w2))
0364                     return illegal;
0365                 return combine_surrogate(w1, w2);
0366             }
0367             template<typename It>
0368             static code_point decode_valid(It& current)
0369             {
0370                 uint16_t w1 = *current++;
0371                 if(BOOST_LIKELY(is_single_codepoint(w1)))
0372                 {
0373                     return w1;
0374                 }
0375                 uint16_t w2 = *current++;
0376                 return combine_surrogate(w1, w2);
0377             }
0378 
0379             static const int max_width = 2;
0380             static int width(code_point u) // LCOV_EXCL_LINE
0381             {
0382                 return u >= 0x10000 ? 2 : 1;
0383             }
0384             template<typename It>
0385             static It encode(code_point u, It out)
0386             {
0387                 if(BOOST_LIKELY(u <= 0xFFFF))
0388                 {
0389                     *out++ = static_cast<char_type>(u);
0390                 } else
0391                 {
0392                     u -= 0x10000;
0393                     *out++ = static_cast<char_type>(0xD800 | (u >> 10));
0394                     *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
0395                 }
0396                 return out;
0397             }
0398         }; // utf16;
0399 
0400         template<typename CharType>
0401         struct utf_traits<CharType, 4>
0402         {
0403             using char_type = CharType;
0404             static int trail_length(char_type c)
0405             {
0406                 if(is_valid_codepoint(c))
0407                     return 0;
0408                 return -1;
0409             }
0410             static bool is_trail(char_type /*c*/)
0411             {
0412                 return false;
0413             }
0414             static bool is_lead(char_type /*c*/)
0415             {
0416                 return true;
0417             }
0418 
0419             template<typename It>
0420             static code_point decode_valid(It& current)
0421             {
0422                 return *current++;
0423             }
0424 
0425             template<typename It>
0426             static code_point decode(It& current, It last)
0427             {
0428                 if(BOOST_UNLIKELY(current == last))
0429                     return incomplete;
0430                 code_point c = *current++;
0431                 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
0432                     return illegal;
0433                 return c;
0434             }
0435             static const int max_width = 1;
0436             static int width(code_point /*u*/)
0437             {
0438                 return 1;
0439             }
0440             template<typename It>
0441             static It encode(code_point u, It out)
0442             {
0443                 *out++ = static_cast<char_type>(u);
0444                 return out;
0445             }
0446         }; // utf32
0447 
0448 #endif
0449 
0450     } // namespace utf
0451 } // namespace nowide
0452 } // namespace boost
0453 
0454 #endif