Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:42:46

0001 //
0002 // Copyright (c) 2015 Artyom Beilis (Tonkikh)
0003 // Copyright (c) 2020 Alexander Grund
0004 //
0005 // Distributed under the Boost Software License, Version 1.0.
0006 // https://www.boost.org/LICENSE_1_0.txt
0007 
0008 #ifndef BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
0009 #define BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
0010 
0011 #include <boost/nowide/replacement.hpp>
0012 #include <boost/nowide/utf/utf.hpp>
0013 #include <cassert>
0014 #include <cstdint>
0015 #include <locale>
0016 
0017 namespace boost {
0018 namespace nowide {
0019 
0020     static_assert(sizeof(std::mbstate_t) >= 2, "mbstate_t is to small to store an UTF-16 codepoint");
0021     namespace detail {
0022         // Avoid including cstring for std::memcpy
0023         inline void copy_uint16_t(void* dst, const void* src)
0024         {
0025             unsigned char* cdst = static_cast<unsigned char*>(dst);
0026             const unsigned char* csrc = static_cast<const unsigned char*>(src);
0027             cdst[0] = csrc[0];
0028             cdst[1] = csrc[1];
0029         }
0030         inline std::uint16_t read_state(const std::mbstate_t& src)
0031         {
0032             std::uint16_t dst;
0033             copy_uint16_t(&dst, &src);
0034             return dst;
0035         }
0036         inline void write_state(std::mbstate_t& dst, const std::uint16_t src)
0037         {
0038             copy_uint16_t(&dst, &src);
0039         }
0040     } // namespace detail
0041 
    /// std::codecvt implementation that converts between UTF-8 and UTF-16 or UTF-32
    ///
    /// @tparam CharType Wide character type; it is the internal type of the std::codecvt base class,
    ///                  the external type is always char
    /// @tparam CharSize Determines the encoding: 2 for UTF-16, 4 for UTF-32.
    ///                  Defaults to sizeof(CharType)
    ///
    /// Invalid sequences are replaced by #BOOST_NOWIDE_REPLACEMENT_CHARACTER
    /// A trailing incomplete sequence will result in a return value of std::codecvt::partial
    template<typename CharType, int CharSize = sizeof(CharType)>
    class utf8_codecvt;
0050 
0051     BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN
0052     /// Specialization for the UTF-8 <-> UTF-16 variant of the std::codecvt implementation
0053     template<typename CharType>
0054     class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 2> : public std::codecvt<CharType, char, std::mbstate_t>
0055     {
0056     public:
0057         static_assert(sizeof(CharType) >= 2, "CharType must be able to store UTF16 code point");
0058 
0059         utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
0060         {}
0061         BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END
0062 
0063     protected:
0064         using uchar = CharType;
0065 
0066         std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* /*to*/, char*& next) const override
0067         {
0068             if(detail::read_state(s) != 0)
0069                 return std::codecvt_base::error;
0070             next = from;
0071             return std::codecvt_base::ok;
0072         }
0073         int do_encoding() const noexcept override
0074         {
0075             return 0;
0076         }
0077         int do_max_length() const noexcept override
0078         {
0079             return 4;
0080         }
0081         bool do_always_noconv() const noexcept override
0082         {
0083             return false;
0084         }
0085 
0086         // LCOV_EXCL_START
0087         int do_length(std::mbstate_t& std_state, const char* from, const char* from_end, size_t max) const override
0088         {
0089             // LCOV_EXCL_STOP
0090             using utf16_traits = utf::utf_traits<uchar, 2>;
0091             std::uint16_t state = detail::read_state(std_state);
0092             const char* save_from = from;
0093             if(state && max > 0)
0094             {
0095                 max--;
0096                 state = 0;
0097             }
0098             while(max > 0 && from < from_end)
0099             {
0100                 const char* prev_from = from;
0101                 std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
0102                 if(ch == utf::illegal)
0103                 {
0104                     ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0105                 } else if(ch == utf::incomplete)
0106                 {
0107                     from = prev_from;
0108                     break;
0109                 }
0110                 // If we can't write the char, we have to save the low surrogate in state
0111                 if(BOOST_LIKELY(static_cast<size_t>(utf16_traits::width(ch)) <= max))
0112                 {
0113                     max -= utf16_traits::width(ch);
0114                 } else
0115                 {
0116                     static_assert(utf16_traits::max_width == 2, "Required for below");
0117                     std::uint16_t tmpOut[2]{};
0118                     utf16_traits::encode(ch, tmpOut);
0119                     state = tmpOut[1];
0120                     break;
0121                 }
0122             }
0123             detail::write_state(std_state, state);
0124             return static_cast<int>(from - save_from);
0125         }
0126 
0127         std::codecvt_base::result do_in(std::mbstate_t& std_state, // LCOV_EXCL_LINE
0128                                         const char* from,
0129                                         const char* from_end,
0130                                         const char*& from_next,
0131                                         uchar* to,
0132                                         uchar* to_end,
0133                                         uchar*& to_next) const override
0134         {
0135             std::codecvt_base::result r = std::codecvt_base::ok;
0136             using utf16_traits = utf::utf_traits<uchar, 2>;
0137 
0138             // mbstate_t is POD type and should be initialized to 0 (i.e. state = stateT())
0139             // according to standard.
0140             // We use it to store a low surrogate if it was not yet written, else state is 0
0141             std::uint16_t state = detail::read_state(std_state);
0142             // Write low surrogate if present
0143             if(state && to < to_end)
0144             {
0145                 *to++ = static_cast<CharType>(state);
0146                 state = 0;
0147             }
0148             while(to < to_end && from < from_end)
0149             {
0150                 const char* from_saved = from;
0151 
0152                 uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
0153 
0154                 if(ch == utf::illegal)
0155                 {
0156                     ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0157                 } else if(ch == utf::incomplete)
0158                 {
0159                     from = from_saved;
0160                     r = std::codecvt_base::partial;
0161                     break;
0162                 }
0163                 // If the encoded char fits, write directly, else safe the low surrogate in state
0164                 if(BOOST_LIKELY(utf16_traits::width(ch) <= to_end - to))
0165                 {
0166                     to = utf16_traits::encode(ch, to);
0167                 } else
0168                 {
0169                     static_assert(utf16_traits::max_width == 2, "Required for below");
0170                     std::uint16_t tmpOut[2]{};
0171                     utf16_traits::encode(ch, tmpOut);
0172                     *to++ = static_cast<CharType>(tmpOut[0]);
0173                     state = tmpOut[1];
0174                     break;
0175                 }
0176             }
0177             from_next = from;
0178             to_next = to;
0179             if(r == std::codecvt_base::ok && (from != from_end || state != 0))
0180                 r = std::codecvt_base::partial;
0181             detail::write_state(std_state, state);
0182             return r;
0183         }
0184 
0185         std::codecvt_base::result do_out(std::mbstate_t& std_state,
0186                                          const uchar* from,
0187                                          const uchar* from_end,
0188                                          const uchar*& from_next,
0189                                          char* to,
0190                                          char* to_end,
0191                                          char*& to_next) const override
0192         {
0193             std::codecvt_base::result r = std::codecvt_base::ok;
0194             using utf16_traits = utf::utf_traits<uchar, 2>;
0195             // mbstate_t is POD type and should be initialized to 0
0196             // (i.e. state = stateT()) according to standard.
0197             // We use it to store the first observed surrogate pair, or 0 if there is none yet
0198             std::uint16_t state = detail::read_state(std_state);
0199             for(; to < to_end && from < from_end; ++from)
0200             {
0201                 std::uint32_t ch = 0;
0202                 if(state != 0)
0203                 {
0204                     // We have a high surrogate, so now there should be a low surrogate
0205                     std::uint16_t w1 = state;
0206                     std::uint16_t w2 = *from;
0207                     if(BOOST_LIKELY(utf16_traits::is_trail(w2)))
0208                     {
0209                         ch = utf16_traits::combine_surrogate(w1, w2);
0210                     } else
0211                     {
0212                         ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0213                     }
0214                 } else
0215                 {
0216                     std::uint16_t w1 = *from;
0217                     if(BOOST_LIKELY(utf16_traits::is_single_codepoint(w1)))
0218                     {
0219                         ch = w1;
0220                     } else if(BOOST_LIKELY(utf16_traits::is_first_surrogate(w1)))
0221                     {
0222                         // Store into state and continue at next character
0223                         state = w1;
0224                         continue;
0225                     } else
0226                     {
0227                         // Neither a single codepoint nor a high surrogate so must be low surrogate.
0228                         // This is an error -> Replace character
0229                         ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0230                     }
0231                 }
0232                 assert(utf::is_valid_codepoint(ch)); // Any valid UTF16 sequence is a valid codepoint
0233                 int len = utf::utf_traits<char>::width(ch);
0234                 if(to_end - to < len)
0235                 {
0236                     r = std::codecvt_base::partial;
0237                     break;
0238                 }
0239                 to = utf::utf_traits<char>::encode(ch, to);
0240                 state = 0;
0241             }
0242             from_next = from;
0243             to_next = to;
0244             if(r == std::codecvt_base::ok && (from != from_end || state != 0))
0245                 r = std::codecvt_base::partial;
0246             detail::write_state(std_state, state);
0247             return r;
0248         }
0249     };
0250 
0251     BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN
0252     /// Specialization for the UTF-8 <-> UTF-32 variant of the std::codecvt implementation
0253     template<typename CharType>
0254     class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 4> : public std::codecvt<CharType, char, std::mbstate_t>
0255     {
0256     public:
0257         utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
0258         {}
0259         BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END
0260 
0261     protected:
0262         using uchar = CharType;
0263 
0264         std::codecvt_base::result
0265         do_unshift(std::mbstate_t& /*s*/, char* from, char* /*to*/, char*& next) const override
0266         {
0267             next = from;
0268             return std::codecvt_base::noconv;
0269         }
0270         int do_encoding() const noexcept override
0271         {
0272             return 0;
0273         }
0274         int do_max_length() const noexcept override
0275         {
0276             return 4;
0277         }
0278         bool do_always_noconv() const noexcept override
0279         {
0280             return false;
0281         }
0282 
0283         int do_length(std::mbstate_t& /*state*/, const char* from, const char* from_end, size_t max) const override
0284         {
0285             const char* start_from = from;
0286 
0287             while(max > 0 && from < from_end)
0288             {
0289                 const char* save_from = from;
0290                 std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
0291                 if(ch == utf::incomplete)
0292                 {
0293                     from = save_from;
0294                     break;
0295                 } else if(ch == utf::illegal)
0296                 {
0297                     ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0298                 }
0299                 max--;
0300             }
0301             return static_cast<int>(from - start_from);
0302         }
0303 
0304         std::codecvt_base::result do_in(std::mbstate_t& /*state*/,
0305                                         const char* from,
0306                                         const char* from_end,
0307                                         const char*& from_next,
0308                                         uchar* to,
0309                                         uchar* to_end,
0310                                         uchar*& to_next) const override
0311         {
0312             std::codecvt_base::result r = std::codecvt_base::ok;
0313 
0314             while(to < to_end && from < from_end)
0315             {
0316                 const char* from_saved = from;
0317 
0318                 uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
0319 
0320                 if(ch == utf::illegal)
0321                 {
0322                     ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0323                 } else if(ch == utf::incomplete)
0324                 {
0325                     r = std::codecvt_base::partial;
0326                     from = from_saved;
0327                     break;
0328                 }
0329                 *to++ = ch;
0330             }
0331             from_next = from;
0332             to_next = to;
0333             if(r == std::codecvt_base::ok && from != from_end)
0334                 r = std::codecvt_base::partial;
0335             return r;
0336         }
0337 
0338         std::codecvt_base::result do_out(std::mbstate_t& /*std_state*/,
0339                                          const uchar* from,
0340                                          const uchar* from_end,
0341                                          const uchar*& from_next,
0342                                          char* to,
0343                                          char* to_end,
0344                                          char*& to_next) const override
0345         {
0346             std::codecvt_base::result r = std::codecvt_base::ok;
0347             while(to < to_end && from < from_end)
0348             {
0349                 std::uint32_t ch = 0;
0350                 ch = *from;
0351                 if(!utf::is_valid_codepoint(ch))
0352                 {
0353                     ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0354                 }
0355                 int len = utf::utf_traits<char>::width(ch);
0356                 if(to_end - to < len)
0357                 {
0358                     r = std::codecvt_base::partial;
0359                     break;
0360                 }
0361                 to = utf::utf_traits<char>::encode(ch, to);
0362                 from++;
0363             }
0364             from_next = from;
0365             to_next = to;
0366             if(r == std::codecvt_base::ok && from != from_end)
0367                 r = std::codecvt_base::partial;
0368             return r;
0369         }
0370     };
0371 
0372 } // namespace nowide
0373 } // namespace boost
0374 
0375 #endif