File indexing completed on 2025-01-18 09:42:46
0001
0002
0003
0004
0005
0006
0007
0008 #ifndef BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
0009 #define BOOST_NOWIDE_UTF8_CODECVT_HPP_INCLUDED
0010
0011 #include <boost/nowide/replacement.hpp>
0012 #include <boost/nowide/utf/utf.hpp>
0013 #include <cassert>
0014 #include <cstdint>
0015 #include <locale>
0016
0017 namespace boost {
0018 namespace nowide {
0019
// The UTF-16 conversion keeps one pending 16-bit code unit in the first two bytes of std::mbstate_t
// (see detail::read_state / detail::write_state), so the state type must provide at least two bytes.
static_assert(sizeof(std::mbstate_t) >= 2, "mbstate_t is too small to store a UTF-16 code unit");
namespace detail {

    /// Copy exactly two bytes from `src` to `dst`.
    /// Works byte-wise through `unsigned char`, so neither pointer has to point at a std::uint16_t object.
    inline void copy_uint16_t(void* dst, const void* src)
    {
        const auto* in = static_cast<const unsigned char*>(src);
        auto* out = static_cast<unsigned char*>(dst);
        for(int i = 0; i < 2; ++i)
            out[i] = in[i];
    }

    /// Return the 16-bit value held in the first two bytes of the conversion state `src`.
    inline std::uint16_t read_state(const std::mbstate_t& src)
    {
        std::uint16_t value;
        copy_uint16_t(&value, &src);
        return value;
    }

    /// Store the 16-bit value `src` into the first two bytes of the conversion state `dst`.
    inline void write_state(std::mbstate_t& dst, const std::uint16_t src)
    {
        copy_uint16_t(&dst, &src);
    }

} // namespace detail
0041
0042
0043
0044
0045
0046
0047
/// Primary template of the UTF-8 codecvt facet; it is only declared here.
/// `CharSize` defaults to `sizeof(CharType)` and is the parameter the partial specializations are keyed on.
template<typename CharType, int CharSize = sizeof(CharType)> class utf8_codecvt;
0050
0051 BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN
0052
0053 template<typename CharType>
0054 class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 2> : public std::codecvt<CharType, char, std::mbstate_t>
0055 {
0056 public:
0057 static_assert(sizeof(CharType) >= 2, "CharType must be able to store UTF16 code point");
0058
0059 utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
0060 {}
0061 BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END
0062
0063 protected:
0064 using uchar = CharType;
0065
0066 std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* , char*& next) const override
0067 {
0068 if(detail::read_state(s) != 0)
0069 return std::codecvt_base::error;
0070 next = from;
0071 return std::codecvt_base::ok;
0072 }
0073 int do_encoding() const noexcept override
0074 {
0075 return 0;
0076 }
0077 int do_max_length() const noexcept override
0078 {
0079 return 4;
0080 }
0081 bool do_always_noconv() const noexcept override
0082 {
0083 return false;
0084 }
0085
0086
0087 int do_length(std::mbstate_t& std_state, const char* from, const char* from_end, size_t max) const override
0088 {
0089
0090 using utf16_traits = utf::utf_traits<uchar, 2>;
0091 std::uint16_t state = detail::read_state(std_state);
0092 const char* save_from = from;
0093 if(state && max > 0)
0094 {
0095 max--;
0096 state = 0;
0097 }
0098 while(max > 0 && from < from_end)
0099 {
0100 const char* prev_from = from;
0101 std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
0102 if(ch == utf::illegal)
0103 {
0104 ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0105 } else if(ch == utf::incomplete)
0106 {
0107 from = prev_from;
0108 break;
0109 }
0110
0111 if(BOOST_LIKELY(static_cast<size_t>(utf16_traits::width(ch)) <= max))
0112 {
0113 max -= utf16_traits::width(ch);
0114 } else
0115 {
0116 static_assert(utf16_traits::max_width == 2, "Required for below");
0117 std::uint16_t tmpOut[2]{};
0118 utf16_traits::encode(ch, tmpOut);
0119 state = tmpOut[1];
0120 break;
0121 }
0122 }
0123 detail::write_state(std_state, state);
0124 return static_cast<int>(from - save_from);
0125 }
0126
0127 std::codecvt_base::result do_in(std::mbstate_t& std_state,
0128 const char* from,
0129 const char* from_end,
0130 const char*& from_next,
0131 uchar* to,
0132 uchar* to_end,
0133 uchar*& to_next) const override
0134 {
0135 std::codecvt_base::result r = std::codecvt_base::ok;
0136 using utf16_traits = utf::utf_traits<uchar, 2>;
0137
0138
0139
0140
0141 std::uint16_t state = detail::read_state(std_state);
0142
0143 if(state && to < to_end)
0144 {
0145 *to++ = static_cast<CharType>(state);
0146 state = 0;
0147 }
0148 while(to < to_end && from < from_end)
0149 {
0150 const char* from_saved = from;
0151
0152 uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
0153
0154 if(ch == utf::illegal)
0155 {
0156 ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0157 } else if(ch == utf::incomplete)
0158 {
0159 from = from_saved;
0160 r = std::codecvt_base::partial;
0161 break;
0162 }
0163
0164 if(BOOST_LIKELY(utf16_traits::width(ch) <= to_end - to))
0165 {
0166 to = utf16_traits::encode(ch, to);
0167 } else
0168 {
0169 static_assert(utf16_traits::max_width == 2, "Required for below");
0170 std::uint16_t tmpOut[2]{};
0171 utf16_traits::encode(ch, tmpOut);
0172 *to++ = static_cast<CharType>(tmpOut[0]);
0173 state = tmpOut[1];
0174 break;
0175 }
0176 }
0177 from_next = from;
0178 to_next = to;
0179 if(r == std::codecvt_base::ok && (from != from_end || state != 0))
0180 r = std::codecvt_base::partial;
0181 detail::write_state(std_state, state);
0182 return r;
0183 }
0184
0185 std::codecvt_base::result do_out(std::mbstate_t& std_state,
0186 const uchar* from,
0187 const uchar* from_end,
0188 const uchar*& from_next,
0189 char* to,
0190 char* to_end,
0191 char*& to_next) const override
0192 {
0193 std::codecvt_base::result r = std::codecvt_base::ok;
0194 using utf16_traits = utf::utf_traits<uchar, 2>;
0195
0196
0197
0198 std::uint16_t state = detail::read_state(std_state);
0199 for(; to < to_end && from < from_end; ++from)
0200 {
0201 std::uint32_t ch = 0;
0202 if(state != 0)
0203 {
0204
0205 std::uint16_t w1 = state;
0206 std::uint16_t w2 = *from;
0207 if(BOOST_LIKELY(utf16_traits::is_trail(w2)))
0208 {
0209 ch = utf16_traits::combine_surrogate(w1, w2);
0210 } else
0211 {
0212 ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0213 }
0214 } else
0215 {
0216 std::uint16_t w1 = *from;
0217 if(BOOST_LIKELY(utf16_traits::is_single_codepoint(w1)))
0218 {
0219 ch = w1;
0220 } else if(BOOST_LIKELY(utf16_traits::is_first_surrogate(w1)))
0221 {
0222
0223 state = w1;
0224 continue;
0225 } else
0226 {
0227
0228
0229 ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0230 }
0231 }
0232 assert(utf::is_valid_codepoint(ch));
0233 int len = utf::utf_traits<char>::width(ch);
0234 if(to_end - to < len)
0235 {
0236 r = std::codecvt_base::partial;
0237 break;
0238 }
0239 to = utf::utf_traits<char>::encode(ch, to);
0240 state = 0;
0241 }
0242 from_next = from;
0243 to_next = to;
0244 if(r == std::codecvt_base::ok && (from != from_end || state != 0))
0245 r = std::codecvt_base::partial;
0246 detail::write_state(std_state, state);
0247 return r;
0248 }
0249 };
0250
0251 BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_BEGIN
0252
0253 template<typename CharType>
0254 class BOOST_SYMBOL_VISIBLE utf8_codecvt<CharType, 4> : public std::codecvt<CharType, char, std::mbstate_t>
0255 {
0256 public:
0257 utf8_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs)
0258 {}
0259 BOOST_NOWIDE_SUPPRESS_UTF_CODECVT_DEPRECATION_END
0260
0261 protected:
0262 using uchar = CharType;
0263
0264 std::codecvt_base::result
0265 do_unshift(std::mbstate_t& , char* from, char* , char*& next) const override
0266 {
0267 next = from;
0268 return std::codecvt_base::noconv;
0269 }
0270 int do_encoding() const noexcept override
0271 {
0272 return 0;
0273 }
0274 int do_max_length() const noexcept override
0275 {
0276 return 4;
0277 }
0278 bool do_always_noconv() const noexcept override
0279 {
0280 return false;
0281 }
0282
0283 int do_length(std::mbstate_t& , const char* from, const char* from_end, size_t max) const override
0284 {
0285 const char* start_from = from;
0286
0287 while(max > 0 && from < from_end)
0288 {
0289 const char* save_from = from;
0290 std::uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
0291 if(ch == utf::incomplete)
0292 {
0293 from = save_from;
0294 break;
0295 } else if(ch == utf::illegal)
0296 {
0297 ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0298 }
0299 max--;
0300 }
0301 return static_cast<int>(from - start_from);
0302 }
0303
0304 std::codecvt_base::result do_in(std::mbstate_t& ,
0305 const char* from,
0306 const char* from_end,
0307 const char*& from_next,
0308 uchar* to,
0309 uchar* to_end,
0310 uchar*& to_next) const override
0311 {
0312 std::codecvt_base::result r = std::codecvt_base::ok;
0313
0314 while(to < to_end && from < from_end)
0315 {
0316 const char* from_saved = from;
0317
0318 uint32_t ch = utf::utf_traits<char>::decode(from, from_end);
0319
0320 if(ch == utf::illegal)
0321 {
0322 ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0323 } else if(ch == utf::incomplete)
0324 {
0325 r = std::codecvt_base::partial;
0326 from = from_saved;
0327 break;
0328 }
0329 *to++ = ch;
0330 }
0331 from_next = from;
0332 to_next = to;
0333 if(r == std::codecvt_base::ok && from != from_end)
0334 r = std::codecvt_base::partial;
0335 return r;
0336 }
0337
0338 std::codecvt_base::result do_out(std::mbstate_t& ,
0339 const uchar* from,
0340 const uchar* from_end,
0341 const uchar*& from_next,
0342 char* to,
0343 char* to_end,
0344 char*& to_next) const override
0345 {
0346 std::codecvt_base::result r = std::codecvt_base::ok;
0347 while(to < to_end && from < from_end)
0348 {
0349 std::uint32_t ch = 0;
0350 ch = *from;
0351 if(!utf::is_valid_codepoint(ch))
0352 {
0353 ch = BOOST_NOWIDE_REPLACEMENT_CHARACTER;
0354 }
0355 int len = utf::utf_traits<char>::width(ch);
0356 if(to_end - to < len)
0357 {
0358 r = std::codecvt_base::partial;
0359 break;
0360 }
0361 to = utf::utf_traits<char>::encode(ch, to);
0362 from++;
0363 }
0364 from_next = from;
0365 to_next = to;
0366 if(r == std::codecvt_base::ok && from != from_end)
0367 r = std::codecvt_base::partial;
0368 return r;
0369 }
0370 };
0371
0372 }
0373 }
0374
0375 #endif