File indexing completed on 2025-01-30 09:44:57
0001
0002
0003
0004
0005
0006
0007
0008 #ifndef BOOST_LOCALE_GENERIC_CODECVT_HPP
0009 #define BOOST_LOCALE_GENERIC_CODECVT_HPP
0010
0011 #include <boost/locale/utf.hpp>
0012 #include <cstdint>
0013 #include <locale>
0014
0015 namespace boost { namespace locale {
0016 // The helpers in namespace detail copy a 16-bit value into and out of a std::mbstate_t, so it must be at least two bytes wide.
0017 static_assert(sizeof(std::mbstate_t) >= 2, "std::mbstate_t is to small to store an UTF-16 codepoint");
0018 namespace detail {
0019 // Helpers for keeping a 16-bit value inside the storage of a std::mbstate_t.
0020 inline void copy_uint16_t(void* dst, const void* src) // copies exactly two bytes from src to dst
0021 {
0022 unsigned char* cdst = static_cast<unsigned char*>(dst); // byte-wise access; presumably avoids alignment/aliasing assumptions about either object
0023 const unsigned char* csrc = static_cast<const unsigned char*>(src);
0024 cdst[0] = csrc[0];
0025 cdst[1] = csrc[1];
0026 }
0027 inline uint16_t read_state(const std::mbstate_t& src) // returns the first two bytes of src as a uint16_t
0028 {
0029 uint16_t dst;
0030 copy_uint16_t(&dst, &src); // fills both bytes of dst
0031 return dst;
0032 }
0033 inline void write_state(std::mbstate_t& dst, const uint16_t src) // overwrites the first two bytes of dst with src
0034 {
0035 copy_uint16_t(&dst, &src); // any further bytes of dst are left untouched
0036 }
0037 }
0038
0039 // Non-template base class of the generic_codecvt specializations; it only declares the conversion-direction enum.
0040 class generic_codecvt_base {
0041 public:
0042 // Direction value handed to CodecvtImpl::initial_state() by the conversion routines.
0043 enum initial_convertion_state {
0044 to_unicode_state, // passed by do_length()/do_in(), which call to_unicode()
0045 from_unicode_state // passed by do_out(), which calls from_unicode()
0046 };
0047 };
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076
0077
0078
0079
0080
0081
0082
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138 // Primary template (declaration only). Definitions are provided by the partial
0139 // specializations for CharSize == 2, CharSize == 4 and for <char, CodecvtImpl, 1>.
0140 //
0141 // CharType    - character type on the internal side of the std::codecvt base.
0142 // CodecvtImpl - class doing the per-code-point conversion. The specializations downcast
0143 //               `this` to CodecvtImpl with static_cast, so CodecvtImpl is expected to
0144 //               derive from the generic_codecvt specialization it is used with.
0145 //               The 2- and 4-byte specializations call these members on it:
0146 //                 initial_state(initial_convertion_state), max_encoding_length(),
0147 //                 to_unicode(state, from, from_end), from_unicode(state, ch, to, to_end).
0148 // CharSize    - defaults to sizeof(CharType) and selects the specialization.
0149
0150 template<typename CharType, typename CodecvtImpl, int CharSize = sizeof(CharType)>
0151 class generic_codecvt;
0152
0153 // Specialization for 2-byte character types. Code points up to 0xFFFF map to a single
0154 // wide unit; code points above 0xFFFF are split into / joined from a surrogate pair.
0155 // Decoding and encoding of individual code points is delegated to CodecvtImpl.
0156 // A pending half of a surrogate pair is kept in the caller's std::mbstate_t between calls.
0157
0158
0159 template<typename CharType, typename CodecvtImpl>
0160 class generic_codecvt<CharType, CodecvtImpl, 2> : public std::codecvt<CharType, char, std::mbstate_t>,
0161 public generic_codecvt_base {
0162 public:
0163 typedef CharType uchar;
0164 // refs is forwarded unchanged to the std::codecvt base constructor.
0165 generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
0166 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); } // *this viewed as the derived implementation class
0167
0168 protected:
0169 std::codecvt_base::result do_unshift(std::mbstate_t& s, char* from, char* , char*& next) const override
0170 { // Never writes output: returns error if the first byte of the state is non-zero, otherwise sets next = from and returns ok.
0171 if(*reinterpret_cast<char*>(&s) != 0) // NOTE(review): only the first byte is tested, while do_out() stores a 16-bit value via detail::write_state() - confirm a pending lead surrogate whose first stored byte is 0 cannot be missed here
0172 return std::codecvt_base::error;
0173 next = from;
0174 return std::codecvt_base::ok;
0175 }
0176 int do_encoding() const noexcept override { return 0; } // 0: per the std::codecvt convention, a variable number of external chars per internal char
0177 int do_max_length() const noexcept override { return implementation().max_encoding_length(); } // delegated to the implementation
0178 bool do_always_noconv() const noexcept override { return false; } // a conversion is always required
0179 // Returns how many bytes of [from, from_end) would be consumed while producing at most `max` wide units.
0180 int do_length(std::mbstate_t& std_state, const char* from, const char* from_end, size_t max) const override
0181 {
0182 bool state = *reinterpret_cast<char*>(&std_state) != 0; // true: the first unit of a surrogate pair has already been counted
0183 const char* save_from = from;
0184 // A fresh conversion state is requested from the implementation on every call.
0185 auto cvt_state = implementation().initial_state(to_unicode_state);
0186 while(max > 0 && from < from_end) {
0187 const char* prev_from = from;
0188 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
0189 if(ch == boost::locale::utf::incomplete || ch == boost::locale::utf::illegal) {
0190 from = prev_from; // stop in front of the sequence that could not be decoded
0191 break;
0192 }
0193 max--; // one wide unit accounted for per iteration
0194 if(ch > 0xFFFF) {
0195 if(!state)
0196 from = prev_from; // first unit of the pair: rewind so the same bytes are decoded again for the second unit
0197 state = !state;
0198 }
0199 }
0200 *reinterpret_cast<char*>(&std_state) = state; // persist the pending-unit flag in the first byte of the state
0201 return static_cast<int>(from - save_from);
0202 }
0203 // Converts narrow input [from, from_end) into wide units written to [to, to_end).
0204 std::codecvt_base::result do_in(std::mbstate_t& std_state,
0205 const char* from,
0206 const char* from_end,
0207 const char*& from_next,
0208 uchar* to,
0209 uchar* to_end,
0210 uchar*& to_next) const override
0211 {
0212 std::codecvt_base::result r = std::codecvt_base::ok;
0213 // `state` is true when the lead surrogate of a code point above 0xFFFF has already been
0214 // written but the source bytes of that code point have not been consumed yet. The flag
0215 // is kept in the first byte of std_state so that it survives between calls.
0216 // A fresh conversion state is requested from the implementation on every call.
0217
0218
0219 bool state = *reinterpret_cast<char*>(&std_state) != 0;
0220 auto cvt_state = implementation().initial_state(to_unicode_state);
0221 while(to < to_end && from < from_end) {
0222 const char* from_saved = from;
0223
0224 utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
0225
0226 if(ch == boost::locale::utf::illegal) {
0227 from = from_saved; // from_next will point at the offending sequence
0228 r = std::codecvt_base::error;
0229 break;
0230 }
0231 if(ch == boost::locale::utf::incomplete) {
0232 from = from_saved; // leave the unfinished sequence unconsumed
0233 r = std::codecvt_base::partial;
0234 break;
0235 }
0236
0237 if(ch <= 0xFFFF)
0238 *to++ = static_cast<uchar>(ch);
0239 else {
0240 // Code point above 0xFFFF: it needs two output units, but each loop iteration
0241 // emits only one. First pass (state == false): emit the lead surrogate and rewind
0242 // `from` so the same input bytes are decoded again. Second pass (state == true):
0243 // emit the trail surrogate and keep the input consumed.
0244 // w1 = 0xD800 | upper 10 bits, w2 = 0xDC00 | lower 10 bits of (ch - 0x10000).
0245
0246
0247
0248
0249 ch -= 0x10000;
0250 std::uint16_t w1 = static_cast<std::uint16_t>(0xD800 | (ch >> 10));
0251 std::uint16_t w2 = static_cast<std::uint16_t>(0xDC00 | (ch & 0x3FF));
0252 if(!state) {
0253 from = from_saved;
0254 *to++ = w1;
0255 } else
0256 *to++ = w2;
0257 state = !state;
0258 }
0259 }
0260 from_next = from;
0261 to_next = to;
0262 if(r == std::codecvt_base::ok && (from != from_end || state)) // input left over, or a trail surrogate is still owed
0263 r = std::codecvt_base::partial;
0264 *reinterpret_cast<char*>(&std_state) = state;
0265 return r;
0266 }
0267 // Converts wide units [from, from_end) into narrow output written to [to, to_end).
0268 std::codecvt_base::result do_out(std::mbstate_t& std_state,
0269 const uchar* from,
0270 const uchar* from_end,
0271 const uchar*& from_next,
0272 char* to,
0273 char* to_end,
0274 char*& to_next) const override
0275 {
0276 std::codecvt_base::result r = std::codecvt_base::ok;
0277 // `state` holds a lead surrogate (0xD800..0xDBFF) consumed earlier whose trail surrogate
0278 // has not been processed yet, or 0 when nothing is pending. It is loaded from and stored
0279 // back to std_state as a 16-bit value with detail::read_state()/detail::write_state().
0280 // A fresh conversion state is requested from the implementation on every call.
0281
0282
0283 std::uint16_t state = detail::read_state(std_state);
0284 auto cvt_state = implementation().initial_state(from_unicode_state);
0285 while(to < to_end && from < from_end) {
0286 utf::code_point ch = 0;
0287 if(state != 0) {
0288 // A lead surrogate is pending: the current unit has to be its trail surrogate.
0289
0290
0291 std::uint16_t w1 = state;
0292 std::uint16_t w2 = *from;
0293 // Join the pair into one code point: ((w1 - 0xD800) << 10 | (w2 - 0xDC00)) + 0x10000.
0294
0295 if(0xDC00 <= w2 && w2 <= 0xDFFF) {
0296 std::uint16_t vh = w1 - 0xD800;
0297 std::uint16_t vl = w2 - 0xDC00;
0298 ch = ((uint32_t(vh) << 10) | vl) + 0x10000;
0299 } else {
0300 // The pending lead surrogate is not followed by a trail surrogate.
0301 r = std::codecvt_base::error;
0302 break;
0303 }
0304 } else {
0305 ch = *from;
0306 if(0xD800 <= ch && ch <= 0xDBFF) {
0307 // Lead surrogate: remember it, consume the unit and wait for the trail
0308 // surrogate; nothing is written to the output in this iteration.
0309
0310
0311 state = static_cast<uint16_t>(ch);
0312 from++;
0313 continue;
0314 } else if(0xDC00 <= ch && ch <= 0xDFFF) {
0315 // Trail surrogate without a preceding lead surrogate.
0316
0317
0318 r = std::codecvt_base::error;
0319 break;
0320 }
0321 }
0322 if(!boost::locale::utf::is_valid_codepoint(ch)) {
0323 r = std::codecvt_base::error;
0324 break;
0325 }
0326 const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end); // either a utf::incomplete/utf::illegal sentinel or a count that is added to `to`
0327 if(len == boost::locale::utf::incomplete) { // presumably the remaining output range is too small - confirm against the implementation
0328 r = std::codecvt_base::partial;
0329 break;
0330 } else if(len == boost::locale::utf::illegal) {
0331 r = std::codecvt_base::error;
0332 break;
0333 } else
0334 to += len;
0335 state = 0; // the code point (single unit or complete pair) has been written
0336 from++;
0337 }
0338 from_next = from;
0339 to_next = to;
0340 if(r == std::codecvt_base::ok && (from != from_end || state != 0)) // input left over, or a lead surrogate is still waiting for its partner
0341 r = std::codecvt_base::partial;
0342 detail::write_state(std_state, state);
0343 return r;
0344 }
0345 };
0346
0347 // Specialization for 4-byte character types: every wide character holds one whole code
0348 // point, so no surrogate handling is needed and the std::mbstate_t arguments are unused.
0349 // Decoding and encoding of individual code points is delegated to CodecvtImpl.
0350
0351 template<typename CharType, typename CodecvtImpl>
0352 class generic_codecvt<CharType, CodecvtImpl, 4> : public std::codecvt<CharType, char, std::mbstate_t>,
0353 public generic_codecvt_base {
0354 public:
0355 typedef CharType uchar;
0356 // refs is forwarded unchanged to the std::codecvt base constructor.
0357 generic_codecvt(size_t refs = 0) : std::codecvt<CharType, char, std::mbstate_t>(refs) {}
0358 // *this viewed as the derived implementation class.
0359 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
0360
0361 protected:
0362 std::codecvt_base::result
0363 do_unshift(std::mbstate_t& , char* from, char* , char*& next) const override
0364 { // Never writes output: sets next = from and always returns ok.
0365 next = from;
0366 return std::codecvt_base::ok;
0367 }
0368 int do_encoding() const noexcept override { return 0; } // 0: per the std::codecvt convention, a variable number of external chars per internal char
0369 int do_max_length() const noexcept override { return implementation().max_encoding_length(); } // delegated to the implementation
0370 bool do_always_noconv() const noexcept override { return false; } // a conversion is always required
0371 // Returns how many bytes of [from, from_end) would be consumed while decoding at most `max` code points.
0372 int do_length(std::mbstate_t& , const char* from, const char* from_end, size_t max) const override
0373 {
0374 const char* start_from = from;
0375 auto cvt_state = implementation().initial_state(to_unicode_state); // fresh conversion state on every call
0376 while(max > 0 && from < from_end) {
0377 const char* save_from = from;
0378 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
0379 if(ch == boost::locale::utf::incomplete || ch == boost::locale::utf::illegal) {
0380 from = save_from; // stop in front of the sequence that could not be decoded
0381 break;
0382 }
0383 max--;
0384 }
0385
0386 return static_cast<int>(from - start_from);
0387 }
0388 // Converts narrow input [from, from_end) into code points written to [to, to_end).
0389 std::codecvt_base::result do_in(std::mbstate_t& ,
0390 const char* from,
0391 const char* from_end,
0392 const char*& from_next,
0393 uchar* to,
0394 uchar* to_end,
0395 uchar*& to_next) const override
0396 {
0397 std::codecvt_base::result r = std::codecvt_base::ok;
0398 // A fresh conversion state is requested from the implementation on every call.
0399 auto cvt_state = implementation().initial_state(to_unicode_state);
0400 while(to < to_end && from < from_end) {
0401 const char* from_saved = from;
0402
0403 const utf::code_point ch = implementation().to_unicode(cvt_state, from, from_end);
0404 // On failure `from` is restored so that from_next points at the undecodable sequence.
0405 if(ch == boost::locale::utf::illegal) {
0406 r = std::codecvt_base::error;
0407 from = from_saved;
0408 break;
0409 }
0410 if(ch == boost::locale::utf::incomplete) {
0411 r = std::codecvt_base::partial;
0412 from = from_saved;
0413 break;
0414 }
0415 *to++ = ch; // the decoded code point is stored directly as one wide character
0416 }
0417 from_next = from;
0418 to_next = to;
0419 if(r == std::codecvt_base::ok && from != from_end) // the loop ended with input left over
0420 r = std::codecvt_base::partial;
0421 return r;
0422 }
0423 // Converts code points [from, from_end) into narrow output written to [to, to_end).
0424 std::codecvt_base::result do_out(std::mbstate_t& ,
0425 const uchar* from,
0426 const uchar* from_end,
0427 const uchar*& from_next,
0428 char* to,
0429 char* to_end,
0430 char*& to_next) const override
0431 {
0432 std::codecvt_base::result r = std::codecvt_base::ok;
0433 auto cvt_state = implementation().initial_state(from_unicode_state); // fresh conversion state on every call
0434 while(to < to_end && from < from_end) {
0435 const std::uint32_t ch = *from; // each wide character is treated as one code point
0436 if(!boost::locale::utf::is_valid_codepoint(ch)) {
0437 r = std::codecvt_base::error;
0438 break;
0439 }
0440 const utf::code_point len = implementation().from_unicode(cvt_state, ch, to, to_end); // either a utf::incomplete/utf::illegal sentinel or a count that is added to `to`
0441 if(len == boost::locale::utf::incomplete) { // presumably the remaining output range is too small - confirm against the implementation
0442 r = std::codecvt_base::partial;
0443 break;
0444 } else if(len == boost::locale::utf::illegal) {
0445 r = std::codecvt_base::error;
0446 break;
0447 }
0448 to += len;
0449 from++;
0450 }
0451 from_next = from;
0452 to_next = to;
0453 if(r == std::codecvt_base::ok && from != from_end) // the loop ended with input left over
0454 r = std::codecvt_base::partial;
0455 return r;
0456 }
0457 };
0458 // Specialization for plain char: it overrides none of the do_* members, so the behaviour of the std::codecvt<char, char, std::mbstate_t> base is inherited unchanged.
0459 template<typename CodecvtImpl>
0460 class generic_codecvt<char, CodecvtImpl, 1> : public std::codecvt<char, char, std::mbstate_t>,
0461 public generic_codecvt_base {
0462 public:
0463 typedef char uchar;
0464 // *this viewed as the derived implementation class.
0465 const CodecvtImpl& implementation() const { return *static_cast<const CodecvtImpl*>(this); }
0466 // refs is forwarded unchanged to the std::codecvt base constructor.
0467 generic_codecvt(size_t refs = 0) : std::codecvt<char, char, std::mbstate_t>(refs) {}
0468 };
0469
0470 }}
0471
0472 #endif