File indexing completed on 2025-01-18 09:42:45
0001
0002
0003
0004
0005
0006
0007
0008 #ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED
0009 #define BOOST_NOWIDE_UTF_HPP_INCLUDED
0010
0011 #include <boost/nowide/config.hpp>
0012 #include <cstdint>
0013
0014 namespace boost {
0015 namespace nowide {
0016
0017
0018
0019
0020
0021
0022 namespace utf {
0023
0024
0025
0026
/// Integral type used to carry a single decoded code point, as well as the
/// two out-of-band sentinel values declared below.
using code_point = uint32_t;

/// Sentinel returned by the decoders when the input does not form a valid
/// sequence. The value is larger than 0x10FFFF, so is_valid_codepoint()
/// rejects it.
static const code_point illegal = 0xFFFFFFFFu;

/// Sentinel returned by the decoders when the input range ended before the
/// sequence being read was complete. Like `illegal`, it is larger than
/// 0x10FFFF.
static const code_point incomplete = 0xFFFFFFFEu;
0038
0039
0040
0041
0042 inline bool is_valid_codepoint(code_point v)
0043 {
0044 if(v > 0x10FFFF)
0045 return false;
0046 if(0xD800 <= v && v <= 0xDFFF)
0047 return false;
0048 return true;
0049 }
0050
0051 #ifdef BOOST_NOWIDE_DOXYGEN
0052
0053
0054
0055 template<typename CharType, int size = sizeof(CharType)>
0056 struct utf_traits
0057 {
0058
0059
0060
0061 using char_type = CharType;
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
0075
0076 template<typename Iterator>
0077 static code_point decode(Iterator& p, Iterator e);
0078
0079
0080
0081
0082
0083
0084
0085
0086 static const int max_width;
0087
0088
0089
0090
0091
0092
0093 static int width(code_point value);
0094
0095
0096
0097
0098
0099
0100 static int trail_length(char_type c);
0101
0102
0103
0104 static bool is_trail(char_type c);
0105
0106
0107
0108 static bool is_lead(char_type c);
0109
0110
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120 template<typename Iterator>
0121 static Iterator encode(code_point value, Iterator out);
0122
0123
0124
0125
0126
0127 template<typename Iterator>
0128 static code_point decode_valid(Iterator& p);
0129 };
0130
0131 #else
0132
/// Primary template; intentionally left undefined. Only the partial
/// specializations chosen through \a size (which defaults to
/// sizeof(CharType)) provide an implementation.
template<typename CharType, int size = sizeof(CharType)>
struct utf_traits;
0135
0136 template<typename CharType>
0137 struct utf_traits<CharType, 1>
0138 {
0139 using char_type = CharType;
0140
0141 static int trail_length(char_type ci)
0142 {
0143 unsigned char c = ci;
0144 if(c < 128)
0145 return 0;
0146 if(BOOST_UNLIKELY(c < 194))
0147 return -1;
0148 if(c < 224)
0149 return 1;
0150 if(c < 240)
0151 return 2;
0152 if(BOOST_LIKELY(c <= 244))
0153 return 3;
0154 return -1;
0155 }
0156
0157 static const int max_width = 4;
0158
0159 static int width(code_point value)
0160 {
0161 if(value <= 0x7F)
0162 {
0163 return 1;
0164 } else if(value <= 0x7FF)
0165 {
0166 return 2;
0167 } else if(BOOST_LIKELY(value <= 0xFFFF))
0168 {
0169 return 3;
0170 } else
0171 {
0172 return 4;
0173 }
0174 }
0175
0176 static bool is_trail(char_type ci)
0177 {
0178 unsigned char c = ci;
0179 return (c & 0xC0) == 0x80;
0180 }
0181
0182 static bool is_lead(char_type ci)
0183 {
0184 return !is_trail(ci);
0185 }
0186
0187 template<typename Iterator>
0188 static code_point decode(Iterator& p, Iterator e)
0189 {
0190 if(BOOST_UNLIKELY(p == e))
0191 return incomplete;
0192
0193 unsigned char lead = *p++;
0194
0195
0196 int trail_size = trail_length(lead);
0197
0198 if(BOOST_UNLIKELY(trail_size < 0))
0199 return illegal;
0200
0201
0202
0203 if(trail_size == 0)
0204 return lead;
0205
0206 code_point c = lead & ((1 << (6 - trail_size)) - 1);
0207
0208
0209 unsigned char tmp;
0210 switch(trail_size)
0211 {
0212 case 3:
0213 if(BOOST_UNLIKELY(p == e))
0214 return incomplete;
0215 tmp = *p++;
0216 if(!is_trail(tmp))
0217 return illegal;
0218 c = (c << 6) | (tmp & 0x3F);
0219 BOOST_NOWIDE_FALLTHROUGH;
0220 case 2:
0221 if(BOOST_UNLIKELY(p == e))
0222 return incomplete;
0223 tmp = *p++;
0224 if(!is_trail(tmp))
0225 return illegal;
0226 c = (c << 6) | (tmp & 0x3F);
0227 BOOST_NOWIDE_FALLTHROUGH;
0228 case 1:
0229 if(BOOST_UNLIKELY(p == e))
0230 return incomplete;
0231 tmp = *p++;
0232 if(!is_trail(tmp))
0233 return illegal;
0234 c = (c << 6) | (tmp & 0x3F);
0235 }
0236
0237
0238
0239
0240 if(BOOST_UNLIKELY(!is_valid_codepoint(c)) || BOOST_UNLIKELY(width(c) != trail_size + 1))
0241 {
0242 p -= trail_size;
0243 return illegal;
0244 }
0245
0246 return c;
0247 }
0248
0249 template<typename Iterator>
0250 static code_point decode_valid(Iterator& p)
0251 {
0252 unsigned char lead = *p++;
0253 if(lead < 192)
0254 return lead;
0255
0256 int trail_size;
0257
0258 if(lead < 224)
0259 trail_size = 1;
0260 else if(BOOST_LIKELY(lead < 240))
0261 trail_size = 2;
0262 else
0263 trail_size = 3;
0264
0265 code_point c = lead & ((1 << (6 - trail_size)) - 1);
0266
0267 switch(trail_size)
0268 {
0269 case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
0270 case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
0271 case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
0272 }
0273
0274 return c;
0275 }
0276
0277 template<typename Iterator>
0278 static Iterator encode(code_point value, Iterator out)
0279 {
0280 if(value <= 0x7F)
0281 {
0282 *out++ = static_cast<char_type>(value);
0283 } else if(value <= 0x7FF)
0284 {
0285 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
0286 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
0287 } else if(BOOST_LIKELY(value <= 0xFFFF))
0288 {
0289 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
0290 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
0291 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
0292 } else
0293 {
0294 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
0295 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
0296 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
0297 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
0298 }
0299 return out;
0300 }
0301 };
0302
0303 template<typename CharType>
0304 struct utf_traits<CharType, 2>
0305 {
0306 using char_type = CharType;
0307
0308
0309 static bool is_single_codepoint(uint16_t x)
0310 {
0311
0312 return x <= 0xD7FF || x >= 0xE000;
0313 }
0314 static bool is_first_surrogate(uint16_t x)
0315 {
0316
0317 return 0xD800 <= x && x <= 0xDBFF;
0318 }
0319 static bool is_second_surrogate(uint16_t x)
0320 {
0321
0322 return 0xDC00 <= x && x <= 0xDFFF;
0323 }
0324 static code_point combine_surrogate(uint16_t w1, uint16_t w2)
0325 {
0326 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
0327 }
0328 static int trail_length(char_type c)
0329 {
0330 if(is_first_surrogate(c))
0331 return 1;
0332 if(is_second_surrogate(c))
0333 return -1;
0334 return 0;
0335 }
0336
0337 static bool is_trail(char_type c)
0338 {
0339 return is_second_surrogate(c);
0340 }
0341
0342 static bool is_lead(char_type c)
0343 {
0344 return !is_second_surrogate(c);
0345 }
0346
0347 template<typename It>
0348 static code_point decode(It& current, It last)
0349 {
0350 if(BOOST_UNLIKELY(current == last))
0351 return incomplete;
0352 uint16_t w1 = *current++;
0353 if(BOOST_LIKELY(is_single_codepoint(w1)))
0354 {
0355 return w1;
0356 }
0357
0358 if(w1 >= 0xDC00)
0359 return illegal;
0360 if(current == last)
0361 return incomplete;
0362 uint16_t w2 = *current++;
0363 if(!is_second_surrogate(w2))
0364 return illegal;
0365 return combine_surrogate(w1, w2);
0366 }
0367 template<typename It>
0368 static code_point decode_valid(It& current)
0369 {
0370 uint16_t w1 = *current++;
0371 if(BOOST_LIKELY(is_single_codepoint(w1)))
0372 {
0373 return w1;
0374 }
0375 uint16_t w2 = *current++;
0376 return combine_surrogate(w1, w2);
0377 }
0378
0379 static const int max_width = 2;
0380 static int width(code_point u)
0381 {
0382 return u >= 0x10000 ? 2 : 1;
0383 }
0384 template<typename It>
0385 static It encode(code_point u, It out)
0386 {
0387 if(BOOST_LIKELY(u <= 0xFFFF))
0388 {
0389 *out++ = static_cast<char_type>(u);
0390 } else
0391 {
0392 u -= 0x10000;
0393 *out++ = static_cast<char_type>(0xD800 | (u >> 10));
0394 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
0395 }
0396 return out;
0397 }
0398 };
0399
0400 template<typename CharType>
0401 struct utf_traits<CharType, 4>
0402 {
0403 using char_type = CharType;
0404 static int trail_length(char_type c)
0405 {
0406 if(is_valid_codepoint(c))
0407 return 0;
0408 return -1;
0409 }
0410 static bool is_trail(char_type )
0411 {
0412 return false;
0413 }
0414 static bool is_lead(char_type )
0415 {
0416 return true;
0417 }
0418
0419 template<typename It>
0420 static code_point decode_valid(It& current)
0421 {
0422 return *current++;
0423 }
0424
0425 template<typename It>
0426 static code_point decode(It& current, It last)
0427 {
0428 if(BOOST_UNLIKELY(current == last))
0429 return incomplete;
0430 code_point c = *current++;
0431 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
0432 return illegal;
0433 return c;
0434 }
0435 static const int max_width = 1;
0436 static int width(code_point )
0437 {
0438 return 1;
0439 }
0440 template<typename It>
0441 static It encode(code_point u, It out)
0442 {
0443 *out++ = static_cast<char_type>(u);
0444 return out;
0445 }
0446 };
0447
0448 #endif
0449
0450 }
0451 }
0452 }
0453
0454 #endif