File indexing completed on 2025-10-24 08:45:40
0001
0002
0003
0004
0005
0006
0007 #ifndef BOOST_LOCALE_UTF_HPP_INCLUDED
0008 #define BOOST_LOCALE_UTF_HPP_INCLUDED
0009
0010 #include <boost/locale/config.hpp>
0011 #include <cstdint>
0012
0013 namespace boost { namespace locale {
0014
0015
0016
0017 namespace utf {
0018
/// A single Unicode code point. The same type also carries the two error
/// markers defined below, which lie outside the Unicode range.
using code_point = uint32_t;

/// Marker returned by the decoders when the input is not a valid sequence.
constexpr code_point illegal = 0xFFFFFFFFu;

/// Marker returned by the decoders when the input ends before a sequence is complete.
constexpr code_point incomplete = 0xFFFFFFFEu;

/// Alias of code_point.
/// NOTE(review): not used anywhere in the visible part of this header; the name
/// suggests a "length or error marker" result type - confirm against callers.
using len_or_error = code_point;

/// \brief Check whether \a v may be used as a code point.
///
/// \return true when \a v is at most 0x10FFFF and is not in the surrogate
/// range 0xD800-0xDFFF; false otherwise (this includes \ref illegal and
/// \ref incomplete, which are both greater than 0x10FFFF).
inline bool is_valid_codepoint(code_point v)
{
    const bool within_unicode_range = v <= 0x10FFFF;
    const bool is_surrogate = 0xD800 <= v && v <= 0xDFFF;
    return within_unicode_range && !is_surrogate;
}
0038
0039 #ifdef BOOST_LOCALE_DOXYGEN
0040
0041
0042 template<typename CharType, int size = sizeof(CharType)>
0043 struct utf_traits {
0044
0045 typedef CharType char_type;
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059 template<typename Iterator>
0060 static code_point decode(Iterator& p, Iterator e);
0061
0062
0063
0064
0065
0066
0067 static constexpr int max_width;
0068
0069
0070
0071
0072
0073 static int width(code_point value);
0074
0075
0076
0077
0078 static int trail_length(char_type c);
0079
0080 static bool is_trail(char_type c);
0081
0082 static bool is_lead(char_type c);
0083
0084
0085
0086
0087
0088
0089
0090
0091
0092 template<typename Iterator>
0093 static Iterator encode(code_point value, Iterator out);
0094
0095
0096
0097
0098 template<typename Iterator>
0099 static code_point decode_valid(Iterator& p);
0100 };
0101
0102 #else
0103
0104 template<typename CharType, int size = sizeof(CharType)>
0105 struct utf_traits;
0106
0107 template<typename CharType>
0108 struct utf_traits<CharType, 1> {
0109 typedef CharType char_type;
0110
0111 static int trail_length(char_type ci)
0112 {
0113 unsigned char c = ci;
0114 if(c < 128)
0115 return 0;
0116 if(BOOST_UNLIKELY(c < 194))
0117 return -1;
0118 if(c < 224)
0119 return 1;
0120 if(c < 240)
0121 return 2;
0122 if(BOOST_LIKELY(c <= 244))
0123 return 3;
0124 return -1;
0125 }
0126
0127 static constexpr int max_width = 4;
0128
0129 static int width(code_point value)
0130 {
0131 if(value <= 0x7F)
0132 return 1;
0133 else if(value <= 0x7FF)
0134 return 2;
0135 else if(BOOST_LIKELY(value <= 0xFFFF))
0136 return 3;
0137 else
0138 return 4;
0139 }
0140
0141 static bool is_trail(char_type ci)
0142 {
0143 unsigned char c = ci;
0144 return (c & 0xC0) == 0x80;
0145 }
0146
0147 static bool is_lead(char_type ci) { return !is_trail(ci); }
0148
0149 template<typename Iterator>
0150 static code_point decode(Iterator& p, Iterator e)
0151 {
0152 if(BOOST_UNLIKELY(p == e))
0153 return incomplete;
0154
0155 unsigned char lead = *p++;
0156
0157
0158 int trail_size = trail_length(lead);
0159
0160 if(BOOST_UNLIKELY(trail_size < 0))
0161 return illegal;
0162
0163
0164
0165 if(trail_size == 0)
0166 return lead;
0167
0168 code_point c = lead & ((1 << (6 - trail_size)) - 1);
0169
0170
0171 unsigned char tmp;
0172 switch(trail_size) {
0173 case 3:
0174 if(BOOST_UNLIKELY(p == e))
0175 return incomplete;
0176 tmp = *p++;
0177 if(!is_trail(tmp))
0178 return illegal;
0179 c = (c << 6) | (tmp & 0x3F);
0180 BOOST_FALLTHROUGH;
0181 case 2:
0182 if(BOOST_UNLIKELY(p == e))
0183 return incomplete;
0184 tmp = *p++;
0185 if(!is_trail(tmp))
0186 return illegal;
0187 c = (c << 6) | (tmp & 0x3F);
0188 BOOST_FALLTHROUGH;
0189 case 1:
0190 if(BOOST_UNLIKELY(p == e))
0191 return incomplete;
0192 tmp = *p++;
0193 if(!is_trail(tmp))
0194 return illegal;
0195 c = (c << 6) | (tmp & 0x3F);
0196 }
0197
0198
0199
0200 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
0201 return illegal;
0202
0203
0204 if(BOOST_UNLIKELY(width(c) != trail_size + 1))
0205 return illegal;
0206
0207 return c;
0208 }
0209
0210 template<typename Iterator>
0211 static code_point decode_valid(Iterator& p)
0212 {
0213 unsigned char lead = *p++;
0214 if(lead < 192)
0215 return lead;
0216
0217 int trail_size;
0218
0219 if(lead < 224)
0220 trail_size = 1;
0221 else if(BOOST_LIKELY(lead < 240))
0222 trail_size = 2;
0223 else
0224 trail_size = 3;
0225
0226 code_point c = lead & ((1 << (6 - trail_size)) - 1);
0227
0228 switch(trail_size) {
0229 case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH;
0230 case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_FALLTHROUGH;
0231 case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
0232 }
0233
0234 return c;
0235 }
0236
0237 template<typename Iterator>
0238 static Iterator encode(code_point value, Iterator out)
0239 {
0240 if(value <= 0x7F)
0241 *out++ = static_cast<char_type>(value);
0242 else if(value <= 0x7FF) {
0243 *out++ = static_cast<char_type>((value >> 6) | 0xC0);
0244 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
0245 } else if(BOOST_LIKELY(value <= 0xFFFF)) {
0246 *out++ = static_cast<char_type>((value >> 12) | 0xE0);
0247 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
0248 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
0249 } else {
0250 *out++ = static_cast<char_type>((value >> 18) | 0xF0);
0251 *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
0252 *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
0253 *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
0254 }
0255 return out;
0256 }
0257 };
0258
0259 template<typename CharType>
0260 struct utf_traits<CharType, 2> {
0261 typedef CharType char_type;
0262
0263
0264 static bool is_first_surrogate(uint16_t x) { return 0xD800 <= x && x <= 0xDBFF; }
0265 static bool is_second_surrogate(uint16_t x) { return 0xDC00 <= x && x <= 0xDFFF; }
0266 static code_point combine_surrogate(uint16_t w1, uint16_t w2)
0267 {
0268 return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
0269 }
0270 static int trail_length(char_type c)
0271 {
0272 if(is_first_surrogate(c))
0273 return 1;
0274 if(is_second_surrogate(c))
0275 return -1;
0276 return 0;
0277 }
0278
0279
0280 static bool is_trail(char_type c) { return is_second_surrogate(c); }
0281
0282 static bool is_lead(char_type c) { return !is_second_surrogate(c); }
0283
0284 template<typename It>
0285 static code_point decode(It& current, It last)
0286 {
0287 if(BOOST_UNLIKELY(current == last))
0288 return incomplete;
0289 uint16_t w1 = *current++;
0290 if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
0291 return w1;
0292 if(w1 > 0xDBFF)
0293 return illegal;
0294 if(current == last)
0295 return incomplete;
0296 uint16_t w2 = *current++;
0297 if(w2 < 0xDC00 || 0xDFFF < w2)
0298 return illegal;
0299 return combine_surrogate(w1, w2);
0300 }
0301 template<typename It>
0302 static code_point decode_valid(It& current)
0303 {
0304 uint16_t w1 = *current++;
0305 if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
0306 return w1;
0307 uint16_t w2 = *current++;
0308 return combine_surrogate(w1, w2);
0309 }
0310
0311 static constexpr int max_width = 2;
0312 static int width(code_point u) { return u >= 0x10000 ? 2 : 1; }
0313 template<typename It>
0314 static It encode(code_point u, It out)
0315 {
0316 if(BOOST_LIKELY(u <= 0xFFFF))
0317 *out++ = static_cast<char_type>(u);
0318 else {
0319 u -= 0x10000;
0320 *out++ = static_cast<char_type>(0xD800 | (u >> 10));
0321 *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
0322 }
0323 return out;
0324 }
0325 };
0326
0327 template<typename CharType>
0328 struct utf_traits<CharType, 4> {
0329 typedef CharType char_type;
0330 static int trail_length(char_type c)
0331 {
0332 if(is_valid_codepoint(c))
0333 return 0;
0334 return -1;
0335 }
0336 static bool is_trail(char_type ) { return false; }
0337 static bool is_lead(char_type ) { return true; }
0338
0339 template<typename It>
0340 static code_point decode_valid(It& current)
0341 {
0342 return *current++;
0343 }
0344
0345 template<typename It>
0346 static code_point decode(It& current, It last)
0347 {
0348 if(BOOST_UNLIKELY(current == last))
0349 return boost::locale::utf::incomplete;
0350 code_point c = *current++;
0351 if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
0352 return boost::locale::utf::illegal;
0353 return c;
0354 }
0355 static constexpr int max_width = 1;
0356 static int width(code_point ) { return 1; }
0357 template<typename It>
0358 static It encode(code_point u, It out)
0359 {
0360 *out++ = static_cast<char_type>(u);
0361 return out;
0362 }
0363
0364 };
0365
0366 #endif
0367
0368 }
0369 }}
0370
0371 #endif