File indexing completed on 2025-01-18 09:30:42
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013 #include <boost/detail/utf8_codecvt_facet.hpp>
0014
0015 #include <cstdlib> // for multi-byte conversion routines
0016 #include <cassert>
0017
0018 #include <boost/limits.hpp>
0019 #include <boost/config.hpp>
0020
0021
0022
0023
0024
0025
0026 #ifndef BOOST_NO_STD_WSTRING
0027
0028 BOOST_UTF8_BEGIN_NAMESPACE
0029
0030
0031
0032
0033 namespace detail {
0034
0035 inline const wchar_t * get_octet1_modifier_table() BOOST_NOEXCEPT
0036 {
0037 static const wchar_t octet1_modifier_table[] = {
0038 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
0039 };
0040 return octet1_modifier_table;
0041 }
0042
0043 }
0044
0045
0046 BOOST_UTF8_DECL utf8_codecvt_facet::utf8_codecvt_facet(
0047 std::size_t no_locale_manage
0048 ) :
0049 std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage)
0050 {}
0051
0052 BOOST_UTF8_DECL utf8_codecvt_facet::~utf8_codecvt_facet()
0053 {}
0054
0055
0056 BOOST_UTF8_DECL std::codecvt_base::result utf8_codecvt_facet::do_in(
0057 std::mbstate_t& ,
0058 const char * from,
0059 const char * from_end,
0060 const char * & from_next,
0061 wchar_t * to,
0062 wchar_t * to_end,
0063 wchar_t * & to_next
0064 ) const {
0065
0066
0067
0068
0069
0070
0071
0072
0073 const wchar_t * const octet1_modifier_table = detail::get_octet1_modifier_table();
0074 while (from != from_end && to != to_end) {
0075
0076
0077 if (invalid_leading_octet(*from)) {
0078 from_next = from;
0079 to_next = to;
0080 return std::codecvt_base::error;
0081 }
0082
0083
0084
0085 const int cont_octet_count = get_cont_octet_count(*from);
0086
0087
0088
0089 wchar_t ucs_result =
0090 (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
0091
0092
0093
0094
0095
0096 int i = 0;
0097 while (i != cont_octet_count && from != from_end) {
0098
0099
0100 if (invalid_continuing_octet(*from)) {
0101 from_next = from;
0102 to_next = to;
0103 return std::codecvt_base::error;
0104 }
0105
0106 ucs_result *= (1 << 6);
0107
0108
0109
0110 ucs_result += (unsigned char)(*from++) - 0x80;
0111 ++i;
0112 }
0113
0114
0115 if (from == from_end && i != cont_octet_count) {
0116
0117 from_next = from - (i + 1);
0118 to_next = to;
0119 return std::codecvt_base::partial;
0120 }
0121 *to++ = ucs_result;
0122 }
0123 from_next = from;
0124 to_next = to;
0125
0126
0127 if (from == from_end)
0128 return std::codecvt_base::ok;
0129 else
0130 return std::codecvt_base::partial;
0131 }
0132
0133 BOOST_UTF8_DECL std::codecvt_base::result utf8_codecvt_facet::do_out(
0134 std::mbstate_t& ,
0135 const wchar_t * from,
0136 const wchar_t * from_end,
0137 const wchar_t * & from_next,
0138 char * to,
0139 char * to_end,
0140 char * & to_next
0141 ) const
0142 {
0143 const wchar_t * const octet1_modifier_table = detail::get_octet1_modifier_table();
0144 wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();
0145 while (from != from_end && to != to_end) {
0146
0147
0148 if (*from > max_wchar) {
0149 from_next = from;
0150 to_next = to;
0151 return std::codecvt_base::error;
0152 }
0153
0154 int cont_octet_count = get_cont_octet_out_count(*from);
0155
0156
0157 int shift_exponent = cont_octet_count * 6;
0158
0159
0160 *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] +
0161 (unsigned char)(*from / (1 << shift_exponent)));
0162
0163
0164
0165
0166
0167
0168 int i = 0;
0169 while (i != cont_octet_count && to != to_end) {
0170 shift_exponent -= 6;
0171 *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
0172 ++i;
0173 }
0174
0175 if (to == to_end && i != cont_octet_count) {
0176 from_next = from;
0177 to_next = to - (i + 1);
0178 return std::codecvt_base::partial;
0179 }
0180 ++from;
0181 }
0182 from_next = from;
0183 to_next = to;
0184
0185
0186 if (from == from_end)
0187 return std::codecvt_base::ok;
0188 else
0189 return std::codecvt_base::partial;
0190 }
0191
0192
0193
0194 BOOST_UTF8_DECL int utf8_codecvt_facet::do_length(
0195 std::mbstate_t &,
0196 const char * from,
0197 const char * from_end,
0198 std::size_t max_limit
0199 ) const
0200 #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
0201 throw()
0202 #endif
0203 {
0204 const char * from_next = from;
0205 for (std::size_t char_count = 0u; char_count < max_limit && from_next < from_end; ++char_count) {
0206 unsigned int octet_count = get_octet_count(*from_next);
0207
0208 if (octet_count > static_cast<std::size_t>(from_end - from_next))
0209 break;
0210 from_next += octet_count;
0211 }
0212
0213 return static_cast<int>(from_next - from);
0214 }
0215
0216 BOOST_UTF8_DECL unsigned int utf8_codecvt_facet::get_octet_count(
0217 unsigned char lead_octet
0218 ) {
0219
0220 if (lead_octet <= 0x7f) return 1;
0221
0222
0223
0224
0225 if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
0226 else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
0227 else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
0228 else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
0229 else return 6;
0230 }
0231
namespace detail {

// Number of continuation octets (octets after the leading one) used to
// encode 'word'. The template argument is expected to be sizeof(wchar_t).
//
// Primary template, used for every width other than 4: the result never
// exceeds 2.
template<std::size_t s>
inline int get_cont_octet_out_count_impl(wchar_t word) {
    return (word < 0x80) ? 0
         : (word < 0x800) ? 1
         : 2;
}

// Specialization for a 4-byte wchar_t, where up to 5 continuation octets
// may be needed.
template<>
inline int get_cont_octet_out_count_impl<4>(wchar_t word) {
    if (word < 0x80)
        return 0;
    if (word < 0x800)
        return 1;

    // The wide-range comparisons below are compiled only when WCHAR_MAX says
    // wchar_t can actually hold such values (and never for old MSVC), so
    // WCHAR_MAX must be visible at this point.
#if !defined(WCHAR_MAX)
#   error WCHAR_MAX not defined!
#endif

#if defined(_MSC_VER) && _MSC_VER <= 1310
    return 2;
#elif WCHAR_MAX > 0x10000

    // 'word' is known to be >= 0x800 here; test the thresholds from the
    // top down.
    if (word >= 0x4000000)
        return 5;
    if (word >= 0x200000)
        return 4;
    if (word >= 0x10000)
        return 3;
    return 2;

#else
    return 2;
#endif
}

} // namespace detail
0285
0286
0287
0288 BOOST_UTF8_DECL int utf8_codecvt_facet::get_cont_octet_out_count(
0289 wchar_t word
0290 ) {
0291 return detail::get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
0292 }
0293
0294 BOOST_UTF8_END_NAMESPACE
0295
0296 #endif