File indexing completed on 2025-01-18 09:39:00
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010 #ifndef BOOST_JSON_DETAIL_UTF8_HPP
0011 #define BOOST_JSON_DETAIL_UTF8_HPP
0012
0013 #include <boost/json/detail/config.hpp>
0014
0015 #include <cstddef>
0016 #include <cstring>
0017 #include <cstdint>
0018
0019 namespace boost {
0020 namespace json {
0021 namespace detail {
0022
// Reads the first N bytes at `p` and returns them as a 32-bit value in
// which the byte at p[0] occupies the least significant 8 bits, p[1] the
// next 8 bits, and so on. Bytes beyond the N that were read are zero.
//
// N is used directly as the memcpy length into a 4-byte object, so it is
// only meaningful for 1 <= N <= 4; `p` must reference at least N readable
// bytes.
template<int N>
std::uint32_t
load_little_endian(void const* p)
{
    std::uint32_t word = 0;
    std::memcpy(&word, p, N);
#ifdef BOOST_JSON_BIG_ENDIAN
    // The copied bytes landed in the high-order end of `word`; reversing
    // all four bytes moves p[0] down into the low-order position.
    word = (word >> 24) |
           ((word >> 8) & 0x0000FF00u) |
           ((word << 8) & 0x00FF0000u) |
           (word << 24);
#endif
    return word;
}
0037
// Looks up the class of a UTF-8 lead byte.
//
// The result packs two fields:
//   - low  8 bits: total length of the sequence in bytes (2, 3 or 4);
//   - high 8 bits: a class number (1..7) that is_valid_utf8() switches on
//     to pick the permitted range of the second byte.
// A result of 0x000 marks a byte that cannot start a multi-byte sequence.
//
// Only the low 7 bits of `c` select the table entry, i.e. the table is
// laid out for bytes 0x80..0xFF. An ASCII byte therefore yields the entry
// of (c | 0x80); callers are presumably expected to pass only bytes with
// the high bit set.
inline
uint16_t
classify_utf8(char c)
{
    constexpr uint16_t k0 = 0x000; // not a valid lead byte
    constexpr uint16_t k1 = 0x102; // 2 bytes, second byte in [80, BF]
    constexpr uint16_t k2 = 0x203; // 3 bytes, second byte in [A0, BF]
    constexpr uint16_t k3 = 0x303; // 3 bytes, second byte in [80, BF]
    constexpr uint16_t k4 = 0x403; // 3 bytes, second byte in [80, 9F]
    constexpr uint16_t k5 = 0x504; // 4 bytes, second byte in [90, BF]
    constexpr uint16_t k6 = 0x604; // 4 bytes, second byte in [80, BF]
    constexpr uint16_t k7 = 0x704; // 4 bytes, second byte in [80, 8F]

    static constexpr uint16_t table[128] =
    {
        k0, k0, k0, k0, k0, k0, k0, k0, // 80..87  (continuation bytes)
        k0, k0, k0, k0, k0, k0, k0, k0, // 88..8F
        k0, k0, k0, k0, k0, k0, k0, k0, // 90..97
        k0, k0, k0, k0, k0, k0, k0, k0, // 98..9F
        k0, k0, k0, k0, k0, k0, k0, k0, // A0..A7
        k0, k0, k0, k0, k0, k0, k0, k0, // A8..AF
        k0, k0, k0, k0, k0, k0, k0, k0, // B0..B7
        k0, k0, k0, k0, k0, k0, k0, k0, // B8..BF

        k0, k0, k1, k1, k1, k1, k1, k1, // C0..C7  (C0, C1 are rejected)
        k1, k1, k1, k1, k1, k1, k1, k1, // C8..CF
        k1, k1, k1, k1, k1, k1, k1, k1, // D0..D7
        k1, k1, k1, k1, k1, k1, k1, k1, // D8..DF
        k2, k3, k3, k3, k3, k3, k3, k3, // E0..E7
        k3, k3, k3, k3, k3, k4, k3, k3, // E8..EF  (ED is restricted)
        k5, k6, k6, k6, k7, k0, k0, k0, // F0..F7  (F5.. are rejected)
        k0, k0, k0, k0, k0, k0, k0, k0, // F8..FF
    };

    const std::size_t slot = static_cast<unsigned char>(c) & 0x7Fu;
    return table[slot];
}
0072
0073 inline
0074 bool
0075 is_valid_utf8(const char* p, uint16_t first)
0076 {
0077 uint32_t v;
0078 switch(first >> 8)
0079 {
0080 default:
0081 return false;
0082
0083
0084 case 1:
0085 v = load_little_endian<2>(p);
0086 return (v & 0xC000) == 0x8000;
0087
0088
0089 case 2:
0090 v = load_little_endian<3>(p);
0091 return (v & 0xC0E000) == 0x80A000;
0092
0093
0094 case 3:
0095 v = load_little_endian<3>(p);
0096 return (v & 0xC0C000) == 0x808000;
0097
0098
0099 case 4:
0100 v = load_little_endian<3>(p);
0101 return (v & 0xC0E000) == 0x808000;
0102
0103
0104 case 5:
0105 v = load_little_endian<4>(p);
0106 return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
0107
0108
0109 case 6:
0110 v = load_little_endian<4>(p);
0111 return (v & 0xC0C0C000) == 0x80808000;
0112
0113
0114 case 7:
0115 v = load_little_endian<4>(p);
0116 return (v & 0xC0C0F000) == 0x80808000;
0117 }
0118 }
0119
0120 class utf8_sequence
0121 {
0122 char seq_[4];
0123 uint16_t first_;
0124 uint8_t size_;
0125
0126 public:
0127 void
0128 save(
0129 const char* p,
0130 std::size_t remain) noexcept
0131 {
0132 first_ = classify_utf8(*p );
0133 if(remain >= length())
0134 size_ = length();
0135 else
0136 size_ = static_cast<uint8_t>(remain);
0137 std::memcpy(seq_, p, size_);
0138 }
0139
0140 uint8_t
0141 length() const noexcept
0142 {
0143 return first_ & 0xFF;
0144 }
0145
0146 bool
0147 complete() const noexcept
0148 {
0149 return size_ >= length();
0150 }
0151
0152
0153 bool
0154 append(
0155 const char* p,
0156 std::size_t remain) noexcept
0157 {
0158 if(BOOST_JSON_UNLIKELY(needed() == 0))
0159 return true;
0160 if(BOOST_JSON_LIKELY(remain >= needed()))
0161 {
0162 std::memcpy(
0163 seq_ + size_, p, needed());
0164 size_ = length();
0165 return true;
0166 }
0167 if(BOOST_JSON_LIKELY(remain > 0))
0168 {
0169 std::memcpy(seq_ + size_, p, remain);
0170 size_ += static_cast<uint8_t>(remain);
0171 }
0172 return false;
0173 }
0174
0175 const char*
0176 data() const noexcept
0177 {
0178 return seq_;
0179 }
0180
0181 uint8_t
0182 needed() const noexcept
0183 {
0184 return length() - size_;
0185 }
0186
0187 bool
0188 valid() const noexcept
0189 {
0190 BOOST_ASSERT(size_ >= length());
0191 return is_valid_utf8(seq_, first_);
0192 }
0193 };
0194
0195 }
0196 }
0197 }
0198
0199 #endif