File indexing completed on 2025-07-01 08:18:35
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010 #ifndef BOOST_JSON_DETAIL_UTF8_HPP
0011 #define BOOST_JSON_DETAIL_UTF8_HPP
0012
0013 #include <boost/endian/conversion.hpp>
0014 #include <boost/json/detail/config.hpp>
0015
0016 #include <cstddef>
0017 #include <cstring>
0018 #include <cstdint>
0019
0020 namespace boost {
0021 namespace json {
0022 namespace detail {
0023
0024 template<int N>
0025 std::uint32_t
0026 load_little_endian(void const* p)
0027 {
0028 std::uint32_t v = 0;
0029 std::memcpy(&v, p, N);
0030 endian::little_to_native_inplace(v);
0031 return v;
0032 }
0033
// Classify the lead byte of a multi-byte UTF-8 sequence.
//
// The returned value packs two fields:
//   - bits 0-7:  total length of the sequence in bytes (0 if invalid)
//   - bits 8-15: class number used by is_valid_utf8 to choose the
//                permitted range of the second byte
//
//   0x000 = not a valid lead byte
//   0x102 = 2 bytes, second byte [80, BF]
//   0x203 = 3 bytes, second byte [A0, BF]
//   0x303 = 3 bytes, second byte [80, BF]
//   0x403 = 3 bytes, second byte [80, 9F]
//   0x504 = 4 bytes, second byte [90, BF]
//   0x604 = 4 bytes, second byte [80, BF]
//   0x704 = 4 bytes, second byte [80, 8F]
//
// Only the low seven bits of `c` select the table entry, so the table
// is laid out for bytes 0x80..0xFF. A byte below 0x80 is not rejected;
// it receives the entry of the same byte with its top bit set.
inline
uint16_t
classify_utf8(char c)
{
    static constexpr uint16_t lead_class[128] =
    {
        // 0x80..0xBF: continuation bytes, never a lead byte
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 80..87
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 88..8F
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 90..97
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // 98..9F
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // A0..A7
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // A8..AF
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // B0..B7
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // B8..BF

        // 0xC0..0xDF: C0 and C1 are rejected, C2..DF start 2 bytes
        0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, // C0..C7
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, // C8..CF
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, // D0..D7
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, // D8..DF

        // 0xE0..0xEF: 3 bytes, E0 and ED restrict the second byte
        0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, // E0..E7
        0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303, // E8..EF

        // 0xF0..0xF4: 4 bytes; 0xF5..0xFF are rejected
        0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000, // F0..F7
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, // F8..FF
    };

    // Drop the top bit so that 0x80..0xFF maps onto slots 0..127.
    auto const slot = static_cast<unsigned char>(c & 0x7F);
    return lead_class[slot];
}
0068
0069 inline
0070 bool
0071 is_valid_utf8(const char* p, uint16_t first)
0072 {
0073 uint32_t v;
0074 switch(first >> 8)
0075 {
0076 default:
0077 return false;
0078
0079
0080 case 1:
0081 v = load_little_endian<2>(p);
0082 return (v & 0xC000) == 0x8000;
0083
0084
0085 case 2:
0086 v = load_little_endian<3>(p);
0087 return (v & 0xC0E000) == 0x80A000;
0088
0089
0090 case 3:
0091 v = load_little_endian<3>(p);
0092 return (v & 0xC0C000) == 0x808000;
0093
0094
0095 case 4:
0096 v = load_little_endian<3>(p);
0097 return (v & 0xC0E000) == 0x808000;
0098
0099
0100 case 5:
0101 v = load_little_endian<4>(p);
0102 return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
0103
0104
0105 case 6:
0106 v = load_little_endian<4>(p);
0107 return (v & 0xC0C0C000) == 0x80808000;
0108
0109
0110 case 7:
0111 v = load_little_endian<4>(p);
0112 return (v & 0xC0C0F000) == 0x80808000;
0113 }
0114 }
0115
0116 class utf8_sequence
0117 {
0118 char seq_[4];
0119 uint16_t first_;
0120 uint8_t size_;
0121
0122 public:
0123 void
0124 save(
0125 const char* p,
0126 std::size_t remain) noexcept
0127 {
0128 first_ = classify_utf8(*p );
0129 if(remain >= length())
0130 size_ = length();
0131 else
0132 size_ = static_cast<uint8_t>(remain);
0133 std::memcpy(seq_, p, size_);
0134 }
0135
0136 uint8_t
0137 length() const noexcept
0138 {
0139 return first_ & 0xFF;
0140 }
0141
0142 bool
0143 complete() const noexcept
0144 {
0145 return size_ >= length();
0146 }
0147
0148
0149 bool
0150 append(
0151 const char* p,
0152 std::size_t remain) noexcept
0153 {
0154 if(BOOST_JSON_UNLIKELY(needed() == 0))
0155 return true;
0156 if(BOOST_JSON_LIKELY(remain >= needed()))
0157 {
0158 std::memcpy(
0159 seq_ + size_, p, needed());
0160 size_ = length();
0161 return true;
0162 }
0163 if(BOOST_JSON_LIKELY(remain > 0))
0164 {
0165 std::memcpy(seq_ + size_, p, remain);
0166 size_ += static_cast<uint8_t>(remain);
0167 }
0168 return false;
0169 }
0170
0171 const char*
0172 data() const noexcept
0173 {
0174 return seq_;
0175 }
0176
0177 uint8_t
0178 needed() const noexcept
0179 {
0180 return length() - size_;
0181 }
0182
0183 bool
0184 valid() const noexcept
0185 {
0186 BOOST_ASSERT(size_ >= length());
0187 return is_valid_utf8(seq_, first_);
0188 }
0189 };
0190
0191 }
0192 }
0193 }
0194
0195 #endif