Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-07-01 08:18:35

0001 //
0002 // Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com)
0003 //
0004 // Distributed under the Boost Software License, Version 1.0. (See accompanying
0005 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
0006 //
0007 // Official repository: https://github.com/boostorg/json
0008 //
0009 
0010 #ifndef BOOST_JSON_DETAIL_UTF8_HPP
0011 #define BOOST_JSON_DETAIL_UTF8_HPP
0012 
0013 #include <boost/endian/conversion.hpp>
0014 #include <boost/json/detail/config.hpp>
0015 
0016 #include <cstddef>
0017 #include <cstring>
0018 #include <cstdint>
0019 
0020 namespace boost {
0021 namespace json {
0022 namespace detail {
0023 
0024 template<int N>
0025 std::uint32_t
0026 load_little_endian(void const* p)
0027 {
0028     std::uint32_t v = 0;
0029     std::memcpy(&v, p, N);
0030     endian::little_to_native_inplace(v);
0031     return v;
0032 }
0033 
// Classify a byte as the lead byte of a multi-byte UTF-8 sequence.
//
// The result packs two values: the low byte is the total sequence length
// and the high byte selects the range permitted for the second byte.
//
//   0x000  invalid lead byte
//   0x102  2 bytes, second byte in [80, BF]
//   0x203  3 bytes, second byte in [A0, BF]
//   0x303  3 bytes, second byte in [80, BF]
//   0x403  3 bytes, second byte in [80, 9F]
//   0x504  4 bytes, second byte in [90, BF]
//   0x604  4 bytes, second byte in [80, BF]
//   0x704  4 bytes, second byte in [80, 8F]
//
// Only the low seven bits of `c` select the table entry; the high bit is
// discarded. Consequently bytes 0x80-0xBF (and 0x00-0x3F) yield 0x000,
// while a byte in 0x40-0x7F yields the same value as the byte with the
// high bit set (0xC0-0xFF).
inline
uint16_t
classify_utf8(char c)
{
    static constexpr uint16_t lead_class[128]
    {
        // index 0x00-0x3F (bytes 80-BF): never a lead byte
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
        0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,

        // index 0x40-0x4F (bytes C0-CF)
        0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        // index 0x50-0x5F (bytes D0-DF)
        0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
        // index 0x60-0x6F (bytes E0-EF)
        0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303,
        // index 0x70-0x7F (bytes F0-FF)
        0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
    };

    // Drop the high bit so the value is always a valid index in [0, 127].
    const unsigned char slot = static_cast<unsigned char>(
        static_cast<unsigned char>(c) & 0x7F);
    return lead_class[slot];
}
0068 
0069 inline
0070 bool
0071 is_valid_utf8(const char* p, uint16_t first)
0072 {
0073     uint32_t v;
0074     switch(first >> 8)
0075     {
0076     default:
0077         return false;
0078 
0079     // 2 bytes, second byte [80, BF]
0080     case 1:
0081         v = load_little_endian<2>(p);
0082         return (v & 0xC000) == 0x8000;
0083 
0084     // 3 bytes, second byte [A0, BF]
0085     case 2:
0086         v = load_little_endian<3>(p);
0087         return (v & 0xC0E000) == 0x80A000;
0088 
0089     // 3 bytes, second byte [80, BF]
0090     case 3:
0091         v = load_little_endian<3>(p);
0092         return (v & 0xC0C000) == 0x808000;
0093 
0094     // 3 bytes, second byte [80, 9F]
0095     case 4:
0096         v = load_little_endian<3>(p);
0097         return (v & 0xC0E000) == 0x808000;
0098 
0099     // 4 bytes, second byte [90, BF]
0100     case 5:
0101         v = load_little_endian<4>(p);
0102         return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
0103 
0104     // 4 bytes, second byte [80, BF]
0105     case 6:
0106         v = load_little_endian<4>(p);
0107         return (v & 0xC0C0C000) == 0x80808000;
0108 
0109     // 4 bytes, second byte [80, 8F]
0110     case 7:
0111         v = load_little_endian<4>(p);
0112         return (v & 0xC0C0F000) == 0x80808000;
0113     }
0114 }
0115 
0116 class utf8_sequence
0117 {
0118     char seq_[4];
0119     uint16_t first_;
0120     uint8_t size_;
0121 
0122 public:
0123     void
0124     save(
0125         const char* p,
0126         std::size_t remain) noexcept
0127     {
0128         first_ = classify_utf8(*p );
0129         if(remain >= length())
0130             size_ = length();
0131         else
0132             size_ = static_cast<uint8_t>(remain);
0133         std::memcpy(seq_, p, size_);
0134     }
0135 
0136     uint8_t
0137     length() const noexcept
0138     {
0139         return first_ & 0xFF;
0140     }
0141 
0142     bool
0143     complete() const noexcept
0144     {
0145         return size_ >= length();
0146     }
0147 
0148     // returns true if complete
0149     bool
0150     append(
0151         const char* p,
0152         std::size_t remain) noexcept
0153     {
0154         if(BOOST_JSON_UNLIKELY(needed() == 0))
0155             return true;
0156         if(BOOST_JSON_LIKELY(remain >= needed()))
0157         {
0158             std::memcpy(
0159                 seq_ + size_, p, needed());
0160             size_ = length();
0161             return true;
0162         }
0163         if(BOOST_JSON_LIKELY(remain > 0))
0164         {
0165             std::memcpy(seq_ + size_, p, remain);
0166             size_ += static_cast<uint8_t>(remain);
0167         }
0168         return false;
0169     }
0170 
0171     const char*
0172     data() const noexcept
0173     {
0174         return seq_;
0175     }
0176 
0177     uint8_t
0178     needed() const noexcept
0179     {
0180         return length() - size_;
0181     }
0182 
0183     bool
0184     valid() const noexcept
0185     {
0186         BOOST_ASSERT(size_ >= length());
0187         return is_valid_utf8(seq_, first_);
0188     }
0189 };
0190 
0191 } // detail
0192 } // namespace json
0193 } // namespace boost
0194 
0195 #endif