Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:39:00

0001 //
0002 // Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com)
0003 //
0004 // Distributed under the Boost Software License, Version 1.0. (See accompanying
0005 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
0006 //
0007 // Official repository: https://github.com/boostorg/json
0008 //
0009 
0010 #ifndef BOOST_JSON_DETAIL_UTF8_HPP
0011 #define BOOST_JSON_DETAIL_UTF8_HPP
0012 
0013 #include <boost/json/detail/config.hpp>
0014 
0015 #include <cstddef>
0016 #include <cstring>
0017 #include <cstdint>
0018 
0019 namespace boost {
0020 namespace json {
0021 namespace detail {
0022 
// Read the first N bytes at `p` as a little-endian unsigned integer.
//
// The bytes are copied over the start of a zero-initialised 32-bit
// value, so whatever the copy does not cover stays zero. When
// BOOST_JSON_BIG_ENDIAN is defined, the four bytes of the value are
// reversed afterwards so that p[0] still ends up in bits 0-7 of the
// result.
//
// N must not exceed 4 (the size of the destination); this is not
// checked. `p` must point to at least N readable bytes; since the
// bytes are copied with memcpy, no particular alignment is needed.
template<int N>
std::uint32_t
load_little_endian(void const* p)
{
    std::uint32_t word = 0;
    std::memcpy(&word, p, N);
#ifdef BOOST_JSON_BIG_ENDIAN
    // full 32-bit byte reversal, written as shift-then-mask
    word = (word >> 24) |
           ((word >> 8) & 0x0000FF00) |
           ((word << 8) & 0x00FF0000) |
           (word << 24);
#endif
    return word;
}
0037 
// Classify a byte as the leading byte of a multi-byte UTF-8 sequence.
//
// The result packs two fields: the low 8 bits are the total length of
// the sequence in bytes, the high 8 bits are a class number that tells
// which range the second byte has to fall into:
//
//   0x000 = invalid
//   0x102 = 2 bytes, second byte [80, BF]
//   0x203 = 3 bytes, second byte [A0, BF]
//   0x303 = 3 bytes, second byte [80, BF]
//   0x403 = 3 bytes, second byte [80, 9F]
//   0x504 = 4 bytes, second byte [90, BF]
//   0x604 = 4 bytes, second byte [80, BF]
//   0x704 = 4 bytes, second byte [80, 8F]
//
// Only the low seven bits of `c` select the table entry, i.e. the
// table describes the bytes 0x80..0xFF. A byte below 0x80 is not
// treated specially: it yields the entry of that byte with the high
// bit set. NOTE(review): presumably callers only pass bytes >= 0x80 -
// confirm at the call sites.
inline
uint16_t
classify_utf8(char c)
{
    // short aliases so that each table row shows eight lead bytes
    constexpr uint16_t bad = 0x000;
    constexpr uint16_t two = 0x102;
    constexpr uint16_t e0x = 0x203;
    constexpr uint16_t e1x = 0x303;
    constexpr uint16_t edx = 0x403;
    constexpr uint16_t f0x = 0x504;
    constexpr uint16_t f1x = 0x604;
    constexpr uint16_t f4x = 0x704;

    static constexpr uint16_t table[128] =
    {
        bad, bad, bad, bad, bad, bad, bad, bad, // 80..87
        bad, bad, bad, bad, bad, bad, bad, bad, // 88..8F
        bad, bad, bad, bad, bad, bad, bad, bad, // 90..97
        bad, bad, bad, bad, bad, bad, bad, bad, // 98..9F
        bad, bad, bad, bad, bad, bad, bad, bad, // A0..A7
        bad, bad, bad, bad, bad, bad, bad, bad, // A8..AF
        bad, bad, bad, bad, bad, bad, bad, bad, // B0..B7
        bad, bad, bad, bad, bad, bad, bad, bad, // B8..BF

        bad, bad, two, two, two, two, two, two, // C0..C7
        two, two, two, two, two, two, two, two, // C8..CF
        two, two, two, two, two, two, two, two, // D0..D7
        two, two, two, two, two, two, two, two, // D8..DF
        e0x, e1x, e1x, e1x, e1x, e1x, e1x, e1x, // E0..E7
        e1x, e1x, e1x, e1x, e1x, edx, e1x, e1x, // E8..EF
        f0x, f1x, f1x, f1x, f4x, bad, bad, bad, // F0..F7
        bad, bad, bad, bad, bad, bad, bad, bad, // F8..FF
    };

    // drop the high bit to get an index in [0, 127]
    return table[static_cast<unsigned char>(c) & 0x7F];
}
0072 
0073 inline
0074 bool
0075 is_valid_utf8(const char* p, uint16_t first)
0076 {
0077     uint32_t v;
0078     switch(first >> 8)
0079     {
0080     default:
0081         return false;
0082 
0083     // 2 bytes, second byte [80, BF]
0084     case 1:
0085         v = load_little_endian<2>(p);
0086         return (v & 0xC000) == 0x8000;
0087 
0088     // 3 bytes, second byte [A0, BF]
0089     case 2:
0090         v = load_little_endian<3>(p);
0091         return (v & 0xC0E000) == 0x80A000;
0092 
0093     // 3 bytes, second byte [80, BF]
0094     case 3:
0095         v = load_little_endian<3>(p);
0096         return (v & 0xC0C000) == 0x808000;
0097 
0098     // 3 bytes, second byte [80, 9F]
0099     case 4:
0100         v = load_little_endian<3>(p);
0101         return (v & 0xC0E000) == 0x808000;
0102 
0103     // 4 bytes, second byte [90, BF]
0104     case 5:
0105         v = load_little_endian<4>(p);
0106         return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
0107 
0108     // 4 bytes, second byte [80, BF]
0109     case 6:
0110         v = load_little_endian<4>(p);
0111         return (v & 0xC0C0C000) == 0x80808000;
0112 
0113     // 4 bytes, second byte [80, 8F]
0114     case 7:
0115         v = load_little_endian<4>(p);
0116         return (v & 0xC0C0F000) == 0x80808000;
0117     }
0118 }
0119 
0120 class utf8_sequence
0121 {
0122     char seq_[4];
0123     uint16_t first_;
0124     uint8_t size_;
0125 
0126 public:
0127     void
0128     save(
0129         const char* p,
0130         std::size_t remain) noexcept
0131     {
0132         first_ = classify_utf8(*p );
0133         if(remain >= length())
0134             size_ = length();
0135         else
0136             size_ = static_cast<uint8_t>(remain);
0137         std::memcpy(seq_, p, size_);
0138     }
0139 
0140     uint8_t
0141     length() const noexcept
0142     {
0143         return first_ & 0xFF;
0144     }
0145 
0146     bool
0147     complete() const noexcept
0148     {
0149         return size_ >= length();
0150     }
0151 
0152     // returns true if complete
0153     bool
0154     append(
0155         const char* p,
0156         std::size_t remain) noexcept
0157     {
0158         if(BOOST_JSON_UNLIKELY(needed() == 0))
0159             return true;
0160         if(BOOST_JSON_LIKELY(remain >= needed()))
0161         {
0162             std::memcpy(
0163                 seq_ + size_, p, needed());
0164             size_ = length();
0165             return true;
0166         }
0167         if(BOOST_JSON_LIKELY(remain > 0))
0168         {
0169             std::memcpy(seq_ + size_, p, remain);
0170             size_ += static_cast<uint8_t>(remain);
0171         }
0172         return false;
0173     }
0174 
0175     const char*
0176     data() const noexcept
0177     {
0178         return seq_;
0179     }
0180 
0181     uint8_t
0182     needed() const noexcept
0183     {
0184         return length() - size_;
0185     }
0186 
0187     bool
0188     valid() const noexcept
0189     {
0190         BOOST_ASSERT(size_ >= length());
0191         return is_valid_utf8(seq_, first_);
0192     }
0193 };
0194 
0195 } // detail
0196 } // namespace json
0197 } // namespace boost
0198 
0199 #endif