Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:29:33

0001 //
0002 // Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com)
0003 //
0004 // Distributed under the Boost Software License, Version 1.0. (See accompanying
0005 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
0006 //
0007 // Official repository: https://github.com/boostorg/beast
0008 //
0009 
0010 #ifndef BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP
0011 #define BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP
0012 
#include <boost/beast/websocket/detail/utf8_checker.hpp>

#include <boost/assert.hpp>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
0016 
0017 namespace boost {
0018 namespace beast {
0019 namespace websocket {
0020 namespace detail {
0021 
0022 void
0023 utf8_checker::
0024 reset()
0025 {
0026     need_ = 0;
0027     p_ = cp_;
0028 }
0029 
0030 bool
0031 utf8_checker::
0032 finish()
0033 {
0034     auto const success = need_ == 0;
0035     reset();
0036     return success;
0037 }
0038 
0039 bool
0040 utf8_checker::
0041 write(std::uint8_t const* in, std::size_t size)
0042 {
0043     auto const valid =
0044         [](std::uint8_t const*& p)
0045         {
0046             if(p[0] < 128)
0047             {
0048                 ++p;
0049                 return true;
0050             }
0051             if((p[0] & 0xe0) == 0xc0)
0052             {
0053                 if( (p[1] & 0xc0) != 0x80 ||
0054                     (p[0] & 0x1e) == 0)  // overlong
0055                     return false;
0056                 p += 2;
0057                 return true;
0058             }
0059             if((p[0] & 0xf0) == 0xe0)
0060             {
0061                 if(    (p[1] & 0xc0) != 0x80
0062                     || (p[2] & 0xc0) != 0x80
0063                     || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong
0064                     || (p[0] == 0xed && (p[1] & 0x20) == 0x20) // surrogate
0065                     //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF
0066                     )
0067                     return false;
0068                 p += 3;
0069                 return true;
0070             }
0071             if((p[0] & 0xf8) == 0xf0)
0072             {
0073                 if(    (p[0] & 0x07) >= 0x05 // invalid F5...FF characters
0074                     || (p[1] & 0xc0) != 0x80
0075                     || (p[2] & 0xc0) != 0x80
0076                     || (p[3] & 0xc0) != 0x80
0077                     || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong
0078                     || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4 // > U+10FFFF
0079                     )
0080                     return false;
0081                 p += 4;
0082                 return true;
0083             }
0084             return false;
0085         };
0086     auto const fail_fast =
0087         [&]()
0088         {
0089             if(cp_[0] < 128)
0090             {
0091                 return false;
0092             }
0093 
0094             const auto& p = cp_; // alias, only to keep this code similar to valid() above
0095             const auto known_only = p_ - cp_;
0096             if (known_only == 1)
0097             {
0098                 if((p[0] & 0xe0) == 0xc0)
0099                 {
0100                     return ((p[0] & 0x1e) == 0);  // overlong
0101                 }
0102                 if((p[0] & 0xf0) == 0xe0)
0103                 {
0104                     return false;
0105                 }
0106                 if((p[0] & 0xf8) == 0xf0)
0107                 {
0108                     return ((p[0] & 0x07) >= 0x05);  // invalid F5...FF characters
0109                 }
0110             }
0111             else if (known_only == 2)
0112             {
0113                 if((p[0] & 0xe0) == 0xc0)
0114                 {
0115                     return ((p[1] & 0xc0) != 0x80 ||
0116                             (p[0] & 0x1e) == 0);  // overlong
0117                 }
0118                 if((p[0] & 0xf0) == 0xe0)
0119                 {
0120                     return (  (p[1] & 0xc0) != 0x80
0121                            || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong
0122                            || (p[0] == 0xed && (p[1] & 0x20) == 0x20)); // surrogate
0123                 }
0124                 if((p[0] & 0xf8) == 0xf0)
0125                 {
0126                     return (  (p[0] & 0x07) >= 0x05 // invalid F5...FF characters
0127                            || (p[1] & 0xc0) != 0x80
0128                            || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong
0129                            || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4); // > U+10FFFF
0130                 }
0131             }
0132             else if (known_only == 3)
0133             {
0134                 if((p[0] & 0xe0) == 0xc0)
0135                 {
0136                     return (  (p[1] & 0xc0) != 0x80
0137                            || (p[0] & 0x1e) == 0);  // overlong
0138                 }
0139                 if((p[0] & 0xf0) == 0xe0)
0140                 {
0141                     return (  (p[1] & 0xc0) != 0x80
0142                            || (p[2] & 0xc0) != 0x80
0143                            || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong
0144                            || (p[0] == 0xed && (p[1] & 0x20) == 0x20)); // surrogate
0145                            //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF
0146                 }
0147                 if((p[0] & 0xf8) == 0xf0)
0148                 {
0149                     return (  (p[0] & 0x07) >= 0x05 // invalid F5...FF characters
0150                            || (p[1] & 0xc0) != 0x80
0151                            || (p[2] & 0xc0) != 0x80
0152                            || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong
0153                            || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4); // > U+10FFFF
0154                 }
0155             }
0156             return true;
0157         };
0158     auto const needed =
0159         [](std::uint8_t const v)
0160         {
0161             if(v < 128)
0162                 return 1;
0163             if(v < 192)
0164                 return 0;
0165             if(v < 224)
0166                 return 2;
0167             if(v < 240)
0168                 return 3;
0169             if(v < 248)
0170                 return 4;
0171             return 0;
0172         };
0173 
0174     auto const end = in + size;
0175 
0176     // Finish up any incomplete code point
0177     if(need_ > 0)
0178     {
0179         // Calculate what we have
0180         auto n = (std::min)(size, need_);
0181         size -= n;
0182         need_ -= n;
0183 
0184         // Add characters to the code point
0185         while(n--)
0186             *p_++ = *in++;
0187         BOOST_ASSERT(p_ <= cp_ + 4);
0188 
0189         // Still incomplete?
0190         if(need_ > 0)
0191         {
0192             // Incomplete code point
0193             BOOST_ASSERT(in == end);
0194 
0195             // Do partial validation on the incomplete
0196             // code point, this is called "Fail fast"
0197             // in Autobahn|Testsuite parlance.
0198             return ! fail_fast();
0199         }
0200 
0201         // Complete code point, validate it
0202         std::uint8_t const* p = &cp_[0];
0203         if(! valid(p))
0204             return false;
0205         p_ = cp_;
0206     }
0207 
0208     if(size <= sizeof(std::size_t))
0209         goto slow;
0210 
0211     // Align `in` to sizeof(std::size_t) boundary
0212     {
0213         auto const in0 = in;
0214         auto last = reinterpret_cast<std::uint8_t const*>(
0215             ((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
0216                 sizeof(std::size_t)) * sizeof(std::size_t));
0217 
0218         // Check one character at a time for low-ASCII
0219         while(in < last)
0220         {
0221             if(*in & 0x80)
0222             {
0223                 // Not low-ASCII so switch to slow loop
0224                 size = size - (in - in0);
0225                 goto slow;
0226             }
0227             ++in;
0228         }
0229         size = size - (in - in0);
0230     }
0231 
0232     // Fast loop: Process 4 or 8 low-ASCII characters at a time
0233     {
0234         auto const in0 = in;
0235         auto last = in + size - 7;
0236         auto constexpr mask = static_cast<
0237             std::size_t>(0x8080808080808080 & ~std::size_t{0});
0238         while(in < last)
0239         {
0240 #if 0
0241             std::size_t temp;
0242             std::memcpy(&temp, in, sizeof(temp));
0243             if((temp & mask) != 0)
0244 #else
0245             // Technically UB but works on all known platforms
0246             if((*reinterpret_cast<std::size_t const*>(in) & mask) != 0)
0247 #endif
0248             {
0249                 size = size - (in - in0);
0250                 goto slow;
0251             }
0252             in += sizeof(std::size_t);
0253         }
0254         // There's at least one more full code point left
0255         last += 4;
0256         while(in < last)
0257             if(! valid(in))
0258                 return false;
0259         goto tail;
0260     }
0261 
0262 slow:
0263     // Slow loop: Full validation on one code point at a time
0264     {
0265         auto last = in + size - 3;
0266         while(in < last)
0267             if(! valid(in))
0268                 return false;
0269     }
0270 
0271 tail:
0272     // Handle the remaining bytes. The last
0273     // characters could split a code point so
0274     // we save the partial code point for later.
0275     //
0276     // On entry to the loop, `in` points to the
0277     // beginning of a code point.
0278     //
0279     for(;;)
0280     {
0281         // Number of chars left
0282         auto n = end - in;
0283         if(! n)
0284             break;
0285 
0286         // Chars we need to finish this code point
0287         auto const need = needed(*in);
0288         if(need == 0)
0289             return false;
0290         if(need <= n)
0291         {
0292             // Check a whole code point
0293             if(! valid(in))
0294                 return false;
0295         }
0296         else
0297         {
0298             // Calculate how many chars we need
0299             // to finish this partial code point
0300             need_ = need - n;
0301 
0302             // Save the partial code point
0303             while(n--)
0304                 *p_++ = *in++;
0305             BOOST_ASSERT(in == end);
0306             BOOST_ASSERT(p_ <= cp_ + 4);
0307 
0308             // Do partial validation on the incomplete
0309             // code point, this is called "Fail fast"
0310             // in Autobahn|Testsuite parlance.
0311             return ! fail_fast();
0312         }
0313     }
0314     return true;
0315 }
0316 
0317 bool
0318 check_utf8(char const* p, std::size_t n)
0319 {
0320     utf8_checker c;
0321     if(! c.write(reinterpret_cast<const uint8_t*>(p), n))
0322         return false;
0323     return c.finish();
0324 }
0325 
0326 } // detail
0327 } // websocket
0328 } // beast
0329 } // boost
0330 
0331 #endif // BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP