detail/fast_float/ascii_number.hpp

0001 // Copyright 2020-2023 Daniel Lemire
0002 // Copyright 2023 Matt Borland
0003 // Distributed under the Boost Software License, Version 1.0.
0004 // https://www.boost.org/LICENSE_1_0.txt
0005 //
0006 // Derivative of: https://github.com/fastfloat/fast_float
0007
0008 #ifndef BOOST_CHARCONV_DETAIL_FASTFLOAT_ASCII_NUMBER_HPP
0009 #define BOOST_CHARCONV_DETAIL_FASTFLOAT_ASCII_NUMBER_HPP
0010
0011 #include <boost/charconv/detail/fast_float/float_common.hpp>
0012 #include <cctype>
0013 #include <cstdint>
0014 #include <cstring>
0015 #include <iterator>
0016
0017 namespace boost { namespace charconv { namespace detail { namespace fast_float {
0018
0019 // Next function can be micro-optimized, but compilers are entirely
0020 // able to optimize it well.
0021 template <typename UC>
0022 BOOST_FORCEINLINE constexpr bool is_integer(UC c) noexcept {
0023   return !(c > UC('9') || c < UC('0'));
0024 }
0025
0026 BOOST_FORCEINLINE constexpr uint64_t byteswap(uint64_t val) {
0027   return (val & 0xFF00000000000000) >> 56
0028     | (val & 0x00FF000000000000) >> 40
0029     | (val & 0x0000FF0000000000) >> 24
0030     | (val & 0x000000FF00000000) >> 8
0031     | (val & 0x00000000FF000000) << 8
0032     | (val & 0x0000000000FF0000) << 24
0033     | (val & 0x000000000000FF00) << 40
0034     | (val & 0x00000000000000FF) << 56;
0035 }
0036
0037 BOOST_FORCEINLINE BOOST_CHARCONV_FASTFLOAT_CONSTEXPR20
0038 uint64_t read_u64(const char *chars) {
0039   if (cpp20_and_in_constexpr()) {
0040     uint64_t val = 0;
0041     for(int i = 0; i < 8; ++i) {
0042       val |= uint64_t(*chars) << (i*8);
0043       ++chars;
0044     }
0045     return val;
0046   }
0047   uint64_t val;
0048   ::memcpy(&val, chars, sizeof(uint64_t));
0049 #if BOOST_CHARCONV_FASTFLOAT_IS_BIG_ENDIAN == 1
0050   // Need to read as-if the number was in little-endian order.
0051   val = byteswap(val);
0052 #endif
0053   return val;
0054 }
0055
0056 BOOST_FORCEINLINE BOOST_CHARCONV_FASTFLOAT_CONSTEXPR20
0057 void write_u64(uint8_t *chars, uint64_t val) {
0058   if (cpp20_and_in_constexpr()) {
0059     for(int i = 0; i < 8; ++i) {
0060       *chars = uint8_t(val);
0061       val >>= 8;
0062       ++chars;
0063     }
0064     return;
0065   }
0066 #if BOOST_CHARCONV_FASTFLOAT_IS_BIG_ENDIAN == 1
0067   // Need to read as-if the number was in little-endian order.
0068   val = byteswap(val);
0069 #endif
0070   ::memcpy(chars, &val, sizeof(uint64_t));
0071 }
0072
0073 // credit  @aqrit
0074 BOOST_FORCEINLINE BOOST_CHARCONV_FASTFLOAT_CONSTEXPR14
0075 uint32_t parse_eight_digits_unrolled(uint64_t val) {
0076   constexpr uint64_t mask = 0x000000FF000000FF;
0077   constexpr uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
0078   constexpr uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
0079   val -= 0x3030303030303030;
0080   val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
0081   val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
0082   return uint32_t(val);
0083 }
0084
0085 BOOST_FORCEINLINE constexpr
0086 uint32_t parse_eight_digits_unrolled(const char16_t *)  noexcept  {
0087   return 0;
0088 }
0089
0090 BOOST_FORCEINLINE constexpr
0091 uint32_t parse_eight_digits_unrolled(const char32_t *)  noexcept  {
0092   return 0;
0093 }
0094
0095 BOOST_FORCEINLINE BOOST_CHARCONV_FASTFLOAT_CONSTEXPR20
0096 uint32_t parse_eight_digits_unrolled(const char *chars)  noexcept  {
0097   return parse_eight_digits_unrolled(read_u64(chars));
0098 }
0099
0100 // credit @aqrit
0101 BOOST_FORCEINLINE constexpr bool is_made_of_eight_digits_fast(uint64_t val)  noexcept  {
0102   return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) & 0x8080808080808080));
0103 }
0104
0105 BOOST_FORCEINLINE constexpr
0106 bool is_made_of_eight_digits_fast(const char16_t *)  noexcept  {
0107   return false;
0108 }
0109
0110 BOOST_FORCEINLINE constexpr
0111 bool is_made_of_eight_digits_fast(const char32_t *)  noexcept  {
0112   return false;
0113 }
0114
0115 BOOST_FORCEINLINE BOOST_CHARCONV_FASTFLOAT_CONSTEXPR20
0116 bool is_made_of_eight_digits_fast(const char *chars)  noexcept  {
0117   return is_made_of_eight_digits_fast(read_u64(chars));
0118 }
0119
0120 template <typename UC>
0121 struct parsed_number_string_t {
0122   int64_t exponent{0};
0123   uint64_t mantissa{0};
0124   UC const * lastmatch{nullptr};
0125   bool negative{false};
0126   bool valid{false};
0127   bool too_many_digits{false};
0128   // contains the range of the significant digits
0129   span<const UC> integer{};  // non-nullable
0130   span<const UC> fraction{}; // nullable
0131 };
0132 using byte_span = span<char>;
0133 using parsed_number_string = parsed_number_string_t<char>;
0134 // Assuming that you use no more than 19 digits, this will
0135 // parse an ASCII string.
0136 template <typename UC>
0137 BOOST_FORCEINLINE BOOST_CHARCONV_FASTFLOAT_CONSTEXPR20
0138 parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, parse_options_t<UC> options) noexcept {
0139   chars_format const fmt = options.format;
0140   UC const decimal_point = options.decimal_point;
0141
0142   parsed_number_string_t<UC> answer;
0143   answer.valid = false;
0144   answer.too_many_digits = false;
0145   answer.negative = (*p == UC('-'));
0146 #ifdef BOOST_CHARCONV_FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default
0147   if ((*p == UC('-')) || (*p == UC('+')))
0148 #else
0149   if (*p == UC('-')) // C++17 20.19.3.(7.1) explicitly forbids '+' sign here
0150 #endif
0151   {
0152     ++p;
0153     if (p == pend) {
0154       return answer;
0155     }
0156     if (!is_integer(*p) && (*p != decimal_point)) { // a sign must be followed by an integer or the dot
0157       return answer;
0158     }
0159   }
0160   UC const * const start_digits = p;
0161
0162   uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
0163
0164   while ((p != pend) && is_integer(*p)) {
0165     // a multiplication by 10 is cheaper than an arbitrary integer
0166     // multiplication
0167     i = 10 * i +
0168         uint64_t(*p - UC('0')); // might overflow, we will handle the overflow later
0169     ++p;
0170   }
0171   UC const * const end_of_integer_part = p;
0172   int64_t digit_count = int64_t(end_of_integer_part - start_digits);
0173   answer.integer = span<const UC>(start_digits, size_t(digit_count));
0174   int64_t exponent = 0;
0175   if ((p != pend) && (*p == decimal_point)) {
0176     ++p;
0177     UC const * before = p;
0178     // can occur at most twice without overflowing, but let it occur more, since
0179     // for integers with many digits, digit parsing is the primary bottleneck.
0180     if (std::is_same<UC,char>::value) {
0181       while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) {
0182         i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok
0183         p += 8;
0184       }
0185     }
0186     while ((p != pend) && is_integer(*p)) {
0187       uint8_t digit = uint8_t(*p - UC('0'));
0188       ++p;
0189       i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
0190     }
0191     exponent = before - p;
0192     answer.fraction = span<const UC>(before, size_t(p - before));
0193     digit_count -= exponent;
0194   }
0195   // we must have encountered at least one integer!
0196   if (digit_count == 0) {
0197     return answer;
0198   }
0199   int64_t exp_number = 0;            // explicit exponential part
0200   if ((static_cast<unsigned>(fmt) & static_cast<unsigned>(chars_format::scientific)) && (p != pend) && ((UC('e') == *p) || (UC('E') == *p))) {
0201     UC const * location_of_e = p;
0202     ++p;
0203     bool neg_exp = false;
0204     if ((p != pend) && (UC('-') == *p)) {
0205       neg_exp = true;
0206       ++p;
0207     } else if ((p != pend) && (UC('+') == *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1)
0208       ++p;
0209     }
0210     if ((p == pend) || !is_integer(*p)) {
0211       if(!(static_cast<unsigned>(fmt) & static_cast<unsigned>(chars_format::fixed))) {
0212         // We are in error.
0213         return answer;
0214       }
0215       // Otherwise, we will be ignoring the 'e'.
0216       p = location_of_e;
0217     } else {
0218       while ((p != pend) && is_integer(*p)) {
0219         uint8_t digit = uint8_t(*p - UC('0'));
0220         if (exp_number < 0x10000000) {
0221           exp_number = 10 * exp_number + digit;
0222         }
0223         ++p;
0224       }
0225       if(neg_exp) { exp_number = - exp_number; }
0226       exponent += exp_number;
0227     }
0228   } else {
0229     // If it scientific and not fixed, we have to bail out.
0230     if((static_cast<unsigned>(fmt) & static_cast<unsigned>(chars_format::scientific)) &&
0231        !(static_cast<unsigned>(fmt) & static_cast<unsigned>(chars_format::fixed)))
0232     {
0233         return answer;
0234     }
0235   }
0236   answer.lastmatch = p;
0237   answer.valid = true;
0238
0239   // If we frequently had to deal with long strings of digits,
0240   // we could extend our code by using a 128-bit integer instead
0241   // of a 64-bit integer. However, this is uncommon.
0242   //
0243   // We can deal with up to 19 digits.
0244   if (digit_count > 19) { // this is uncommon
0245     // It is possible that the integer had an overflow.
0246     // We have to handle the case where we have 0.0000somenumber.
0247     // We need to be mindful of the case where we only have zeroes...
0248     // E.g., 0.000000000...000.
0249     UC const * start = start_digits;
0250     while ((start != pend) && (*start == UC('0') || *start == decimal_point)) {
0251       if(*start == UC('0')) { digit_count --; }
0252       start++;
0253     }
0254     if (digit_count > 19) {
0255       answer.too_many_digits = true;
0256       // Let us start again, this time, avoiding overflows.
0257       // We don't need to check if is_integer, since we use the
0258       // pre-tokenized spans from above.
0259       i = 0;
0260       p = answer.integer.ptr;
0261       UC const * int_end = p + answer.integer.len();
0262       constexpr uint64_t minimal_nineteen_digit_integer{1000000000000000000};
0263       while((i < minimal_nineteen_digit_integer) && (p != int_end)) {
0264         i = i * 10 + uint64_t(*p - UC('0'));
0265         ++p;
0266       }
0267       if (i >= minimal_nineteen_digit_integer) { // We have a big integers
0268         exponent = end_of_integer_part - p + exp_number;
0269       } else { // We have a value with a fractional component.
0270           p = answer.fraction.ptr;
0271           UC const * frac_end = p + answer.fraction.len();
0272           while((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
0273             i = i * 10 + uint64_t(*p - UC('0'));
0274             ++p;
0275           }
0276           exponent = answer.fraction.ptr - p + exp_number;
0277       }
0278       // We have now corrected both exponent and i, to a truncated value
0279     }
0280   }
0281   answer.exponent = exponent;
0282   answer.mantissa = i;
0283   return answer;
0284 }
0285
0286 }}}} // namespace s
0287
0288 #endif