strings/internal/charconv_bigint.h

0001 // Copyright 2018 The Abseil Authors.
0002 //
0003 // Licensed under the Apache License, Version 2.0 (the "License");
0004 // you may not use this file except in compliance with the License.
0005 // You may obtain a copy of the License at
0006 //
0007 //      https://www.apache.org/licenses/LICENSE-2.0
0008 //
0009 // Unless required by applicable law or agreed to in writing, software
0010 // distributed under the License is distributed on an "AS IS" BASIS,
0011 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0012 // See the License for the specific language governing permissions and
0013 // limitations under the License.
0014
0015 #ifndef ABSL_STRINGS_INTERNAL_CHARCONV_BIGINT_H_
0016 #define ABSL_STRINGS_INTERNAL_CHARCONV_BIGINT_H_
0017
0018 #include <algorithm>
0019 #include <cstdint>
0020 #include <iostream>
0021 #include <string>
0022
0023 #include "absl/base/config.h"
0024 #include "absl/strings/ascii.h"
0025 #include "absl/strings/internal/charconv_parse.h"
0026 #include "absl/strings/string_view.h"
0027
0028 namespace absl {
0029 ABSL_NAMESPACE_BEGIN
0030 namespace strings_internal {
0031
0032 // The largest power that 5 that can be raised to, and still fit in a uint32_t.
0033 constexpr int kMaxSmallPowerOfFive = 13;
0034 // The largest power that 10 that can be raised to, and still fit in a uint32_t.
0035 constexpr int kMaxSmallPowerOfTen = 9;
0036
0037 ABSL_DLL extern const uint32_t
0038     kFiveToNth[kMaxSmallPowerOfFive + 1];
0039 ABSL_DLL extern const uint32_t kTenToNth[kMaxSmallPowerOfTen + 1];
0040
0041 // Large, fixed-width unsigned integer.
0042 //
0043 // Exact rounding for decimal-to-binary floating point conversion requires very
0044 // large integer math, but a design goal of absl::from_chars is to avoid
0045 // allocating memory.  The integer precision needed for decimal-to-binary
0046 // conversions is large but bounded, so a huge fixed-width integer class
0047 // suffices.
0048 //
0049 // This is an intentionally limited big integer class.  Only needed operations
0050 // are implemented.  All storage lives in an array data member, and all
0051 // arithmetic is done in-place, to avoid requiring separate storage for operand
0052 // and result.
0053 //
0054 // This is an internal class.  Some methods live in the .cc file, and are
0055 // instantiated only for the values of max_words we need.
0056 template <int max_words>
0057 class BigUnsigned {
0058  public:
0059   static_assert(max_words == 4 || max_words == 84,
0060                 "unsupported max_words value");
0061
0062   BigUnsigned() : size_(0), words_{} {}
0063   explicit constexpr BigUnsigned(uint64_t v)
0064       : size_((v >> 32) ? 2 : v ? 1 : 0),
0065         words_{static_cast<uint32_t>(v & 0xffffffffu),
0066                static_cast<uint32_t>(v >> 32)} {}
0067
0068   // Constructs a BigUnsigned from the given string_view containing a decimal
0069   // value.  If the input string is not a decimal integer, constructs a 0
0070   // instead.
0071   explicit BigUnsigned(absl::string_view sv) : size_(0), words_{} {
0072     // Check for valid input, returning a 0 otherwise.  This is reasonable
0073     // behavior only because this constructor is for unit tests.
0074     if (std::find_if_not(sv.begin(), sv.end(), ascii_isdigit) != sv.end() ||
0075         sv.empty()) {
0076       return;
0077     }
0078     int exponent_adjust =
0079         ReadDigits(sv.data(), sv.data() + sv.size(), Digits10() + 1);
0080     if (exponent_adjust > 0) {
0081       MultiplyByTenToTheNth(exponent_adjust);
0082     }
0083   }
0084
0085   // Loads the mantissa value of a previously-parsed float.
0086   //
0087   // Returns the associated decimal exponent.  The value of the parsed float is
0088   // exactly *this * 10**exponent.
0089   int ReadFloatMantissa(const ParsedFloat& fp, int significant_digits);
0090
0091   // Returns the number of decimal digits of precision this type provides.  All
0092   // numbers with this many decimal digits or fewer are representable by this
0093   // type.
0094   //
0095   // Analogous to std::numeric_limits<BigUnsigned>::digits10.
0096   static constexpr int Digits10() {
0097     // 9975007/1035508 is very slightly less than log10(2**32).
0098     return static_cast<uint64_t>(max_words) * 9975007 / 1035508;
0099   }
0100
0101   // Shifts left by the given number of bits.
0102   void ShiftLeft(int count) {
0103     if (count > 0) {
0104       const int word_shift = count / 32;
0105       if (word_shift >= max_words) {
0106         SetToZero();
0107         return;
0108       }
0109       size_ = (std::min)(size_ + word_shift, max_words);
0110       count %= 32;
0111       if (count == 0) {
0112 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=warray-bounds
0113 // shows a lot of bogus -Warray-bounds warnings under GCC.
0114 // This is not the only one in Abseil.
0115 #if ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(14, 0)
0116 #pragma GCC diagnostic push
0117 #pragma GCC diagnostic ignored "-Warray-bounds"
0118 #endif
0119         std::copy_backward(words_, words_ + size_ - word_shift, words_ + size_);
0120 #if ABSL_INTERNAL_HAVE_MIN_GNUC_VERSION(14, 0)
0121 #pragma GCC diagnostic pop
0122 #endif
0123       } else {
0124         for (int i = (std::min)(size_, max_words - 1); i > word_shift; --i) {
0125           words_[i] = (words_[i - word_shift] << count) |
0126                       (words_[i - word_shift - 1] >> (32 - count));
0127         }
0128         words_[word_shift] = words_[0] << count;
0129         // Grow size_ if necessary.
0130         if (size_ < max_words && words_[size_]) {
0131           ++size_;
0132         }
0133       }
0134       std::fill_n(words_, word_shift, 0u);
0135     }
0136   }
0137
0138
0139   // Multiplies by v in-place.
0140   void MultiplyBy(uint32_t v) {
0141     if (size_ == 0 || v == 1) {
0142       return;
0143     }
0144     if (v == 0) {
0145       SetToZero();
0146       return;
0147     }
0148     const uint64_t factor = v;
0149     uint64_t window = 0;
0150     for (int i = 0; i < size_; ++i) {
0151       window += factor * words_[i];
0152       words_[i] = window & 0xffffffff;
0153       window >>= 32;
0154     }
0155     // If carry bits remain and there's space for them, grow size_.
0156     if (window && size_ < max_words) {
0157       words_[size_] = window & 0xffffffff;
0158       ++size_;
0159     }
0160   }
0161
0162   void MultiplyBy(uint64_t v) {
0163     uint32_t words[2];
0164     words[0] = static_cast<uint32_t>(v);
0165     words[1] = static_cast<uint32_t>(v >> 32);
0166     if (words[1] == 0) {
0167       MultiplyBy(words[0]);
0168     } else {
0169       MultiplyBy(2, words);
0170     }
0171   }
0172
0173   // Multiplies in place by 5 to the power of n.  n must be non-negative.
0174   void MultiplyByFiveToTheNth(int n) {
0175     while (n >= kMaxSmallPowerOfFive) {
0176       MultiplyBy(kFiveToNth[kMaxSmallPowerOfFive]);
0177       n -= kMaxSmallPowerOfFive;
0178     }
0179     if (n > 0) {
0180       MultiplyBy(kFiveToNth[n]);
0181     }
0182   }
0183
0184   // Multiplies in place by 10 to the power of n.  n must be non-negative.
0185   void MultiplyByTenToTheNth(int n) {
0186     if (n > kMaxSmallPowerOfTen) {
0187       // For large n, raise to a power of 5, then shift left by the same amount.
0188       // (10**n == 5**n * 2**n.)  This requires fewer multiplications overall.
0189       MultiplyByFiveToTheNth(n);
0190       ShiftLeft(n);
0191     } else if (n > 0) {
0192       // We can do this more quickly for very small N by using a single
0193       // multiplication.
0194       MultiplyBy(kTenToNth[n]);
0195     }
0196   }
0197
0198   // Returns the value of 5**n, for non-negative n.  This implementation uses
0199   // a lookup table, and is faster then seeding a BigUnsigned with 1 and calling
0200   // MultiplyByFiveToTheNth().
0201   static BigUnsigned FiveToTheNth(int n);
0202
0203   // Multiplies by another BigUnsigned, in-place.
0204   template <int M>
0205   void MultiplyBy(const BigUnsigned<M>& other) {
0206     MultiplyBy(other.size(), other.words());
0207   }
0208
0209   void SetToZero() {
0210     std::fill_n(words_, size_, 0u);
0211     size_ = 0;
0212   }
0213
0214   // Returns the value of the nth word of this BigUnsigned.  This is
0215   // range-checked, and returns 0 on out-of-bounds accesses.
0216   uint32_t GetWord(int index) const {
0217     if (index < 0 || index >= size_) {
0218       return 0;
0219     }
0220     return words_[index];
0221   }
0222
0223   // Returns this integer as a decimal string.  This is not used in the decimal-
0224   // to-binary conversion; it is intended to aid in testing.
0225   std::string ToString() const;
0226
0227   int size() const { return size_; }
0228   const uint32_t* words() const { return words_; }
0229
0230  private:
0231   // Reads the number between [begin, end), possibly containing a decimal point,
0232   // into this BigUnsigned.
0233   //
0234   // Callers are required to ensure [begin, end) contains a valid number, with
0235   // one or more decimal digits and at most one decimal point.  This routine
0236   // will behave unpredictably if these preconditions are not met.
0237   //
0238   // Only the first `significant_digits` digits are read.  Digits beyond this
0239   // limit are "sticky": If the final significant digit is 0 or 5, and if any
0240   // dropped digit is nonzero, then that final significant digit is adjusted up
0241   // to 1 or 6.  This adjustment allows for precise rounding.
0242   //
0243   // Returns `exponent_adjustment`, a power-of-ten exponent adjustment to
0244   // account for the decimal point and for dropped significant digits.  After
0245   // this function returns,
0246   //   actual_value_of_parsed_string ~= *this * 10**exponent_adjustment.
0247   int ReadDigits(const char* begin, const char* end, int significant_digits);
0248
0249   // Performs a step of big integer multiplication.  This computes the full
0250   // (64-bit-wide) values that should be added at the given index (step), and
0251   // adds to that location in-place.
0252   //
0253   // Because our math all occurs in place, we must multiply starting from the
0254   // highest word working downward.  (This is a bit more expensive due to the
0255   // extra carries involved.)
0256   //
0257   // This must be called in steps, for each word to be calculated, starting from
0258   // the high end and working down to 0.  The first value of `step` should be
0259   //   `std::min(original_size + other.size_ - 2, max_words - 1)`.
0260   // The reason for this expression is that multiplying the i'th word from one
0261   // multiplicand and the j'th word of another multiplicand creates a
0262   // two-word-wide value to be stored at the (i+j)'th element.  The highest
0263   // word indices we will access are `original_size - 1` from this object, and
0264   // `other.size_ - 1` from our operand.  Therefore,
0265   // `original_size + other.size_ - 2` is the first step we should calculate,
0266   // but limited on an upper bound by max_words.
0267
0268   // Working from high-to-low ensures that we do not overwrite the portions of
0269   // the initial value of *this which are still needed for later steps.
0270   //
0271   // Once called with step == 0, *this contains the result of the
0272   // multiplication.
0273   //
0274   // `original_size` is the size_ of *this before the first call to
0275   // MultiplyStep().  `other_words` and `other_size` are the contents of our
0276   // operand.  `step` is the step to perform, as described above.
0277   void MultiplyStep(int original_size, const uint32_t* other_words,
0278                     int other_size, int step);
0279
0280   void MultiplyBy(int other_size, const uint32_t* other_words) {
0281     const int original_size = size_;
0282     const int first_step =
0283         (std::min)(original_size + other_size - 2, max_words - 1);
0284     for (int step = first_step; step >= 0; --step) {
0285       MultiplyStep(original_size, other_words, other_size, step);
0286     }
0287   }
0288
0289   // Adds a 32-bit value to the index'th word, with carry.
0290   void AddWithCarry(int index, uint32_t value) {
0291     if (value) {
0292       while (index < max_words && value > 0) {
0293         words_[index] += value;
0294         // carry if we overflowed in this word:
0295         if (value > words_[index]) {
0296           value = 1;
0297           ++index;
0298         } else {
0299           value = 0;
0300         }
0301       }
0302       size_ = (std::min)(max_words, (std::max)(index + 1, size_));
0303     }
0304   }
0305
0306   void AddWithCarry(int index, uint64_t value) {
0307     if (value && index < max_words) {
0308       uint32_t high = value >> 32;
0309       uint32_t low = value & 0xffffffff;
0310       words_[index] += low;
0311       if (words_[index] < low) {
0312         ++high;
0313         if (high == 0) {
0314           // Carry from the low word caused our high word to overflow.
0315           // Short circuit here to do the right thing.
0316           AddWithCarry(index + 2, static_cast<uint32_t>(1));
0317           return;
0318         }
0319       }
0320       if (high > 0) {
0321         AddWithCarry(index + 1, high);
0322       } else {
0323         // Normally 32-bit AddWithCarry() sets size_, but since we don't call
0324         // it when `high` is 0, do it ourselves here.
0325         size_ = (std::min)(max_words, (std::max)(index + 1, size_));
0326       }
0327     }
0328   }
0329
0330   // Divide this in place by a constant divisor.  Returns the remainder of the
0331   // division.
0332   template <uint32_t divisor>
0333   uint32_t DivMod() {
0334     uint64_t accumulator = 0;
0335     for (int i = size_ - 1; i >= 0; --i) {
0336       accumulator <<= 32;
0337       accumulator += words_[i];
0338       // accumulator / divisor will never overflow an int32_t in this loop
0339       words_[i] = static_cast<uint32_t>(accumulator / divisor);
0340       accumulator = accumulator % divisor;
0341     }
0342     while (size_ > 0 && words_[size_ - 1] == 0) {
0343       --size_;
0344     }
0345     return static_cast<uint32_t>(accumulator);
0346   }
0347
0348   // The number of elements in words_ that may carry significant values.
0349   // All elements beyond this point are 0.
0350   //
0351   // When size_ is 0, this BigUnsigned stores the value 0.
0352   // When size_ is nonzero, is *not* guaranteed that words_[size_ - 1] is
0353   // nonzero.  This can occur due to overflow truncation.
0354   // In particular, x.size_ != y.size_ does *not* imply x != y.
0355   int size_;
0356   uint32_t words_[max_words];
0357 };
0358
0359 // Compares two big integer instances.
0360 //
0361 // Returns -1 if lhs < rhs, 0 if lhs == rhs, and 1 if lhs > rhs.
0362 template <int N, int M>
0363 int Compare(const BigUnsigned<N>& lhs, const BigUnsigned<M>& rhs) {
0364   int limit = (std::max)(lhs.size(), rhs.size());
0365   for (int i = limit - 1; i >= 0; --i) {
0366     const uint32_t lhs_word = lhs.GetWord(i);
0367     const uint32_t rhs_word = rhs.GetWord(i);
0368     if (lhs_word < rhs_word) {
0369       return -1;
0370     } else if (lhs_word > rhs_word) {
0371       return 1;
0372     }
0373   }
0374   return 0;
0375 }
0376
0377 template <int N, int M>
0378 bool operator==(const BigUnsigned<N>& lhs, const BigUnsigned<M>& rhs) {
0379   int limit = (std::max)(lhs.size(), rhs.size());
0380   for (int i = 0; i < limit; ++i) {
0381     if (lhs.GetWord(i) != rhs.GetWord(i)) {
0382       return false;
0383     }
0384   }
0385   return true;
0386 }
0387
0388 template <int N, int M>
0389 bool operator!=(const BigUnsigned<N>& lhs, const BigUnsigned<M>& rhs) {
0390   return !(lhs == rhs);
0391 }
0392
0393 template <int N, int M>
0394 bool operator<(const BigUnsigned<N>& lhs, const BigUnsigned<M>& rhs) {
0395   return Compare(lhs, rhs) == -1;
0396 }
0397
0398 template <int N, int M>
0399 bool operator>(const BigUnsigned<N>& lhs, const BigUnsigned<M>& rhs) {
0400   return rhs < lhs;
0401 }
0402 template <int N, int M>
0403 bool operator<=(const BigUnsigned<N>& lhs, const BigUnsigned<M>& rhs) {
0404   return !(rhs < lhs);
0405 }
0406 template <int N, int M>
0407 bool operator>=(const BigUnsigned<N>& lhs, const BigUnsigned<M>& rhs) {
0408   return !(lhs < rhs);
0409 }
0410
0411 // Output operator for BigUnsigned, for testing purposes only.
0412 template <int N>
0413 std::ostream& operator<<(std::ostream& os, const BigUnsigned<N>& num) {
0414   return os << num.ToString();
0415 }
0416
0417 // Explicit instantiation declarations for the sizes of BigUnsigned that we
0418 // are using.
0419 //
0420 // For now, the choices of 4 and 84 are arbitrary; 4 is a small value that is
0421 // still bigger than an int128, and 84 is a large value we will want to use
0422 // in the from_chars implementation.
0423 //
0424 // Comments justifying the use of 84 belong in the from_chars implementation,
0425 // and will be added in a follow-up CL.
0426 extern template class BigUnsigned<4>;
0427 extern template class BigUnsigned<84>;
0428
0429 }  // namespace strings_internal
0430 ABSL_NAMESPACE_END
0431 }  // namespace absl
0432
0433 #endif  // ABSL_STRINGS_INTERNAL_CHARCONV_BIGINT_H_