vendored/pcg/pcg_uint128.hpp

0001 /*
0002  * PCG Random Number Generation for C++
0003  *
0004  * Copyright 2014-2021 Melissa O'Neill <oneill@pcg-random.org>,
0005  *                     and the PCG Project contributors.
0006  *
0007  * SPDX-License-Identifier: (Apache-2.0 OR MIT)
0008  *
0009  * Licensed under the Apache License, Version 2.0 (provided in
0010  * LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0)
0011  * or under the MIT license (provided in LICENSE-MIT.txt and at
0012  * http://opensource.org/licenses/MIT), at your option. This file may not
0013  * be copied, modified, or distributed except according to those terms.
0014  *
0015  * Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either
0016  * express or implied.  See your chosen license for details.
0017  *
0018  * For additional information about the PCG random number generation scheme,
0019  * visit http://www.pcg-random.org/.
0020  */
0021
0022 /*
0023  * This code provides a a C++ class that can provide 128-bit (or higher)
0024  * integers.  To produce 2K-bit integers, it uses two K-bit integers,
0025  * placed in a union that allowes the code to also see them as four K/2 bit
0026  * integers (and access them either directly name, or by index).
0027  *
0028  * It may seem like we're reinventing the wheel here, because several
0029  * libraries already exist that support large integers, but most existing
0030  * libraries provide a very generic multiprecision code, but here we're
0031  * operating at a fixed size.  Also, most other libraries are fairly
0032  * heavyweight.  So we use a direct implementation.  Sadly, it's much slower
0033  * than hand-coded assembly or direct CPU support.
0034  */
0035
0036 #ifndef PCG_UINT128_HPP_INCLUDED
0037 #define PCG_UINT128_HPP_INCLUDED 1
0038
0039 #include <cstdint>
0040 #include <cstdio>
0041 #include <cassert>
0042 #include <climits>
0043 #include <utility>
0044 #include <initializer_list>
0045 #include <type_traits>
0046
0047 #if defined(_MSC_VER)  // Use MSVC++ intrinsics
0048 #include <intrin.h>
0049 #endif
0050
0051 /*
0052  * We want to lay the type out the same way that a native type would be laid
0053  * out, which means we must know the machine's endian, at compile time.
0054  * This ugliness attempts to do so.
0055  */
0056
0057 #ifndef PCG_LITTLE_ENDIAN
0058     #if defined(__BYTE_ORDER__)
0059         #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
0060             #define PCG_LITTLE_ENDIAN 1
0061         #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
0062             #define PCG_LITTLE_ENDIAN 0
0063         #else
0064             #error __BYTE_ORDER__ does not match a standard endian, pick a side
0065         #endif
0066     #elif __LITTLE_ENDIAN__ || _LITTLE_ENDIAN
0067         #define PCG_LITTLE_ENDIAN 1
0068     #elif __BIG_ENDIAN__ || _BIG_ENDIAN
0069         #define PCG_LITTLE_ENDIAN 0
0070     #elif __x86_64 || __x86_64__ || _M_X64 || __i386 || __i386__ || _M_IX86
0071         #define PCG_LITTLE_ENDIAN 1
0072     #elif __powerpc__ || __POWERPC__ || __ppc__ || __PPC__ \
0073           || __m68k__ || __mc68000__
0074         #define PCG_LITTLE_ENDIAN 0
0075     #else
0076         #error Unable to determine target endianness
0077     #endif
0078 #endif
0079
0080 #if INTPTR_MAX == INT64_MAX && !defined(PCG_64BIT_SPECIALIZATIONS)
0081     #define PCG_64BIT_SPECIALIZATIONS 1
0082 #endif
0083
0084 namespace arrow_vendored {
0085 namespace pcg_extras {
0086
0087 // Recent versions of GCC have intrinsics we can use to quickly calculate
0088 // the number of leading and trailing zeros in a number.  If possible, we
0089 // use them, otherwise we fall back to old-fashioned bit twiddling to figure
0090 // them out.
0091
0092 #ifndef PCG_BITCOUNT_T
0093     typedef uint8_t bitcount_t;
0094 #else
0095     typedef PCG_BITCOUNT_T bitcount_t;
0096 #endif
0097
0098 /*
0099  * Provide some useful helper functions
0100  *      * flog2                 floor(log2(x))
0101  *      * trailingzeros         number of trailing zero bits
0102  */
0103
0104 #if defined(__GNUC__)   // Any GNU-compatible compiler supporting C++11 has
0105                         // some useful intrinsics we can use.
0106
0107 inline bitcount_t flog2(uint32_t v)
0108 {
0109     return 31 - __builtin_clz(v);
0110 }
0111
0112 inline bitcount_t trailingzeros(uint32_t v)
0113 {
0114     return __builtin_ctz(v);
0115 }
0116
0117 inline bitcount_t flog2(uint64_t v)
0118 {
0119 #if UINT64_MAX == ULONG_MAX
0120     return 63 - __builtin_clzl(v);
0121 #elif UINT64_MAX == ULLONG_MAX
0122     return 63 - __builtin_clzll(v);
0123 #else
0124     #error Cannot find a function for uint64_t
0125 #endif
0126 }
0127
0128 inline bitcount_t trailingzeros(uint64_t v)
0129 {
0130 #if UINT64_MAX == ULONG_MAX
0131     return __builtin_ctzl(v);
0132 #elif UINT64_MAX == ULLONG_MAX
0133     return __builtin_ctzll(v);
0134 #else
0135     #error Cannot find a function for uint64_t
0136 #endif
0137 }
0138
0139 #elif defined(_MSC_VER)  // Use MSVC++ intrinsics
0140
0141 #pragma intrinsic(_BitScanReverse, _BitScanForward)
0142 #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
0143 #pragma intrinsic(_BitScanReverse64, _BitScanForward64)
0144 #endif
0145
0146 inline bitcount_t flog2(uint32_t v)
0147 {
0148     unsigned long i;
0149     _BitScanReverse(&i, v);
0150     return bitcount_t(i);
0151 }
0152
0153 inline bitcount_t trailingzeros(uint32_t v)
0154 {
0155     unsigned long i;
0156     _BitScanForward(&i, v);
0157     return bitcount_t(i);
0158 }
0159
0160 inline bitcount_t flog2(uint64_t v)
0161 {
0162 #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
0163     unsigned long i;
0164     _BitScanReverse64(&i, v);
0165     return bitcount_t(i);
0166 #else
0167     // 32-bit x86
0168     uint32_t high = v >> 32;
0169     uint32_t low  = uint32_t(v);
0170     return high ? 32+flog2(high) : flog2(low);
0171 #endif
0172 }
0173
0174 inline bitcount_t trailingzeros(uint64_t v)
0175 {
0176 #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
0177     unsigned long i;
0178     _BitScanForward64(&i, v);
0179     return bitcount_t(i);
0180 #else
0181     // 32-bit x86
0182     uint32_t high = v >> 32;
0183     uint32_t low  = uint32_t(v);
0184     return low ? trailingzeros(low) : trailingzeros(high)+32;
0185 #endif
0186 }
0187
0188 #else                   // Otherwise, we fall back to bit twiddling
0189                         // implementations
0190
0191 inline bitcount_t flog2(uint32_t v)
0192 {
0193     // Based on code by Eric Cole and Mark Dickinson, which appears at
0194     // https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
0195
0196     static const uint8_t multiplyDeBruijnBitPos[32] = {
0197       0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
0198       8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
0199     };
0200
0201     v |= v >> 1; // first round down to one less than a power of 2
0202     v |= v >> 2;
0203     v |= v >> 4;
0204     v |= v >> 8;
0205     v |= v >> 16;
0206
0207     return multiplyDeBruijnBitPos[(uint32_t)(v * 0x07C4ACDDU) >> 27];
0208 }
0209
0210 inline bitcount_t trailingzeros(uint32_t v)
0211 {
0212     static const uint8_t multiplyDeBruijnBitPos[32] = {
0213       0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
0214       31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
0215     };
0216
0217     return multiplyDeBruijnBitPos[((uint32_t)((v & -v) * 0x077CB531U)) >> 27];
0218 }
0219
0220 inline bitcount_t flog2(uint64_t v)
0221 {
0222     uint32_t high = v >> 32;
0223     uint32_t low  = uint32_t(v);
0224
0225     return high ? 32+flog2(high) : flog2(low);
0226 }
0227
0228 inline bitcount_t trailingzeros(uint64_t v)
0229 {
0230     uint32_t high = v >> 32;
0231     uint32_t low  = uint32_t(v);
0232
0233     return low ? trailingzeros(low) : trailingzeros(high)+32;
0234 }
0235
0236 #endif
0237
0238 inline bitcount_t flog2(uint8_t v)
0239 {
0240     return flog2(uint32_t(v));
0241 }
0242
0243 inline bitcount_t flog2(uint16_t v)
0244 {
0245     return flog2(uint32_t(v));
0246 }
0247
0248 #if __SIZEOF_INT128__
0249 inline bitcount_t flog2(__uint128_t v)
0250 {
0251     uint64_t high = uint64_t(v >> 64);
0252     uint64_t low  = uint64_t(v);
0253
0254     return high ? 64+flog2(high) : flog2(low);
0255 }
0256 #endif
0257
0258 inline bitcount_t trailingzeros(uint8_t v)
0259 {
0260     return trailingzeros(uint32_t(v));
0261 }
0262
0263 inline bitcount_t trailingzeros(uint16_t v)
0264 {
0265     return trailingzeros(uint32_t(v));
0266 }
0267
0268 #if __SIZEOF_INT128__
0269 inline bitcount_t trailingzeros(__uint128_t v)
0270 {
0271     uint64_t high = uint64_t(v >> 64);
0272     uint64_t low  = uint64_t(v);
0273     return low ? trailingzeros(low) : trailingzeros(high)+64;
0274 }
0275 #endif
0276
0277 template <typename UInt>
0278 inline bitcount_t clog2(UInt v)
0279 {
0280     return flog2(v) + ((v & (-v)) != v);
0281 }
0282
0283 template <typename UInt>
0284 inline UInt addwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
0285 {
0286     UInt half_result = y + carryin;
0287     UInt result = x + half_result;
0288     *carryout = (half_result < y) || (result < x);
0289     return result;
0290 }
0291
0292 template <typename UInt>
0293 inline UInt subwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
0294 {
0295     UInt half_result = y + carryin;
0296     UInt result = x - half_result;
0297     *carryout = (half_result < y) || (result > x);
0298     return result;
0299 }
0300
0301
0302 template <typename UInt, typename UIntX2>
0303 class uint_x4 {
0304 // private:
0305     static constexpr unsigned int UINT_BITS = sizeof(UInt) * CHAR_BIT;
0306 public:
0307     union {
0308 #if PCG_LITTLE_ENDIAN
0309         struct {
0310             UInt v0, v1, v2, v3;
0311         } w;
0312         struct {
0313             UIntX2 v01, v23;
0314         } d;
0315 #else
0316         struct {
0317             UInt v3, v2, v1, v0;
0318         } w;
0319         struct {
0320             UIntX2 v23, v01;
0321         } d;
0322 #endif
0323         // For the array access versions, the code that uses the array
0324         // must handle endian itself.  Yuck.
0325         UInt wa[4];
0326     };
0327
0328 public:
0329     uint_x4() = default;
0330
0331     constexpr uint_x4(UInt v3, UInt v2, UInt v1, UInt v0)
0332 #if PCG_LITTLE_ENDIAN
0333        : w{v0, v1, v2, v3}
0334 #else
0335        : w{v3, v2, v1, v0}
0336 #endif
0337     {
0338         // Nothing (else) to do
0339     }
0340
0341     constexpr uint_x4(UIntX2 v23, UIntX2 v01)
0342 #if PCG_LITTLE_ENDIAN
0343        : d{v01,v23}
0344 #else
0345        : d{v23,v01}
0346 #endif
0347     {
0348         // Nothing (else) to do
0349     }
0350
0351     constexpr uint_x4(UIntX2 v01)
0352 #if PCG_LITTLE_ENDIAN
0353        : d{v01, UIntX2(0)}
0354 #else
0355        : d{UIntX2(0),v01}
0356 #endif
0357     {
0358         // Nothing (else) to do
0359     }
0360
0361     template<class Integral,
0362              typename std::enable_if<(std::is_integral<Integral>::value
0363                                       && sizeof(Integral) <= sizeof(UIntX2))
0364                                     >::type* = nullptr>
0365     constexpr uint_x4(Integral v01)
0366 #if PCG_LITTLE_ENDIAN
0367        : d{UIntX2(v01), UIntX2(0)}
0368 #else
0369        : d{UIntX2(0), UIntX2(v01)}
0370 #endif
0371     {
0372         // Nothing (else) to do
0373     }
0374
0375     explicit constexpr operator UIntX2() const
0376     {
0377         return d.v01;
0378     }
0379
0380     template<class Integral,
0381              typename std::enable_if<(std::is_integral<Integral>::value
0382                                       && sizeof(Integral) <= sizeof(UIntX2))
0383                                     >::type* = nullptr>
0384     explicit constexpr operator Integral() const
0385     {
0386         return Integral(d.v01);
0387     }
0388
0389     explicit constexpr operator bool() const
0390     {
0391         return d.v01 || d.v23;
0392     }
0393
0394     template<typename U, typename V>
0395     friend uint_x4<U,V> operator*(const uint_x4<U,V>&, const uint_x4<U,V>&);
0396
0397     template<typename U, typename V>
0398     friend uint_x4<U,V> operator*(const uint_x4<U,V>&, V);
0399
0400     template<typename U, typename V>
0401     friend std::pair< uint_x4<U,V>,uint_x4<U,V> >
0402         divmod(const uint_x4<U,V>&, const uint_x4<U,V>&);
0403
0404     template<typename U, typename V>
0405     friend uint_x4<U,V> operator+(const uint_x4<U,V>&, const uint_x4<U,V>&);
0406
0407     template<typename U, typename V>
0408     friend uint_x4<U,V> operator-(const uint_x4<U,V>&, const uint_x4<U,V>&);
0409
0410     template<typename U, typename V>
0411     friend uint_x4<U,V> operator<<(const uint_x4<U,V>&, const bitcount_t shift);
0412
0413     template<typename U, typename V>
0414     friend uint_x4<U,V> operator>>(const uint_x4<U,V>&, const bitcount_t shift);
0415
0416 #if PCG_64BIT_SPECIALIZATIONS
0417     template<typename U>
0418     friend uint_x4<U,uint64_t> operator<<(const uint_x4<U,uint64_t>&, const bitcount_t shift);
0419
0420     template<typename U>
0421     friend uint_x4<U,uint64_t> operator>>(const uint_x4<U,uint64_t>&, const bitcount_t shift);
0422 #endif
0423
0424     template<typename U, typename V>
0425     friend uint_x4<U,V> operator&(const uint_x4<U,V>&, const uint_x4<U,V>&);
0426
0427     template<typename U, typename V>
0428     friend uint_x4<U,V> operator|(const uint_x4<U,V>&, const uint_x4<U,V>&);
0429
0430     template<typename U, typename V>
0431     friend uint_x4<U,V> operator^(const uint_x4<U,V>&, const uint_x4<U,V>&);
0432
0433     template<typename U, typename V>
0434     friend bool operator==(const uint_x4<U,V>&, const uint_x4<U,V>&);
0435
0436     template<typename U, typename V>
0437     friend bool operator!=(const uint_x4<U,V>&, const uint_x4<U,V>&);
0438
0439     template<typename U, typename V>
0440     friend bool operator<(const uint_x4<U,V>&, const uint_x4<U,V>&);
0441
0442     template<typename U, typename V>
0443     friend bool operator<=(const uint_x4<U,V>&, const uint_x4<U,V>&);
0444
0445     template<typename U, typename V>
0446     friend bool operator>(const uint_x4<U,V>&, const uint_x4<U,V>&);
0447
0448     template<typename U, typename V>
0449     friend bool operator>=(const uint_x4<U,V>&, const uint_x4<U,V>&);
0450
0451     template<typename U, typename V>
0452     friend uint_x4<U,V> operator~(const uint_x4<U,V>&);
0453
0454     template<typename U, typename V>
0455     friend uint_x4<U,V> operator-(const uint_x4<U,V>&);
0456
0457     template<typename U, typename V>
0458     friend bitcount_t flog2(const uint_x4<U,V>&);
0459
0460     template<typename U, typename V>
0461     friend bitcount_t trailingzeros(const uint_x4<U,V>&);
0462
0463 #if PCG_64BIT_SPECIALIZATIONS
0464     template<typename U>
0465     friend bitcount_t flog2(const uint_x4<U,uint64_t>&);
0466
0467     template<typename U>
0468     friend bitcount_t trailingzeros(const uint_x4<U,uint64_t>&);
0469 #endif
0470
0471     uint_x4& operator*=(const uint_x4& rhs)
0472     {
0473         uint_x4 result = *this * rhs;
0474         return *this = result;
0475     }
0476
0477     uint_x4& operator*=(UIntX2 rhs)
0478     {
0479         uint_x4 result = *this * rhs;
0480         return *this = result;
0481     }
0482
0483     uint_x4& operator/=(const uint_x4& rhs)
0484     {
0485         uint_x4 result = *this / rhs;
0486         return *this = result;
0487     }
0488
0489     uint_x4& operator%=(const uint_x4& rhs)
0490     {
0491         uint_x4 result = *this % rhs;
0492         return *this = result;
0493     }
0494
0495     uint_x4& operator+=(const uint_x4& rhs)
0496     {
0497         uint_x4 result = *this + rhs;
0498         return *this = result;
0499     }
0500
0501     uint_x4& operator-=(const uint_x4& rhs)
0502     {
0503         uint_x4 result = *this - rhs;
0504         return *this = result;
0505     }
0506
0507     uint_x4& operator&=(const uint_x4& rhs)
0508     {
0509         uint_x4 result = *this & rhs;
0510         return *this = result;
0511     }
0512
0513     uint_x4& operator|=(const uint_x4& rhs)
0514     {
0515         uint_x4 result = *this | rhs;
0516         return *this = result;
0517     }
0518
0519     uint_x4& operator^=(const uint_x4& rhs)
0520     {
0521         uint_x4 result = *this ^ rhs;
0522         return *this = result;
0523     }
0524
0525     uint_x4& operator>>=(bitcount_t shift)
0526     {
0527         uint_x4 result = *this >> shift;
0528         return *this = result;
0529     }
0530
0531     uint_x4& operator<<=(bitcount_t shift)
0532     {
0533         uint_x4 result = *this << shift;
0534         return *this = result;
0535     }
0536
0537 };
0538
0539 template<typename U, typename V>
0540 bitcount_t flog2(const uint_x4<U,V>& v)
0541 {
0542 #if PCG_LITTLE_ENDIAN
0543     for (uint8_t i = 4; i !=0; /* dec in loop */) {
0544         --i;
0545 #else
0546     for (uint8_t i = 0; i < 4; ++i) {
0547 #endif
0548         if (v.wa[i] == 0)
0549              continue;
0550         return flog2(v.wa[i]) + uint_x4<U,V>::UINT_BITS*i;
0551     }
0552     abort();
0553 }
0554
0555 template<typename U, typename V>
0556 bitcount_t trailingzeros(const uint_x4<U,V>& v)
0557 {
0558 #if PCG_LITTLE_ENDIAN
0559     for (uint8_t i = 0; i < 4; ++i) {
0560 #else
0561     for (uint8_t i = 4; i !=0; /* dec in loop */) {
0562         --i;
0563 #endif
0564         if (v.wa[i] != 0)
0565             return trailingzeros(v.wa[i]) + uint_x4<U,V>::UINT_BITS*i;
0566     }
0567     return uint_x4<U,V>::UINT_BITS*4;
0568 }
0569
0570 #if PCG_64BIT_SPECIALIZATIONS
0571 template<typename UInt32>
0572 bitcount_t flog2(const uint_x4<UInt32,uint64_t>& v)
0573 {
0574     return v.d.v23 > 0 ? flog2(v.d.v23) + uint_x4<UInt32,uint64_t>::UINT_BITS*2
0575                        : flog2(v.d.v01);
0576 }
0577
0578 template<typename UInt32>
0579 bitcount_t trailingzeros(const uint_x4<UInt32,uint64_t>& v)
0580 {
0581     return v.d.v01 == 0 ? trailingzeros(v.d.v23) + uint_x4<UInt32,uint64_t>::UINT_BITS*2
0582                         : trailingzeros(v.d.v01);
0583 }
0584 #endif
0585
0586 template <typename UInt, typename UIntX2>
0587 std::pair< uint_x4<UInt,UIntX2>, uint_x4<UInt,UIntX2> >
0588     divmod(const uint_x4<UInt,UIntX2>& orig_dividend,
0589            const uint_x4<UInt,UIntX2>& divisor)
0590 {
0591     // If the dividend is less than the divisor, the answer is always zero.
0592     // This takes care of boundary cases like 0/x (which would otherwise be
0593     // problematic because we can't take the log of zero.  (The boundary case
0594     // of division by zero is undefined.)
0595     if (orig_dividend < divisor)
0596         return { uint_x4<UInt,UIntX2>(UIntX2(0)), orig_dividend };
0597
0598     auto dividend = orig_dividend;
0599
0600     auto log2_divisor  = flog2(divisor);
0601     auto log2_dividend = flog2(dividend);
0602     // assert(log2_dividend >= log2_divisor);
0603     bitcount_t logdiff = log2_dividend - log2_divisor;
0604
0605     constexpr uint_x4<UInt,UIntX2> ONE(UIntX2(1));
0606     if (logdiff == 0)
0607         return { ONE, dividend - divisor };
0608
0609     // Now we change the log difference to
0610     //  floor(log2(divisor)) - ceil(log2(dividend))
0611     // to ensure that we *underestimate* the result.
0612     logdiff -= 1;
0613
0614     uint_x4<UInt,UIntX2> quotient(UIntX2(0));
0615
0616     auto qfactor = ONE << logdiff;
0617     auto factor  = divisor << logdiff;
0618
0619     do {
0620         dividend -= factor;
0621         quotient += qfactor;
0622         while (dividend < factor) {
0623             factor  >>= 1;
0624             qfactor >>= 1;
0625         }
0626     } while (dividend >= divisor);
0627
0628     return { quotient, dividend };
0629 }
0630
0631 template <typename UInt, typename UIntX2>
0632 uint_x4<UInt,UIntX2> operator/(const uint_x4<UInt,UIntX2>& dividend,
0633                                const uint_x4<UInt,UIntX2>& divisor)
0634 {
0635     return divmod(dividend, divisor).first;
0636 }
0637
0638 template <typename UInt, typename UIntX2>
0639 uint_x4<UInt,UIntX2> operator%(const uint_x4<UInt,UIntX2>& dividend,
0640                                const uint_x4<UInt,UIntX2>& divisor)
0641 {
0642     return divmod(dividend, divisor).second;
0643 }
0644
0645
0646 template <typename UInt, typename UIntX2>
0647 uint_x4<UInt,UIntX2> operator*(const uint_x4<UInt,UIntX2>& a,
0648                                const uint_x4<UInt,UIntX2>& b)
0649 {
0650     constexpr auto UINT_BITS = uint_x4<UInt,UIntX2>::UINT_BITS;
0651     uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0652     bool carryin = false;
0653     bool carryout;
0654     UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(b.w.v0);
0655     r.w.v0 = UInt(a0b0);
0656     r.w.v1 = UInt(a0b0 >> UINT_BITS);
0657
0658     UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(b.w.v0);
0659     r.w.v2 = UInt(a1b0 >> UINT_BITS);
0660     r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout);
0661     carryin = carryout;
0662     r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
0663     carryin = carryout;
0664     r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0665
0666     UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b.w.v1);
0667     carryin = false;
0668     r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> UINT_BITS), carryin, &carryout);
0669     carryin = carryout;
0670     r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0671
0672     carryin = false;
0673     r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout);
0674     carryin = carryout;
0675     r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
0676     carryin = carryout;
0677     r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0678
0679     UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b.w.v1);
0680     carryin = false;
0681     r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout);
0682     carryin = carryout;
0683     r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> UINT_BITS), carryin, &carryout);
0684
0685     r.d.v23 += a.d.v01 * b.d.v23 + a.d.v23 * b.d.v01;
0686
0687     return r;
0688 }
0689
0690
0691 template <typename UInt, typename UIntX2>
0692 uint_x4<UInt,UIntX2> operator*(const uint_x4<UInt,UIntX2>& a,
0693                                UIntX2 b01)
0694 {
0695     constexpr auto UINT_BITS = uint_x4<UInt,UIntX2>::UINT_BITS;
0696     uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0697     bool carryin = false;
0698     bool carryout;
0699     UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(UInt(b01));
0700     r.w.v0 = UInt(a0b0);
0701     r.w.v1 = UInt(a0b0 >> UINT_BITS);
0702
0703     UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(UInt(b01));
0704     r.w.v2 = UInt(a1b0 >> UINT_BITS);
0705     r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout);
0706     carryin = carryout;
0707     r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
0708     carryin = carryout;
0709     r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0710
0711     UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b01 >> UINT_BITS);
0712     carryin = false;
0713     r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> UINT_BITS), carryin, &carryout);
0714     carryin = carryout;
0715     r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0716
0717     carryin = false;
0718     r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout);
0719     carryin = carryout;
0720     r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
0721     carryin = carryout;
0722     r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0723
0724     UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b01 >> UINT_BITS);
0725     carryin = false;
0726     r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout);
0727     carryin = carryout;
0728     r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> UINT_BITS), carryin, &carryout);
0729
0730     r.d.v23 += a.d.v23 * b01;
0731
0732     return r;
0733 }
0734
0735 #if PCG_64BIT_SPECIALIZATIONS
0736 #if defined(_MSC_VER)
0737 #pragma intrinsic(_umul128)
0738 #endif
0739
0740 #if defined(_MSC_VER) || __SIZEOF_INT128__
0741 template <typename UInt32>
0742 uint_x4<UInt32,uint64_t> operator*(const uint_x4<UInt32,uint64_t>& a,
0743                    const uint_x4<UInt32,uint64_t>& b)
0744 {
0745 #if defined(_MSC_VER)
0746     uint64_t hi;
0747     uint64_t lo = _umul128(a.d.v01, b.d.v01, &hi);
0748 #else
0749     __uint128_t r = __uint128_t(a.d.v01) * __uint128_t(b.d.v01);
0750     uint64_t lo = uint64_t(r);
0751     uint64_t hi = r >> 64;
0752 #endif
0753     hi += a.d.v23 * b.d.v01 + a.d.v01 * b.d.v23;
0754     return {hi, lo};
0755 }
0756 #endif
0757 #endif
0758
0759
0760 template <typename UInt, typename UIntX2>
0761 uint_x4<UInt,UIntX2> operator+(const uint_x4<UInt,UIntX2>& a,
0762                                const uint_x4<UInt,UIntX2>& b)
0763 {
0764     uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0765
0766     bool carryin = false;
0767     bool carryout;
0768     r.w.v0 = addwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
0769     carryin = carryout;
0770     r.w.v1 = addwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
0771     carryin = carryout;
0772     r.w.v2 = addwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
0773     carryin = carryout;
0774     r.w.v3 = addwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
0775
0776     return r;
0777 }
0778
0779 template <typename UInt, typename UIntX2>
0780 uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& a,
0781                                const uint_x4<UInt,UIntX2>& b)
0782 {
0783     uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0784
0785     bool carryin = false;
0786     bool carryout;
0787     r.w.v0 = subwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
0788     carryin = carryout;
0789     r.w.v1 = subwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
0790     carryin = carryout;
0791     r.w.v2 = subwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
0792     carryin = carryout;
0793     r.w.v3 = subwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
0794
0795     return r;
0796 }
0797
0798 #if PCG_64BIT_SPECIALIZATIONS
0799 template <typename UInt32>
0800 uint_x4<UInt32,uint64_t> operator+(const uint_x4<UInt32,uint64_t>& a,
0801                    const uint_x4<UInt32,uint64_t>& b)
0802 {
0803     uint_x4<UInt32,uint64_t> r = {uint64_t(0u), uint64_t(0u)};
0804
0805     bool carryin = false;
0806     bool carryout;
0807     r.d.v01 = addwithcarry(a.d.v01, b.d.v01, carryin, &carryout);
0808     carryin = carryout;
0809     r.d.v23 = addwithcarry(a.d.v23, b.d.v23, carryin, &carryout);
0810
0811     return r;
0812 }
0813
0814 template <typename UInt32>
0815 uint_x4<UInt32,uint64_t> operator-(const uint_x4<UInt32,uint64_t>& a,
0816                    const uint_x4<UInt32,uint64_t>& b)
0817 {
0818     uint_x4<UInt32,uint64_t> r = {uint64_t(0u), uint64_t(0u)};
0819
0820     bool carryin = false;
0821     bool carryout;
0822     r.d.v01 = subwithcarry(a.d.v01, b.d.v01, carryin, &carryout);
0823     carryin = carryout;
0824     r.d.v23 = subwithcarry(a.d.v23, b.d.v23, carryin, &carryout);
0825
0826     return r;
0827 }
0828 #endif
0829
0830 template <typename UInt, typename UIntX2>
0831 uint_x4<UInt,UIntX2> operator&(const uint_x4<UInt,UIntX2>& a,
0832                                const uint_x4<UInt,UIntX2>& b)
0833 {
0834     return uint_x4<UInt,UIntX2>(a.d.v23 & b.d.v23, a.d.v01 & b.d.v01);
0835 }
0836
0837 template <typename UInt, typename UIntX2>
0838 uint_x4<UInt,UIntX2> operator|(const uint_x4<UInt,UIntX2>& a,
0839                                const uint_x4<UInt,UIntX2>& b)
0840 {
0841     return uint_x4<UInt,UIntX2>(a.d.v23 | b.d.v23, a.d.v01 | b.d.v01);
0842 }
0843
0844 template <typename UInt, typename UIntX2>
0845 uint_x4<UInt,UIntX2> operator^(const uint_x4<UInt,UIntX2>& a,
0846                                const uint_x4<UInt,UIntX2>& b)
0847 {
0848     return uint_x4<UInt,UIntX2>(a.d.v23 ^ b.d.v23, a.d.v01 ^ b.d.v01);
0849 }
0850
0851 template <typename UInt, typename UIntX2>
0852 uint_x4<UInt,UIntX2> operator~(const uint_x4<UInt,UIntX2>& v)
0853 {
0854     return uint_x4<UInt,UIntX2>(~v.d.v23, ~v.d.v01);
0855 }
0856
0857 template <typename UInt, typename UIntX2>
0858 uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& v)
0859 {
0860     return uint_x4<UInt,UIntX2>(0UL,0UL) - v;
0861 }
0862
0863 template <typename UInt, typename UIntX2>
0864 bool operator==(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0865 {
0866     return (a.d.v01 == b.d.v01) && (a.d.v23 == b.d.v23);
0867 }
0868
0869 template <typename UInt, typename UIntX2>
0870 bool operator!=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0871 {
0872     return !operator==(a,b);
0873 }
0874
0875
0876 template <typename UInt, typename UIntX2>
0877 bool operator<(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0878 {
0879     return (a.d.v23 < b.d.v23)
0880            || ((a.d.v23 == b.d.v23) && (a.d.v01 < b.d.v01));
0881 }
0882
0883 template <typename UInt, typename UIntX2>
0884 bool operator>(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0885 {
0886     return operator<(b,a);
0887 }
0888
0889 template <typename UInt, typename UIntX2>
0890 bool operator<=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0891 {
0892     return !(operator<(b,a));
0893 }
0894
0895 template <typename UInt, typename UIntX2>
0896 bool operator>=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0897 {
0898     return !(operator<(a,b));
0899 }
0900
0901
0902
0903 template <typename UInt, typename UIntX2>
0904 uint_x4<UInt,UIntX2> operator<<(const uint_x4<UInt,UIntX2>& v,
0905                                 const bitcount_t shift)
0906 {
0907     uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0908     const bitcount_t bits    = uint_x4<UInt,UIntX2>::UINT_BITS;
0909     const bitcount_t bitmask = bits - 1;
0910     const bitcount_t shiftdiv = shift / bits;
0911     const bitcount_t shiftmod = shift & bitmask;
0912
0913     if (shiftmod) {
0914         UInt carryover = 0;
0915 #if PCG_LITTLE_ENDIAN
0916         for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
0917 #else
0918         for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
0919             --out, --in;
0920 #endif
0921             r.wa[out] = (v.wa[in] << shiftmod) | carryover;
0922             carryover = (v.wa[in] >> (bits - shiftmod));
0923         }
0924     } else {
0925 #if PCG_LITTLE_ENDIAN
0926         for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
0927 #else
0928         for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
0929             --out, --in;
0930 #endif
0931             r.wa[out] = v.wa[in];
0932         }
0933     }
0934
0935     return r;
0936 }
0937
0938 template <typename UInt, typename UIntX2>
0939 uint_x4<UInt,UIntX2> operator>>(const uint_x4<UInt,UIntX2>& v,
0940                                 const bitcount_t shift)
0941 {
0942     uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0943     const bitcount_t bits    = uint_x4<UInt,UIntX2>::UINT_BITS;
0944     const bitcount_t bitmask = bits - 1;
0945     const bitcount_t shiftdiv = shift / bits;
0946     const bitcount_t shiftmod = shift & bitmask;
0947
0948     if (shiftmod) {
0949         UInt carryover = 0;
0950 #if PCG_LITTLE_ENDIAN
0951         for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
0952             --out, --in;
0953 #else
0954         for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
0955 #endif
0956             r.wa[out] = (v.wa[in] >> shiftmod) | carryover;
0957             carryover = (v.wa[in] << (bits - shiftmod));
0958         }
0959     } else {
0960 #if PCG_LITTLE_ENDIAN
0961         for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
0962             --out, --in;
0963 #else
0964         for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
0965 #endif
0966             r.wa[out] = v.wa[in];
0967         }
0968     }
0969
0970     return r;
0971 }
0972
0973 #if PCG_64BIT_SPECIALIZATIONS
0974 template <typename UInt32>
0975 uint_x4<UInt32,uint64_t> operator<<(const uint_x4<UInt32,uint64_t>& v,
0976                     const bitcount_t shift)
0977 {
0978     constexpr bitcount_t bits2   = uint_x4<UInt32,uint64_t>::UINT_BITS * 2;
0979
0980     if (shift >= bits2) {
0981         return {v.d.v01 << (shift-bits2), uint64_t(0u)};
0982     } else {
0983         return {shift ? (v.d.v23 << shift) | (v.d.v01 >> (bits2-shift))
0984                       : v.d.v23,
0985                 v.d.v01 << shift};
0986     }
0987 }
0988
0989 template <typename UInt32>
0990 uint_x4<UInt32,uint64_t> operator>>(const uint_x4<UInt32,uint64_t>& v,
0991                     const bitcount_t shift)
0992 {
0993     constexpr bitcount_t bits2   = uint_x4<UInt32,uint64_t>::UINT_BITS * 2;
0994
0995     if (shift >= bits2) {
0996         return {uint64_t(0u), v.d.v23 >> (shift-bits2)};
0997     } else {
0998         return {v.d.v23 >> shift,
0999                 shift ? (v.d.v01 >> shift) | (v.d.v23 << (bits2-shift))
1000                       : v.d.v01};
1001     }
1002 }
1003 #endif
1004
1005 } // namespace pcg_extras
1006 } // namespace arrow_vendored
1007
1008 #endif // PCG_UINT128_HPP_INCLUDED