Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:27:16

0001 /*
0002  * PCG Random Number Generation for C++
0003  *
0004  * Copyright 2014-2021 Melissa O'Neill <oneill@pcg-random.org>,
0005  *                     and the PCG Project contributors.
0006  *
0007  * SPDX-License-Identifier: (Apache-2.0 OR MIT)
0008  *
0009  * Licensed under the Apache License, Version 2.0 (provided in
0010  * LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0)
0011  * or under the MIT license (provided in LICENSE-MIT.txt and at
0012  * http://opensource.org/licenses/MIT), at your option. This file may not
0013  * be copied, modified, or distributed except according to those terms.
0014  *
0015  * Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either
0016  * express or implied.  See your chosen license for details.
0017  *
0018  * For additional information about the PCG random number generation scheme,
0019  * visit http://www.pcg-random.org/.
0020  */
0021 
0022 /*
0023  * This code provides a a C++ class that can provide 128-bit (or higher)
0024  * integers.  To produce 2K-bit integers, it uses two K-bit integers,
0025  * placed in a union that allowes the code to also see them as four K/2 bit
0026  * integers (and access them either directly name, or by index).
0027  *
0028  * It may seem like we're reinventing the wheel here, because several
0029  * libraries already exist that support large integers, but most existing
0030  * libraries provide a very generic multiprecision code, but here we're
0031  * operating at a fixed size.  Also, most other libraries are fairly
0032  * heavyweight.  So we use a direct implementation.  Sadly, it's much slower
0033  * than hand-coded assembly or direct CPU support.
0034  */
0035 
0036 #ifndef PCG_UINT128_HPP_INCLUDED
0037 #define PCG_UINT128_HPP_INCLUDED 1
0038 
0039 #include <cstdint>
0040 #include <cstdio>
0041 #include <cassert>
0042 #include <climits>
0043 #include <utility>
0044 #include <initializer_list>
0045 #include <type_traits>
0046 
0047 #if defined(_MSC_VER)  // Use MSVC++ intrinsics
0048 #include <intrin.h>
0049 #endif
0050 
0051 /*
0052  * We want to lay the type out the same way that a native type would be laid
0053  * out, which means we must know the machine's endian, at compile time.
0054  * This ugliness attempts to do so.
0055  */
0056 
0057 #ifndef PCG_LITTLE_ENDIAN
0058     #if defined(__BYTE_ORDER__)
0059         #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
0060             #define PCG_LITTLE_ENDIAN 1
0061         #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
0062             #define PCG_LITTLE_ENDIAN 0
0063         #else
0064             #error __BYTE_ORDER__ does not match a standard endian, pick a side
0065         #endif
0066     #elif __LITTLE_ENDIAN__ || _LITTLE_ENDIAN
0067         #define PCG_LITTLE_ENDIAN 1
0068     #elif __BIG_ENDIAN__ || _BIG_ENDIAN
0069         #define PCG_LITTLE_ENDIAN 0
0070     #elif __x86_64 || __x86_64__ || _M_X64 || __i386 || __i386__ || _M_IX86
0071         #define PCG_LITTLE_ENDIAN 1
0072     #elif __powerpc__ || __POWERPC__ || __ppc__ || __PPC__ \
0073           || __m68k__ || __mc68000__
0074         #define PCG_LITTLE_ENDIAN 0
0075     #else
0076         #error Unable to determine target endianness
0077     #endif
0078 #endif
0079 
0080 #if INTPTR_MAX == INT64_MAX && !defined(PCG_64BIT_SPECIALIZATIONS)
0081     #define PCG_64BIT_SPECIALIZATIONS 1
0082 #endif
0083 
0084 namespace arrow_vendored {
0085 namespace pcg_extras {
0086 
0087 // Recent versions of GCC have intrinsics we can use to quickly calculate
0088 // the number of leading and trailing zeros in a number.  If possible, we
0089 // use them, otherwise we fall back to old-fashioned bit twiddling to figure
0090 // them out.
0091 
0092 #ifndef PCG_BITCOUNT_T
0093     typedef uint8_t bitcount_t;
0094 #else
0095     typedef PCG_BITCOUNT_T bitcount_t;
0096 #endif
0097 
0098 /*
0099  * Provide some useful helper functions
0100  *      * flog2                 floor(log2(x))
0101  *      * trailingzeros         number of trailing zero bits
0102  */
0103 
0104 #if defined(__GNUC__)   // Any GNU-compatible compiler supporting C++11 has
0105                         // some useful intrinsics we can use.
0106 
0107 inline bitcount_t flog2(uint32_t v)
0108 {
0109     return 31 - __builtin_clz(v);
0110 }
0111 
0112 inline bitcount_t trailingzeros(uint32_t v)
0113 {
0114     return __builtin_ctz(v);
0115 }
0116 
0117 inline bitcount_t flog2(uint64_t v)
0118 {
0119 #if UINT64_MAX == ULONG_MAX
0120     return 63 - __builtin_clzl(v);
0121 #elif UINT64_MAX == ULLONG_MAX
0122     return 63 - __builtin_clzll(v);
0123 #else
0124     #error Cannot find a function for uint64_t
0125 #endif
0126 }
0127 
0128 inline bitcount_t trailingzeros(uint64_t v)
0129 {
0130 #if UINT64_MAX == ULONG_MAX
0131     return __builtin_ctzl(v);
0132 #elif UINT64_MAX == ULLONG_MAX
0133     return __builtin_ctzll(v);
0134 #else
0135     #error Cannot find a function for uint64_t
0136 #endif
0137 }
0138 
0139 #elif defined(_MSC_VER)  // Use MSVC++ intrinsics
0140 
0141 #pragma intrinsic(_BitScanReverse, _BitScanForward)
0142 #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
0143 #pragma intrinsic(_BitScanReverse64, _BitScanForward64)
0144 #endif
0145 
0146 inline bitcount_t flog2(uint32_t v)
0147 {
0148     unsigned long i;
0149     _BitScanReverse(&i, v);
0150     return bitcount_t(i);
0151 }
0152 
0153 inline bitcount_t trailingzeros(uint32_t v)
0154 {
0155     unsigned long i;
0156     _BitScanForward(&i, v);
0157     return bitcount_t(i);
0158 }
0159 
0160 inline bitcount_t flog2(uint64_t v)
0161 {
0162 #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
0163     unsigned long i;
0164     _BitScanReverse64(&i, v);
0165     return bitcount_t(i);
0166 #else
0167     // 32-bit x86
0168     uint32_t high = v >> 32;
0169     uint32_t low  = uint32_t(v);
0170     return high ? 32+flog2(high) : flog2(low);
0171 #endif
0172 }
0173 
0174 inline bitcount_t trailingzeros(uint64_t v)
0175 {
0176 #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
0177     unsigned long i;
0178     _BitScanForward64(&i, v);
0179     return bitcount_t(i);
0180 #else
0181     // 32-bit x86
0182     uint32_t high = v >> 32;
0183     uint32_t low  = uint32_t(v);
0184     return low ? trailingzeros(low) : trailingzeros(high)+32;
0185 #endif
0186 }
0187 
0188 #else                   // Otherwise, we fall back to bit twiddling
0189                         // implementations
0190 
0191 inline bitcount_t flog2(uint32_t v)
0192 {
0193     // Based on code by Eric Cole and Mark Dickinson, which appears at
0194     // https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
0195 
0196     static const uint8_t multiplyDeBruijnBitPos[32] = {
0197       0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
0198       8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
0199     };
0200 
0201     v |= v >> 1; // first round down to one less than a power of 2
0202     v |= v >> 2;
0203     v |= v >> 4;
0204     v |= v >> 8;
0205     v |= v >> 16;
0206 
0207     return multiplyDeBruijnBitPos[(uint32_t)(v * 0x07C4ACDDU) >> 27];
0208 }
0209 
0210 inline bitcount_t trailingzeros(uint32_t v)
0211 {
0212     static const uint8_t multiplyDeBruijnBitPos[32] = {
0213       0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
0214       31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
0215     };
0216 
0217     return multiplyDeBruijnBitPos[((uint32_t)((v & -v) * 0x077CB531U)) >> 27];
0218 }
0219 
0220 inline bitcount_t flog2(uint64_t v)
0221 {
0222     uint32_t high = v >> 32;
0223     uint32_t low  = uint32_t(v);
0224 
0225     return high ? 32+flog2(high) : flog2(low);
0226 }
0227 
0228 inline bitcount_t trailingzeros(uint64_t v)
0229 {
0230     uint32_t high = v >> 32;
0231     uint32_t low  = uint32_t(v);
0232 
0233     return low ? trailingzeros(low) : trailingzeros(high)+32;
0234 }
0235 
0236 #endif
0237 
0238 inline bitcount_t flog2(uint8_t v)
0239 {
0240     return flog2(uint32_t(v));
0241 }
0242 
0243 inline bitcount_t flog2(uint16_t v)
0244 {
0245     return flog2(uint32_t(v));
0246 }
0247 
0248 #if __SIZEOF_INT128__
0249 inline bitcount_t flog2(__uint128_t v)
0250 {
0251     uint64_t high = uint64_t(v >> 64);
0252     uint64_t low  = uint64_t(v);
0253 
0254     return high ? 64+flog2(high) : flog2(low);
0255 }
0256 #endif
0257 
0258 inline bitcount_t trailingzeros(uint8_t v)
0259 {
0260     return trailingzeros(uint32_t(v));
0261 }
0262 
0263 inline bitcount_t trailingzeros(uint16_t v)
0264 {
0265     return trailingzeros(uint32_t(v));
0266 }
0267 
0268 #if __SIZEOF_INT128__
0269 inline bitcount_t trailingzeros(__uint128_t v)
0270 {
0271     uint64_t high = uint64_t(v >> 64);
0272     uint64_t low  = uint64_t(v);
0273     return low ? trailingzeros(low) : trailingzeros(high)+64;
0274 }
0275 #endif
0276 
0277 template <typename UInt>
0278 inline bitcount_t clog2(UInt v)
0279 {
0280     return flog2(v) + ((v & (-v)) != v);
0281 }
0282 
0283 template <typename UInt>
0284 inline UInt addwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
0285 {
0286     UInt half_result = y + carryin;
0287     UInt result = x + half_result;
0288     *carryout = (half_result < y) || (result < x);
0289     return result;
0290 }
0291 
0292 template <typename UInt>
0293 inline UInt subwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
0294 {
0295     UInt half_result = y + carryin;
0296     UInt result = x - half_result;
0297     *carryout = (half_result < y) || (result > x);
0298     return result;
0299 }
0300 
0301 
0302 template <typename UInt, typename UIntX2>
0303 class uint_x4 {
0304 // private:
0305     static constexpr unsigned int UINT_BITS = sizeof(UInt) * CHAR_BIT;
0306 public:
0307     union {
0308 #if PCG_LITTLE_ENDIAN
0309         struct {
0310             UInt v0, v1, v2, v3;
0311         } w;
0312         struct {
0313             UIntX2 v01, v23;
0314         } d;
0315 #else
0316         struct {
0317             UInt v3, v2, v1, v0;
0318         } w;
0319         struct {
0320             UIntX2 v23, v01;
0321         } d;
0322 #endif
0323         // For the array access versions, the code that uses the array
0324         // must handle endian itself.  Yuck.
0325         UInt wa[4];
0326     };
0327 
0328 public:
0329     uint_x4() = default;
0330 
0331     constexpr uint_x4(UInt v3, UInt v2, UInt v1, UInt v0)
0332 #if PCG_LITTLE_ENDIAN
0333        : w{v0, v1, v2, v3}
0334 #else
0335        : w{v3, v2, v1, v0}
0336 #endif
0337     {
0338         // Nothing (else) to do
0339     }
0340 
0341     constexpr uint_x4(UIntX2 v23, UIntX2 v01)
0342 #if PCG_LITTLE_ENDIAN
0343        : d{v01,v23}
0344 #else
0345        : d{v23,v01}
0346 #endif
0347     {
0348         // Nothing (else) to do
0349     }
0350 
0351     constexpr uint_x4(UIntX2 v01)
0352 #if PCG_LITTLE_ENDIAN
0353        : d{v01, UIntX2(0)}
0354 #else
0355        : d{UIntX2(0),v01}
0356 #endif
0357     {
0358         // Nothing (else) to do
0359     }
0360 
0361     template<class Integral,
0362              typename std::enable_if<(std::is_integral<Integral>::value
0363                                       && sizeof(Integral) <= sizeof(UIntX2))
0364                                     >::type* = nullptr>
0365     constexpr uint_x4(Integral v01)
0366 #if PCG_LITTLE_ENDIAN
0367        : d{UIntX2(v01), UIntX2(0)}
0368 #else
0369        : d{UIntX2(0), UIntX2(v01)}
0370 #endif
0371     {
0372         // Nothing (else) to do
0373     }
0374 
0375     explicit constexpr operator UIntX2() const
0376     {
0377         return d.v01;
0378     }
0379 
0380     template<class Integral,
0381              typename std::enable_if<(std::is_integral<Integral>::value
0382                                       && sizeof(Integral) <= sizeof(UIntX2))
0383                                     >::type* = nullptr>
0384     explicit constexpr operator Integral() const
0385     {
0386         return Integral(d.v01);
0387     }
0388 
0389     explicit constexpr operator bool() const
0390     {
0391         return d.v01 || d.v23;
0392     }
0393 
0394     template<typename U, typename V>
0395     friend uint_x4<U,V> operator*(const uint_x4<U,V>&, const uint_x4<U,V>&);
0396 
0397     template<typename U, typename V>
0398     friend uint_x4<U,V> operator*(const uint_x4<U,V>&, V);
0399 
0400     template<typename U, typename V>
0401     friend std::pair< uint_x4<U,V>,uint_x4<U,V> >
0402         divmod(const uint_x4<U,V>&, const uint_x4<U,V>&);
0403 
0404     template<typename U, typename V>
0405     friend uint_x4<U,V> operator+(const uint_x4<U,V>&, const uint_x4<U,V>&);
0406 
0407     template<typename U, typename V>
0408     friend uint_x4<U,V> operator-(const uint_x4<U,V>&, const uint_x4<U,V>&);
0409 
0410     template<typename U, typename V>
0411     friend uint_x4<U,V> operator<<(const uint_x4<U,V>&, const bitcount_t shift);
0412 
0413     template<typename U, typename V>
0414     friend uint_x4<U,V> operator>>(const uint_x4<U,V>&, const bitcount_t shift);
0415 
0416 #if PCG_64BIT_SPECIALIZATIONS
0417     template<typename U>
0418     friend uint_x4<U,uint64_t> operator<<(const uint_x4<U,uint64_t>&, const bitcount_t shift);
0419 
0420     template<typename U>
0421     friend uint_x4<U,uint64_t> operator>>(const uint_x4<U,uint64_t>&, const bitcount_t shift);
0422 #endif
0423 
0424     template<typename U, typename V>
0425     friend uint_x4<U,V> operator&(const uint_x4<U,V>&, const uint_x4<U,V>&);
0426 
0427     template<typename U, typename V>
0428     friend uint_x4<U,V> operator|(const uint_x4<U,V>&, const uint_x4<U,V>&);
0429 
0430     template<typename U, typename V>
0431     friend uint_x4<U,V> operator^(const uint_x4<U,V>&, const uint_x4<U,V>&);
0432 
0433     template<typename U, typename V>
0434     friend bool operator==(const uint_x4<U,V>&, const uint_x4<U,V>&);
0435 
0436     template<typename U, typename V>
0437     friend bool operator!=(const uint_x4<U,V>&, const uint_x4<U,V>&);
0438 
0439     template<typename U, typename V>
0440     friend bool operator<(const uint_x4<U,V>&, const uint_x4<U,V>&);
0441 
0442     template<typename U, typename V>
0443     friend bool operator<=(const uint_x4<U,V>&, const uint_x4<U,V>&);
0444 
0445     template<typename U, typename V>
0446     friend bool operator>(const uint_x4<U,V>&, const uint_x4<U,V>&);
0447 
0448     template<typename U, typename V>
0449     friend bool operator>=(const uint_x4<U,V>&, const uint_x4<U,V>&);
0450 
0451     template<typename U, typename V>
0452     friend uint_x4<U,V> operator~(const uint_x4<U,V>&);
0453 
0454     template<typename U, typename V>
0455     friend uint_x4<U,V> operator-(const uint_x4<U,V>&);
0456 
0457     template<typename U, typename V>
0458     friend bitcount_t flog2(const uint_x4<U,V>&);
0459 
0460     template<typename U, typename V>
0461     friend bitcount_t trailingzeros(const uint_x4<U,V>&);
0462 
0463 #if PCG_64BIT_SPECIALIZATIONS
0464     template<typename U>
0465     friend bitcount_t flog2(const uint_x4<U,uint64_t>&);
0466 
0467     template<typename U>
0468     friend bitcount_t trailingzeros(const uint_x4<U,uint64_t>&);
0469 #endif
0470 
0471     uint_x4& operator*=(const uint_x4& rhs)
0472     {
0473         uint_x4 result = *this * rhs;
0474         return *this = result;
0475     }
0476 
0477     uint_x4& operator*=(UIntX2 rhs)
0478     {
0479         uint_x4 result = *this * rhs;
0480         return *this = result;
0481     }
0482 
0483     uint_x4& operator/=(const uint_x4& rhs)
0484     {
0485         uint_x4 result = *this / rhs;
0486         return *this = result;
0487     }
0488 
0489     uint_x4& operator%=(const uint_x4& rhs)
0490     {
0491         uint_x4 result = *this % rhs;
0492         return *this = result;
0493     }
0494 
0495     uint_x4& operator+=(const uint_x4& rhs)
0496     {
0497         uint_x4 result = *this + rhs;
0498         return *this = result;
0499     }
0500 
0501     uint_x4& operator-=(const uint_x4& rhs)
0502     {
0503         uint_x4 result = *this - rhs;
0504         return *this = result;
0505     }
0506 
0507     uint_x4& operator&=(const uint_x4& rhs)
0508     {
0509         uint_x4 result = *this & rhs;
0510         return *this = result;
0511     }
0512 
0513     uint_x4& operator|=(const uint_x4& rhs)
0514     {
0515         uint_x4 result = *this | rhs;
0516         return *this = result;
0517     }
0518 
0519     uint_x4& operator^=(const uint_x4& rhs)
0520     {
0521         uint_x4 result = *this ^ rhs;
0522         return *this = result;
0523     }
0524 
0525     uint_x4& operator>>=(bitcount_t shift)
0526     {
0527         uint_x4 result = *this >> shift;
0528         return *this = result;
0529     }
0530 
0531     uint_x4& operator<<=(bitcount_t shift)
0532     {
0533         uint_x4 result = *this << shift;
0534         return *this = result;
0535     }
0536 
0537 };
0538 
0539 template<typename U, typename V>
0540 bitcount_t flog2(const uint_x4<U,V>& v)
0541 {
0542 #if PCG_LITTLE_ENDIAN
0543     for (uint8_t i = 4; i !=0; /* dec in loop */) {
0544         --i;
0545 #else
0546     for (uint8_t i = 0; i < 4; ++i) {
0547 #endif
0548         if (v.wa[i] == 0)
0549              continue;
0550         return flog2(v.wa[i]) + uint_x4<U,V>::UINT_BITS*i;
0551     }
0552     abort();
0553 }
0554 
0555 template<typename U, typename V>
0556 bitcount_t trailingzeros(const uint_x4<U,V>& v)
0557 {
0558 #if PCG_LITTLE_ENDIAN
0559     for (uint8_t i = 0; i < 4; ++i) {
0560 #else
0561     for (uint8_t i = 4; i !=0; /* dec in loop */) {
0562         --i;
0563 #endif
0564         if (v.wa[i] != 0)
0565             return trailingzeros(v.wa[i]) + uint_x4<U,V>::UINT_BITS*i;
0566     }
0567     return uint_x4<U,V>::UINT_BITS*4;
0568 }
0569 
0570 #if PCG_64BIT_SPECIALIZATIONS
0571 template<typename UInt32>
0572 bitcount_t flog2(const uint_x4<UInt32,uint64_t>& v)
0573 {
0574     return v.d.v23 > 0 ? flog2(v.d.v23) + uint_x4<UInt32,uint64_t>::UINT_BITS*2
0575                        : flog2(v.d.v01);
0576 }
0577 
0578 template<typename UInt32>
0579 bitcount_t trailingzeros(const uint_x4<UInt32,uint64_t>& v)
0580 {
0581     return v.d.v01 == 0 ? trailingzeros(v.d.v23) + uint_x4<UInt32,uint64_t>::UINT_BITS*2
0582                         : trailingzeros(v.d.v01);
0583 }
0584 #endif
0585 
0586 template <typename UInt, typename UIntX2>
0587 std::pair< uint_x4<UInt,UIntX2>, uint_x4<UInt,UIntX2> >
0588     divmod(const uint_x4<UInt,UIntX2>& orig_dividend,
0589            const uint_x4<UInt,UIntX2>& divisor)
0590 {
0591     // If the dividend is less than the divisor, the answer is always zero.
0592     // This takes care of boundary cases like 0/x (which would otherwise be
0593     // problematic because we can't take the log of zero.  (The boundary case
0594     // of division by zero is undefined.)
0595     if (orig_dividend < divisor)
0596         return { uint_x4<UInt,UIntX2>(UIntX2(0)), orig_dividend };
0597 
0598     auto dividend = orig_dividend;
0599 
0600     auto log2_divisor  = flog2(divisor);
0601     auto log2_dividend = flog2(dividend);
0602     // assert(log2_dividend >= log2_divisor);
0603     bitcount_t logdiff = log2_dividend - log2_divisor;
0604 
0605     constexpr uint_x4<UInt,UIntX2> ONE(UIntX2(1));
0606     if (logdiff == 0)
0607         return { ONE, dividend - divisor };
0608 
0609     // Now we change the log difference to
0610     //  floor(log2(divisor)) - ceil(log2(dividend))
0611     // to ensure that we *underestimate* the result.
0612     logdiff -= 1;
0613 
0614     uint_x4<UInt,UIntX2> quotient(UIntX2(0));
0615 
0616     auto qfactor = ONE << logdiff;
0617     auto factor  = divisor << logdiff;
0618 
0619     do {
0620         dividend -= factor;
0621         quotient += qfactor;
0622         while (dividend < factor) {
0623             factor  >>= 1;
0624             qfactor >>= 1;
0625         }
0626     } while (dividend >= divisor);
0627 
0628     return { quotient, dividend };
0629 }
0630 
0631 template <typename UInt, typename UIntX2>
0632 uint_x4<UInt,UIntX2> operator/(const uint_x4<UInt,UIntX2>& dividend,
0633                                const uint_x4<UInt,UIntX2>& divisor)
0634 {
0635     return divmod(dividend, divisor).first;
0636 }
0637 
0638 template <typename UInt, typename UIntX2>
0639 uint_x4<UInt,UIntX2> operator%(const uint_x4<UInt,UIntX2>& dividend,
0640                                const uint_x4<UInt,UIntX2>& divisor)
0641 {
0642     return divmod(dividend, divisor).second;
0643 }
0644 
0645 
0646 template <typename UInt, typename UIntX2>
0647 uint_x4<UInt,UIntX2> operator*(const uint_x4<UInt,UIntX2>& a,
0648                                const uint_x4<UInt,UIntX2>& b)
0649 {
0650     constexpr auto UINT_BITS = uint_x4<UInt,UIntX2>::UINT_BITS;
0651     uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0652     bool carryin = false;
0653     bool carryout;
0654     UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(b.w.v0);
0655     r.w.v0 = UInt(a0b0);
0656     r.w.v1 = UInt(a0b0 >> UINT_BITS);
0657 
0658     UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(b.w.v0);
0659     r.w.v2 = UInt(a1b0 >> UINT_BITS);
0660     r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout);
0661     carryin = carryout;
0662     r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
0663     carryin = carryout;
0664     r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0665 
0666     UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b.w.v1);
0667     carryin = false;
0668     r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> UINT_BITS), carryin, &carryout);
0669     carryin = carryout;
0670     r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0671 
0672     carryin = false;
0673     r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout);
0674     carryin = carryout;
0675     r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
0676     carryin = carryout;
0677     r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0678 
0679     UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b.w.v1);
0680     carryin = false;
0681     r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout);
0682     carryin = carryout;
0683     r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> UINT_BITS), carryin, &carryout);
0684 
0685     r.d.v23 += a.d.v01 * b.d.v23 + a.d.v23 * b.d.v01;
0686 
0687     return r;
0688 }
0689 
0690  
0691 template <typename UInt, typename UIntX2>
0692 uint_x4<UInt,UIntX2> operator*(const uint_x4<UInt,UIntX2>& a,
0693                                UIntX2 b01)
0694 {
0695     constexpr auto UINT_BITS = uint_x4<UInt,UIntX2>::UINT_BITS;
0696     uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0697     bool carryin = false;
0698     bool carryout;
0699     UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(UInt(b01));
0700     r.w.v0 = UInt(a0b0);
0701     r.w.v1 = UInt(a0b0 >> UINT_BITS);
0702 
0703     UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(UInt(b01));
0704     r.w.v2 = UInt(a1b0 >> UINT_BITS);
0705     r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout);
0706     carryin = carryout;
0707     r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
0708     carryin = carryout;
0709     r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0710 
0711     UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b01 >> UINT_BITS);
0712     carryin = false;
0713     r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> UINT_BITS), carryin, &carryout);
0714     carryin = carryout;
0715     r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0716 
0717     carryin = false;
0718     r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout);
0719     carryin = carryout;
0720     r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
0721     carryin = carryout;
0722     r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0723 
0724     UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b01 >> UINT_BITS);
0725     carryin = false;
0726     r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout);
0727     carryin = carryout;
0728     r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> UINT_BITS), carryin, &carryout);
0729 
0730     r.d.v23 += a.d.v23 * b01;
0731 
0732     return r;
0733 }
0734 
0735 #if PCG_64BIT_SPECIALIZATIONS
0736 #if defined(_MSC_VER)
0737 #pragma intrinsic(_umul128)
0738 #endif
0739 
0740 #if defined(_MSC_VER) || __SIZEOF_INT128__
0741 template <typename UInt32>
0742 uint_x4<UInt32,uint64_t> operator*(const uint_x4<UInt32,uint64_t>& a,
0743                    const uint_x4<UInt32,uint64_t>& b)
0744 {
0745 #if defined(_MSC_VER)
0746     uint64_t hi;
0747     uint64_t lo = _umul128(a.d.v01, b.d.v01, &hi);
0748 #else
0749     __uint128_t r = __uint128_t(a.d.v01) * __uint128_t(b.d.v01);
0750     uint64_t lo = uint64_t(r);
0751     uint64_t hi = r >> 64;
0752 #endif
0753     hi += a.d.v23 * b.d.v01 + a.d.v01 * b.d.v23;
0754     return {hi, lo};
0755 }
0756 #endif
0757 #endif
0758 
0759 
0760 template <typename UInt, typename UIntX2>
0761 uint_x4<UInt,UIntX2> operator+(const uint_x4<UInt,UIntX2>& a,
0762                                const uint_x4<UInt,UIntX2>& b)
0763 {
0764     uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0765 
0766     bool carryin = false;
0767     bool carryout;
0768     r.w.v0 = addwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
0769     carryin = carryout;
0770     r.w.v1 = addwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
0771     carryin = carryout;
0772     r.w.v2 = addwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
0773     carryin = carryout;
0774     r.w.v3 = addwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
0775 
0776     return r;
0777 }
0778 
0779 template <typename UInt, typename UIntX2>
0780 uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& a,
0781                                const uint_x4<UInt,UIntX2>& b)
0782 {
0783     uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0784 
0785     bool carryin = false;
0786     bool carryout;
0787     r.w.v0 = subwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
0788     carryin = carryout;
0789     r.w.v1 = subwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
0790     carryin = carryout;
0791     r.w.v2 = subwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
0792     carryin = carryout;
0793     r.w.v3 = subwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
0794 
0795     return r;
0796 }
0797 
0798 #if PCG_64BIT_SPECIALIZATIONS
0799 template <typename UInt32>
0800 uint_x4<UInt32,uint64_t> operator+(const uint_x4<UInt32,uint64_t>& a,
0801                    const uint_x4<UInt32,uint64_t>& b)
0802 {
0803     uint_x4<UInt32,uint64_t> r = {uint64_t(0u), uint64_t(0u)};
0804 
0805     bool carryin = false;
0806     bool carryout;
0807     r.d.v01 = addwithcarry(a.d.v01, b.d.v01, carryin, &carryout);
0808     carryin = carryout;
0809     r.d.v23 = addwithcarry(a.d.v23, b.d.v23, carryin, &carryout);
0810 
0811     return r;
0812 }
0813 
0814 template <typename UInt32>
0815 uint_x4<UInt32,uint64_t> operator-(const uint_x4<UInt32,uint64_t>& a,
0816                    const uint_x4<UInt32,uint64_t>& b)
0817 {
0818     uint_x4<UInt32,uint64_t> r = {uint64_t(0u), uint64_t(0u)};
0819 
0820     bool carryin = false;
0821     bool carryout;
0822     r.d.v01 = subwithcarry(a.d.v01, b.d.v01, carryin, &carryout);
0823     carryin = carryout;
0824     r.d.v23 = subwithcarry(a.d.v23, b.d.v23, carryin, &carryout);
0825 
0826     return r;
0827 }
0828 #endif
0829 
0830 template <typename UInt, typename UIntX2>
0831 uint_x4<UInt,UIntX2> operator&(const uint_x4<UInt,UIntX2>& a,
0832                                const uint_x4<UInt,UIntX2>& b)
0833 {
0834     return uint_x4<UInt,UIntX2>(a.d.v23 & b.d.v23, a.d.v01 & b.d.v01);
0835 }
0836 
0837 template <typename UInt, typename UIntX2>
0838 uint_x4<UInt,UIntX2> operator|(const uint_x4<UInt,UIntX2>& a,
0839                                const uint_x4<UInt,UIntX2>& b)
0840 {
0841     return uint_x4<UInt,UIntX2>(a.d.v23 | b.d.v23, a.d.v01 | b.d.v01);
0842 }
0843 
0844 template <typename UInt, typename UIntX2>
0845 uint_x4<UInt,UIntX2> operator^(const uint_x4<UInt,UIntX2>& a,
0846                                const uint_x4<UInt,UIntX2>& b)
0847 {
0848     return uint_x4<UInt,UIntX2>(a.d.v23 ^ b.d.v23, a.d.v01 ^ b.d.v01);
0849 }
0850 
0851 template <typename UInt, typename UIntX2>
0852 uint_x4<UInt,UIntX2> operator~(const uint_x4<UInt,UIntX2>& v)
0853 {
0854     return uint_x4<UInt,UIntX2>(~v.d.v23, ~v.d.v01);
0855 }
0856 
0857 template <typename UInt, typename UIntX2>
0858 uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& v)
0859 {
0860     return uint_x4<UInt,UIntX2>(0UL,0UL) - v;
0861 }
0862 
0863 template <typename UInt, typename UIntX2>
0864 bool operator==(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0865 {
0866     return (a.d.v01 == b.d.v01) && (a.d.v23 == b.d.v23);
0867 }
0868 
0869 template <typename UInt, typename UIntX2>
0870 bool operator!=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0871 {
0872     return !operator==(a,b);
0873 }
0874 
0875 
0876 template <typename UInt, typename UIntX2>
0877 bool operator<(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0878 {
0879     return (a.d.v23 < b.d.v23)
0880            || ((a.d.v23 == b.d.v23) && (a.d.v01 < b.d.v01));
0881 }
0882 
0883 template <typename UInt, typename UIntX2>
0884 bool operator>(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0885 {
0886     return operator<(b,a);
0887 }
0888 
0889 template <typename UInt, typename UIntX2>
0890 bool operator<=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0891 {
0892     return !(operator<(b,a));
0893 }
0894 
0895 template <typename UInt, typename UIntX2>
0896 bool operator>=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0897 {
0898     return !(operator<(a,b));
0899 }
0900 
0901 
0902 
0903 template <typename UInt, typename UIntX2>
0904 uint_x4<UInt,UIntX2> operator<<(const uint_x4<UInt,UIntX2>& v,
0905                                 const bitcount_t shift)
0906 {
0907     uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0908     const bitcount_t bits    = uint_x4<UInt,UIntX2>::UINT_BITS;
0909     const bitcount_t bitmask = bits - 1;
0910     const bitcount_t shiftdiv = shift / bits;
0911     const bitcount_t shiftmod = shift & bitmask;
0912 
0913     if (shiftmod) {
0914         UInt carryover = 0;
0915 #if PCG_LITTLE_ENDIAN
0916         for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
0917 #else
0918         for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
0919             --out, --in;
0920 #endif
0921             r.wa[out] = (v.wa[in] << shiftmod) | carryover;
0922             carryover = (v.wa[in] >> (bits - shiftmod));
0923         }
0924     } else {
0925 #if PCG_LITTLE_ENDIAN
0926         for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
0927 #else
0928         for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
0929             --out, --in;
0930 #endif
0931             r.wa[out] = v.wa[in];
0932         }
0933     }
0934 
0935     return r;
0936 }
0937 
0938 template <typename UInt, typename UIntX2>
0939 uint_x4<UInt,UIntX2> operator>>(const uint_x4<UInt,UIntX2>& v,
0940                                 const bitcount_t shift)
0941 {
0942     uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0943     const bitcount_t bits    = uint_x4<UInt,UIntX2>::UINT_BITS;
0944     const bitcount_t bitmask = bits - 1;
0945     const bitcount_t shiftdiv = shift / bits;
0946     const bitcount_t shiftmod = shift & bitmask;
0947 
0948     if (shiftmod) {
0949         UInt carryover = 0;
0950 #if PCG_LITTLE_ENDIAN
0951         for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
0952             --out, --in;
0953 #else
0954         for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
0955 #endif
0956             r.wa[out] = (v.wa[in] >> shiftmod) | carryover;
0957             carryover = (v.wa[in] << (bits - shiftmod));
0958         }
0959     } else {
0960 #if PCG_LITTLE_ENDIAN
0961         for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
0962             --out, --in;
0963 #else
0964         for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
0965 #endif
0966             r.wa[out] = v.wa[in];
0967         }
0968     }
0969 
0970     return r;
0971 }
0972 
0973 #if PCG_64BIT_SPECIALIZATIONS
0974 template <typename UInt32>
0975 uint_x4<UInt32,uint64_t> operator<<(const uint_x4<UInt32,uint64_t>& v,
0976                     const bitcount_t shift)
0977 {
0978     constexpr bitcount_t bits2   = uint_x4<UInt32,uint64_t>::UINT_BITS * 2;
0979     
0980     if (shift >= bits2) {
0981         return {v.d.v01 << (shift-bits2), uint64_t(0u)};
0982     } else {
0983         return {shift ? (v.d.v23 << shift) | (v.d.v01 >> (bits2-shift)) 
0984                       : v.d.v23,
0985                 v.d.v01 << shift};
0986     }
0987 }
0988 
0989 template <typename UInt32>
0990 uint_x4<UInt32,uint64_t> operator>>(const uint_x4<UInt32,uint64_t>& v,
0991                     const bitcount_t shift)
0992 {
0993     constexpr bitcount_t bits2   = uint_x4<UInt32,uint64_t>::UINT_BITS * 2;
0994     
0995     if (shift >= bits2) {
0996         return {uint64_t(0u), v.d.v23 >> (shift-bits2)};
0997     } else {
0998         return {v.d.v23 >> shift,
0999                 shift ? (v.d.v01 >> shift) | (v.d.v23 << (bits2-shift))
1000                       : v.d.v01};
1001     }
1002 }
1003 #endif
1004 
1005 } // namespace pcg_extras
1006 } // namespace arrow_vendored
1007 
1008 #endif // PCG_UINT128_HPP_INCLUDED