File indexing completed on 2025-08-28 08:27:16
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035
0036 #ifndef PCG_UINT128_HPP_INCLUDED
0037 #define PCG_UINT128_HPP_INCLUDED 1
0038
0039 #include <cstdint>
0040 #include <cstdio>
0041 #include <cassert>
0042 #include <climits>
0043 #include <utility>
0044 #include <initializer_list>
0045 #include <type_traits>
0046
0047 #if defined(_MSC_VER)
0048 #include <intrin.h>
0049 #endif
0050
0051
0052
0053
0054
0055
0056
0057 #ifndef PCG_LITTLE_ENDIAN
0058 #if defined(__BYTE_ORDER__)
0059 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
0060 #define PCG_LITTLE_ENDIAN 1
0061 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
0062 #define PCG_LITTLE_ENDIAN 0
0063 #else
0064 #error __BYTE_ORDER__ does not match a standard endian, pick a side
0065 #endif
0066 #elif __LITTLE_ENDIAN__ || _LITTLE_ENDIAN
0067 #define PCG_LITTLE_ENDIAN 1
0068 #elif __BIG_ENDIAN__ || _BIG_ENDIAN
0069 #define PCG_LITTLE_ENDIAN 0
0070 #elif __x86_64 || __x86_64__ || _M_X64 || __i386 || __i386__ || _M_IX86
0071 #define PCG_LITTLE_ENDIAN 1
0072 #elif __powerpc__ || __POWERPC__ || __ppc__ || __PPC__ \
0073 || __m68k__ || __mc68000__
0074 #define PCG_LITTLE_ENDIAN 0
0075 #else
0076 #error Unable to determine target endianness
0077 #endif
0078 #endif
0079
0080 #if INTPTR_MAX == INT64_MAX && !defined(PCG_64BIT_SPECIALIZATIONS)
0081 #define PCG_64BIT_SPECIALIZATIONS 1
0082 #endif
0083
0084 namespace arrow_vendored {
0085 namespace pcg_extras {
0086
0087
0088
0089
0090
0091
0092 #ifndef PCG_BITCOUNT_T
0093 typedef uint8_t bitcount_t;
0094 #else
0095 typedef PCG_BITCOUNT_T bitcount_t;
0096 #endif
0097
0098
0099
0100
0101
0102
0103
0104 #if defined(__GNUC__)
0105
0106
0107 inline bitcount_t flog2(uint32_t v)
0108 {
0109 return 31 - __builtin_clz(v);
0110 }
0111
0112 inline bitcount_t trailingzeros(uint32_t v)
0113 {
0114 return __builtin_ctz(v);
0115 }
0116
0117 inline bitcount_t flog2(uint64_t v)
0118 {
0119 #if UINT64_MAX == ULONG_MAX
0120 return 63 - __builtin_clzl(v);
0121 #elif UINT64_MAX == ULLONG_MAX
0122 return 63 - __builtin_clzll(v);
0123 #else
0124 #error Cannot find a function for uint64_t
0125 #endif
0126 }
0127
0128 inline bitcount_t trailingzeros(uint64_t v)
0129 {
0130 #if UINT64_MAX == ULONG_MAX
0131 return __builtin_ctzl(v);
0132 #elif UINT64_MAX == ULLONG_MAX
0133 return __builtin_ctzll(v);
0134 #else
0135 #error Cannot find a function for uint64_t
0136 #endif
0137 }
0138
0139 #elif defined(_MSC_VER)
0140
0141 #pragma intrinsic(_BitScanReverse, _BitScanForward)
0142 #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
0143 #pragma intrinsic(_BitScanReverse64, _BitScanForward64)
0144 #endif
0145
0146 inline bitcount_t flog2(uint32_t v)
0147 {
0148 unsigned long i;
0149 _BitScanReverse(&i, v);
0150 return bitcount_t(i);
0151 }
0152
0153 inline bitcount_t trailingzeros(uint32_t v)
0154 {
0155 unsigned long i;
0156 _BitScanForward(&i, v);
0157 return bitcount_t(i);
0158 }
0159
0160 inline bitcount_t flog2(uint64_t v)
0161 {
0162 #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
0163 unsigned long i;
0164 _BitScanReverse64(&i, v);
0165 return bitcount_t(i);
0166 #else
0167
0168 uint32_t high = v >> 32;
0169 uint32_t low = uint32_t(v);
0170 return high ? 32+flog2(high) : flog2(low);
0171 #endif
0172 }
0173
0174 inline bitcount_t trailingzeros(uint64_t v)
0175 {
0176 #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
0177 unsigned long i;
0178 _BitScanForward64(&i, v);
0179 return bitcount_t(i);
0180 #else
0181
0182 uint32_t high = v >> 32;
0183 uint32_t low = uint32_t(v);
0184 return low ? trailingzeros(low) : trailingzeros(high)+32;
0185 #endif
0186 }
0187
0188 #else
0189
0190
0191 inline bitcount_t flog2(uint32_t v)
0192 {
0193
0194
0195
0196 static const uint8_t multiplyDeBruijnBitPos[32] = {
0197 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
0198 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
0199 };
0200
0201 v |= v >> 1;
0202 v |= v >> 2;
0203 v |= v >> 4;
0204 v |= v >> 8;
0205 v |= v >> 16;
0206
0207 return multiplyDeBruijnBitPos[(uint32_t)(v * 0x07C4ACDDU) >> 27];
0208 }
0209
0210 inline bitcount_t trailingzeros(uint32_t v)
0211 {
0212 static const uint8_t multiplyDeBruijnBitPos[32] = {
0213 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
0214 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
0215 };
0216
0217 return multiplyDeBruijnBitPos[((uint32_t)((v & -v) * 0x077CB531U)) >> 27];
0218 }
0219
0220 inline bitcount_t flog2(uint64_t v)
0221 {
0222 uint32_t high = v >> 32;
0223 uint32_t low = uint32_t(v);
0224
0225 return high ? 32+flog2(high) : flog2(low);
0226 }
0227
0228 inline bitcount_t trailingzeros(uint64_t v)
0229 {
0230 uint32_t high = v >> 32;
0231 uint32_t low = uint32_t(v);
0232
0233 return low ? trailingzeros(low) : trailingzeros(high)+32;
0234 }
0235
0236 #endif
0237
0238 inline bitcount_t flog2(uint8_t v)
0239 {
0240 return flog2(uint32_t(v));
0241 }
0242
0243 inline bitcount_t flog2(uint16_t v)
0244 {
0245 return flog2(uint32_t(v));
0246 }
0247
0248 #if __SIZEOF_INT128__
0249 inline bitcount_t flog2(__uint128_t v)
0250 {
0251 uint64_t high = uint64_t(v >> 64);
0252 uint64_t low = uint64_t(v);
0253
0254 return high ? 64+flog2(high) : flog2(low);
0255 }
0256 #endif
0257
0258 inline bitcount_t trailingzeros(uint8_t v)
0259 {
0260 return trailingzeros(uint32_t(v));
0261 }
0262
0263 inline bitcount_t trailingzeros(uint16_t v)
0264 {
0265 return trailingzeros(uint32_t(v));
0266 }
0267
0268 #if __SIZEOF_INT128__
0269 inline bitcount_t trailingzeros(__uint128_t v)
0270 {
0271 uint64_t high = uint64_t(v >> 64);
0272 uint64_t low = uint64_t(v);
0273 return low ? trailingzeros(low) : trailingzeros(high)+64;
0274 }
0275 #endif
0276
0277 template <typename UInt>
0278 inline bitcount_t clog2(UInt v)
0279 {
0280 return flog2(v) + ((v & (-v)) != v);
0281 }
0282
0283 template <typename UInt>
0284 inline UInt addwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
0285 {
0286 UInt half_result = y + carryin;
0287 UInt result = x + half_result;
0288 *carryout = (half_result < y) || (result < x);
0289 return result;
0290 }
0291
0292 template <typename UInt>
0293 inline UInt subwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
0294 {
0295 UInt half_result = y + carryin;
0296 UInt result = x - half_result;
0297 *carryout = (half_result < y) || (result > x);
0298 return result;
0299 }
0300
0301
0302 template <typename UInt, typename UIntX2>
0303 class uint_x4 {
0304
0305 static constexpr unsigned int UINT_BITS = sizeof(UInt) * CHAR_BIT;
0306 public:
0307 union {
0308 #if PCG_LITTLE_ENDIAN
0309 struct {
0310 UInt v0, v1, v2, v3;
0311 } w;
0312 struct {
0313 UIntX2 v01, v23;
0314 } d;
0315 #else
0316 struct {
0317 UInt v3, v2, v1, v0;
0318 } w;
0319 struct {
0320 UIntX2 v23, v01;
0321 } d;
0322 #endif
0323
0324
0325 UInt wa[4];
0326 };
0327
0328 public:
0329 uint_x4() = default;
0330
0331 constexpr uint_x4(UInt v3, UInt v2, UInt v1, UInt v0)
0332 #if PCG_LITTLE_ENDIAN
0333 : w{v0, v1, v2, v3}
0334 #else
0335 : w{v3, v2, v1, v0}
0336 #endif
0337 {
0338
0339 }
0340
0341 constexpr uint_x4(UIntX2 v23, UIntX2 v01)
0342 #if PCG_LITTLE_ENDIAN
0343 : d{v01,v23}
0344 #else
0345 : d{v23,v01}
0346 #endif
0347 {
0348
0349 }
0350
0351 constexpr uint_x4(UIntX2 v01)
0352 #if PCG_LITTLE_ENDIAN
0353 : d{v01, UIntX2(0)}
0354 #else
0355 : d{UIntX2(0),v01}
0356 #endif
0357 {
0358
0359 }
0360
0361 template<class Integral,
0362 typename std::enable_if<(std::is_integral<Integral>::value
0363 && sizeof(Integral) <= sizeof(UIntX2))
0364 >::type* = nullptr>
0365 constexpr uint_x4(Integral v01)
0366 #if PCG_LITTLE_ENDIAN
0367 : d{UIntX2(v01), UIntX2(0)}
0368 #else
0369 : d{UIntX2(0), UIntX2(v01)}
0370 #endif
0371 {
0372
0373 }
0374
0375 explicit constexpr operator UIntX2() const
0376 {
0377 return d.v01;
0378 }
0379
0380 template<class Integral,
0381 typename std::enable_if<(std::is_integral<Integral>::value
0382 && sizeof(Integral) <= sizeof(UIntX2))
0383 >::type* = nullptr>
0384 explicit constexpr operator Integral() const
0385 {
0386 return Integral(d.v01);
0387 }
0388
0389 explicit constexpr operator bool() const
0390 {
0391 return d.v01 || d.v23;
0392 }
0393
0394 template<typename U, typename V>
0395 friend uint_x4<U,V> operator*(const uint_x4<U,V>&, const uint_x4<U,V>&);
0396
0397 template<typename U, typename V>
0398 friend uint_x4<U,V> operator*(const uint_x4<U,V>&, V);
0399
0400 template<typename U, typename V>
0401 friend std::pair< uint_x4<U,V>,uint_x4<U,V> >
0402 divmod(const uint_x4<U,V>&, const uint_x4<U,V>&);
0403
0404 template<typename U, typename V>
0405 friend uint_x4<U,V> operator+(const uint_x4<U,V>&, const uint_x4<U,V>&);
0406
0407 template<typename U, typename V>
0408 friend uint_x4<U,V> operator-(const uint_x4<U,V>&, const uint_x4<U,V>&);
0409
0410 template<typename U, typename V>
0411 friend uint_x4<U,V> operator<<(const uint_x4<U,V>&, const bitcount_t shift);
0412
0413 template<typename U, typename V>
0414 friend uint_x4<U,V> operator>>(const uint_x4<U,V>&, const bitcount_t shift);
0415
0416 #if PCG_64BIT_SPECIALIZATIONS
0417 template<typename U>
0418 friend uint_x4<U,uint64_t> operator<<(const uint_x4<U,uint64_t>&, const bitcount_t shift);
0419
0420 template<typename U>
0421 friend uint_x4<U,uint64_t> operator>>(const uint_x4<U,uint64_t>&, const bitcount_t shift);
0422 #endif
0423
0424 template<typename U, typename V>
0425 friend uint_x4<U,V> operator&(const uint_x4<U,V>&, const uint_x4<U,V>&);
0426
0427 template<typename U, typename V>
0428 friend uint_x4<U,V> operator|(const uint_x4<U,V>&, const uint_x4<U,V>&);
0429
0430 template<typename U, typename V>
0431 friend uint_x4<U,V> operator^(const uint_x4<U,V>&, const uint_x4<U,V>&);
0432
0433 template<typename U, typename V>
0434 friend bool operator==(const uint_x4<U,V>&, const uint_x4<U,V>&);
0435
0436 template<typename U, typename V>
0437 friend bool operator!=(const uint_x4<U,V>&, const uint_x4<U,V>&);
0438
0439 template<typename U, typename V>
0440 friend bool operator<(const uint_x4<U,V>&, const uint_x4<U,V>&);
0441
0442 template<typename U, typename V>
0443 friend bool operator<=(const uint_x4<U,V>&, const uint_x4<U,V>&);
0444
0445 template<typename U, typename V>
0446 friend bool operator>(const uint_x4<U,V>&, const uint_x4<U,V>&);
0447
0448 template<typename U, typename V>
0449 friend bool operator>=(const uint_x4<U,V>&, const uint_x4<U,V>&);
0450
0451 template<typename U, typename V>
0452 friend uint_x4<U,V> operator~(const uint_x4<U,V>&);
0453
0454 template<typename U, typename V>
0455 friend uint_x4<U,V> operator-(const uint_x4<U,V>&);
0456
0457 template<typename U, typename V>
0458 friend bitcount_t flog2(const uint_x4<U,V>&);
0459
0460 template<typename U, typename V>
0461 friend bitcount_t trailingzeros(const uint_x4<U,V>&);
0462
0463 #if PCG_64BIT_SPECIALIZATIONS
0464 template<typename U>
0465 friend bitcount_t flog2(const uint_x4<U,uint64_t>&);
0466
0467 template<typename U>
0468 friend bitcount_t trailingzeros(const uint_x4<U,uint64_t>&);
0469 #endif
0470
0471 uint_x4& operator*=(const uint_x4& rhs)
0472 {
0473 uint_x4 result = *this * rhs;
0474 return *this = result;
0475 }
0476
0477 uint_x4& operator*=(UIntX2 rhs)
0478 {
0479 uint_x4 result = *this * rhs;
0480 return *this = result;
0481 }
0482
0483 uint_x4& operator/=(const uint_x4& rhs)
0484 {
0485 uint_x4 result = *this / rhs;
0486 return *this = result;
0487 }
0488
0489 uint_x4& operator%=(const uint_x4& rhs)
0490 {
0491 uint_x4 result = *this % rhs;
0492 return *this = result;
0493 }
0494
0495 uint_x4& operator+=(const uint_x4& rhs)
0496 {
0497 uint_x4 result = *this + rhs;
0498 return *this = result;
0499 }
0500
0501 uint_x4& operator-=(const uint_x4& rhs)
0502 {
0503 uint_x4 result = *this - rhs;
0504 return *this = result;
0505 }
0506
0507 uint_x4& operator&=(const uint_x4& rhs)
0508 {
0509 uint_x4 result = *this & rhs;
0510 return *this = result;
0511 }
0512
0513 uint_x4& operator|=(const uint_x4& rhs)
0514 {
0515 uint_x4 result = *this | rhs;
0516 return *this = result;
0517 }
0518
0519 uint_x4& operator^=(const uint_x4& rhs)
0520 {
0521 uint_x4 result = *this ^ rhs;
0522 return *this = result;
0523 }
0524
0525 uint_x4& operator>>=(bitcount_t shift)
0526 {
0527 uint_x4 result = *this >> shift;
0528 return *this = result;
0529 }
0530
0531 uint_x4& operator<<=(bitcount_t shift)
0532 {
0533 uint_x4 result = *this << shift;
0534 return *this = result;
0535 }
0536
0537 };
0538
0539 template<typename U, typename V>
0540 bitcount_t flog2(const uint_x4<U,V>& v)
0541 {
0542 #if PCG_LITTLE_ENDIAN
0543 for (uint8_t i = 4; i !=0; ) {
0544 --i;
0545 #else
0546 for (uint8_t i = 0; i < 4; ++i) {
0547 #endif
0548 if (v.wa[i] == 0)
0549 continue;
0550 return flog2(v.wa[i]) + uint_x4<U,V>::UINT_BITS*i;
0551 }
0552 abort();
0553 }
0554
0555 template<typename U, typename V>
0556 bitcount_t trailingzeros(const uint_x4<U,V>& v)
0557 {
0558 #if PCG_LITTLE_ENDIAN
0559 for (uint8_t i = 0; i < 4; ++i) {
0560 #else
0561 for (uint8_t i = 4; i !=0; ) {
0562 --i;
0563 #endif
0564 if (v.wa[i] != 0)
0565 return trailingzeros(v.wa[i]) + uint_x4<U,V>::UINT_BITS*i;
0566 }
0567 return uint_x4<U,V>::UINT_BITS*4;
0568 }
0569
0570 #if PCG_64BIT_SPECIALIZATIONS
0571 template<typename UInt32>
0572 bitcount_t flog2(const uint_x4<UInt32,uint64_t>& v)
0573 {
0574 return v.d.v23 > 0 ? flog2(v.d.v23) + uint_x4<UInt32,uint64_t>::UINT_BITS*2
0575 : flog2(v.d.v01);
0576 }
0577
0578 template<typename UInt32>
0579 bitcount_t trailingzeros(const uint_x4<UInt32,uint64_t>& v)
0580 {
0581 return v.d.v01 == 0 ? trailingzeros(v.d.v23) + uint_x4<UInt32,uint64_t>::UINT_BITS*2
0582 : trailingzeros(v.d.v01);
0583 }
0584 #endif
0585
0586 template <typename UInt, typename UIntX2>
0587 std::pair< uint_x4<UInt,UIntX2>, uint_x4<UInt,UIntX2> >
0588 divmod(const uint_x4<UInt,UIntX2>& orig_dividend,
0589 const uint_x4<UInt,UIntX2>& divisor)
0590 {
0591
0592
0593
0594
0595 if (orig_dividend < divisor)
0596 return { uint_x4<UInt,UIntX2>(UIntX2(0)), orig_dividend };
0597
0598 auto dividend = orig_dividend;
0599
0600 auto log2_divisor = flog2(divisor);
0601 auto log2_dividend = flog2(dividend);
0602
0603 bitcount_t logdiff = log2_dividend - log2_divisor;
0604
0605 constexpr uint_x4<UInt,UIntX2> ONE(UIntX2(1));
0606 if (logdiff == 0)
0607 return { ONE, dividend - divisor };
0608
0609
0610
0611
0612 logdiff -= 1;
0613
0614 uint_x4<UInt,UIntX2> quotient(UIntX2(0));
0615
0616 auto qfactor = ONE << logdiff;
0617 auto factor = divisor << logdiff;
0618
0619 do {
0620 dividend -= factor;
0621 quotient += qfactor;
0622 while (dividend < factor) {
0623 factor >>= 1;
0624 qfactor >>= 1;
0625 }
0626 } while (dividend >= divisor);
0627
0628 return { quotient, dividend };
0629 }
0630
0631 template <typename UInt, typename UIntX2>
0632 uint_x4<UInt,UIntX2> operator/(const uint_x4<UInt,UIntX2>& dividend,
0633 const uint_x4<UInt,UIntX2>& divisor)
0634 {
0635 return divmod(dividend, divisor).first;
0636 }
0637
0638 template <typename UInt, typename UIntX2>
0639 uint_x4<UInt,UIntX2> operator%(const uint_x4<UInt,UIntX2>& dividend,
0640 const uint_x4<UInt,UIntX2>& divisor)
0641 {
0642 return divmod(dividend, divisor).second;
0643 }
0644
0645
0646 template <typename UInt, typename UIntX2>
0647 uint_x4<UInt,UIntX2> operator*(const uint_x4<UInt,UIntX2>& a,
0648 const uint_x4<UInt,UIntX2>& b)
0649 {
0650 constexpr auto UINT_BITS = uint_x4<UInt,UIntX2>::UINT_BITS;
0651 uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0652 bool carryin = false;
0653 bool carryout;
0654 UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(b.w.v0);
0655 r.w.v0 = UInt(a0b0);
0656 r.w.v1 = UInt(a0b0 >> UINT_BITS);
0657
0658 UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(b.w.v0);
0659 r.w.v2 = UInt(a1b0 >> UINT_BITS);
0660 r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout);
0661 carryin = carryout;
0662 r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
0663 carryin = carryout;
0664 r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0665
0666 UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b.w.v1);
0667 carryin = false;
0668 r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> UINT_BITS), carryin, &carryout);
0669 carryin = carryout;
0670 r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0671
0672 carryin = false;
0673 r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout);
0674 carryin = carryout;
0675 r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
0676 carryin = carryout;
0677 r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0678
0679 UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b.w.v1);
0680 carryin = false;
0681 r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout);
0682 carryin = carryout;
0683 r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> UINT_BITS), carryin, &carryout);
0684
0685 r.d.v23 += a.d.v01 * b.d.v23 + a.d.v23 * b.d.v01;
0686
0687 return r;
0688 }
0689
0690
0691 template <typename UInt, typename UIntX2>
0692 uint_x4<UInt,UIntX2> operator*(const uint_x4<UInt,UIntX2>& a,
0693 UIntX2 b01)
0694 {
0695 constexpr auto UINT_BITS = uint_x4<UInt,UIntX2>::UINT_BITS;
0696 uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0697 bool carryin = false;
0698 bool carryout;
0699 UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(UInt(b01));
0700 r.w.v0 = UInt(a0b0);
0701 r.w.v1 = UInt(a0b0 >> UINT_BITS);
0702
0703 UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(UInt(b01));
0704 r.w.v2 = UInt(a1b0 >> UINT_BITS);
0705 r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout);
0706 carryin = carryout;
0707 r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
0708 carryin = carryout;
0709 r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0710
0711 UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b01 >> UINT_BITS);
0712 carryin = false;
0713 r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> UINT_BITS), carryin, &carryout);
0714 carryin = carryout;
0715 r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0716
0717 carryin = false;
0718 r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout);
0719 carryin = carryout;
0720 r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
0721 carryin = carryout;
0722 r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
0723
0724 UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b01 >> UINT_BITS);
0725 carryin = false;
0726 r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout);
0727 carryin = carryout;
0728 r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> UINT_BITS), carryin, &carryout);
0729
0730 r.d.v23 += a.d.v23 * b01;
0731
0732 return r;
0733 }
0734
0735 #if PCG_64BIT_SPECIALIZATIONS
0736 #if defined(_MSC_VER)
0737 #pragma intrinsic(_umul128)
0738 #endif
0739
0740 #if defined(_MSC_VER) || __SIZEOF_INT128__
0741 template <typename UInt32>
0742 uint_x4<UInt32,uint64_t> operator*(const uint_x4<UInt32,uint64_t>& a,
0743 const uint_x4<UInt32,uint64_t>& b)
0744 {
0745 #if defined(_MSC_VER)
0746 uint64_t hi;
0747 uint64_t lo = _umul128(a.d.v01, b.d.v01, &hi);
0748 #else
0749 __uint128_t r = __uint128_t(a.d.v01) * __uint128_t(b.d.v01);
0750 uint64_t lo = uint64_t(r);
0751 uint64_t hi = r >> 64;
0752 #endif
0753 hi += a.d.v23 * b.d.v01 + a.d.v01 * b.d.v23;
0754 return {hi, lo};
0755 }
0756 #endif
0757 #endif
0758
0759
0760 template <typename UInt, typename UIntX2>
0761 uint_x4<UInt,UIntX2> operator+(const uint_x4<UInt,UIntX2>& a,
0762 const uint_x4<UInt,UIntX2>& b)
0763 {
0764 uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0765
0766 bool carryin = false;
0767 bool carryout;
0768 r.w.v0 = addwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
0769 carryin = carryout;
0770 r.w.v1 = addwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
0771 carryin = carryout;
0772 r.w.v2 = addwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
0773 carryin = carryout;
0774 r.w.v3 = addwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
0775
0776 return r;
0777 }
0778
0779 template <typename UInt, typename UIntX2>
0780 uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& a,
0781 const uint_x4<UInt,UIntX2>& b)
0782 {
0783 uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0784
0785 bool carryin = false;
0786 bool carryout;
0787 r.w.v0 = subwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
0788 carryin = carryout;
0789 r.w.v1 = subwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
0790 carryin = carryout;
0791 r.w.v2 = subwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
0792 carryin = carryout;
0793 r.w.v3 = subwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
0794
0795 return r;
0796 }
0797
0798 #if PCG_64BIT_SPECIALIZATIONS
0799 template <typename UInt32>
0800 uint_x4<UInt32,uint64_t> operator+(const uint_x4<UInt32,uint64_t>& a,
0801 const uint_x4<UInt32,uint64_t>& b)
0802 {
0803 uint_x4<UInt32,uint64_t> r = {uint64_t(0u), uint64_t(0u)};
0804
0805 bool carryin = false;
0806 bool carryout;
0807 r.d.v01 = addwithcarry(a.d.v01, b.d.v01, carryin, &carryout);
0808 carryin = carryout;
0809 r.d.v23 = addwithcarry(a.d.v23, b.d.v23, carryin, &carryout);
0810
0811 return r;
0812 }
0813
0814 template <typename UInt32>
0815 uint_x4<UInt32,uint64_t> operator-(const uint_x4<UInt32,uint64_t>& a,
0816 const uint_x4<UInt32,uint64_t>& b)
0817 {
0818 uint_x4<UInt32,uint64_t> r = {uint64_t(0u), uint64_t(0u)};
0819
0820 bool carryin = false;
0821 bool carryout;
0822 r.d.v01 = subwithcarry(a.d.v01, b.d.v01, carryin, &carryout);
0823 carryin = carryout;
0824 r.d.v23 = subwithcarry(a.d.v23, b.d.v23, carryin, &carryout);
0825
0826 return r;
0827 }
0828 #endif
0829
0830 template <typename UInt, typename UIntX2>
0831 uint_x4<UInt,UIntX2> operator&(const uint_x4<UInt,UIntX2>& a,
0832 const uint_x4<UInt,UIntX2>& b)
0833 {
0834 return uint_x4<UInt,UIntX2>(a.d.v23 & b.d.v23, a.d.v01 & b.d.v01);
0835 }
0836
0837 template <typename UInt, typename UIntX2>
0838 uint_x4<UInt,UIntX2> operator|(const uint_x4<UInt,UIntX2>& a,
0839 const uint_x4<UInt,UIntX2>& b)
0840 {
0841 return uint_x4<UInt,UIntX2>(a.d.v23 | b.d.v23, a.d.v01 | b.d.v01);
0842 }
0843
0844 template <typename UInt, typename UIntX2>
0845 uint_x4<UInt,UIntX2> operator^(const uint_x4<UInt,UIntX2>& a,
0846 const uint_x4<UInt,UIntX2>& b)
0847 {
0848 return uint_x4<UInt,UIntX2>(a.d.v23 ^ b.d.v23, a.d.v01 ^ b.d.v01);
0849 }
0850
0851 template <typename UInt, typename UIntX2>
0852 uint_x4<UInt,UIntX2> operator~(const uint_x4<UInt,UIntX2>& v)
0853 {
0854 return uint_x4<UInt,UIntX2>(~v.d.v23, ~v.d.v01);
0855 }
0856
0857 template <typename UInt, typename UIntX2>
0858 uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& v)
0859 {
0860 return uint_x4<UInt,UIntX2>(0UL,0UL) - v;
0861 }
0862
0863 template <typename UInt, typename UIntX2>
0864 bool operator==(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0865 {
0866 return (a.d.v01 == b.d.v01) && (a.d.v23 == b.d.v23);
0867 }
0868
0869 template <typename UInt, typename UIntX2>
0870 bool operator!=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0871 {
0872 return !operator==(a,b);
0873 }
0874
0875
0876 template <typename UInt, typename UIntX2>
0877 bool operator<(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0878 {
0879 return (a.d.v23 < b.d.v23)
0880 || ((a.d.v23 == b.d.v23) && (a.d.v01 < b.d.v01));
0881 }
0882
0883 template <typename UInt, typename UIntX2>
0884 bool operator>(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0885 {
0886 return operator<(b,a);
0887 }
0888
0889 template <typename UInt, typename UIntX2>
0890 bool operator<=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0891 {
0892 return !(operator<(b,a));
0893 }
0894
0895 template <typename UInt, typename UIntX2>
0896 bool operator>=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
0897 {
0898 return !(operator<(a,b));
0899 }
0900
0901
0902
0903 template <typename UInt, typename UIntX2>
0904 uint_x4<UInt,UIntX2> operator<<(const uint_x4<UInt,UIntX2>& v,
0905 const bitcount_t shift)
0906 {
0907 uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0908 const bitcount_t bits = uint_x4<UInt,UIntX2>::UINT_BITS;
0909 const bitcount_t bitmask = bits - 1;
0910 const bitcount_t shiftdiv = shift / bits;
0911 const bitcount_t shiftmod = shift & bitmask;
0912
0913 if (shiftmod) {
0914 UInt carryover = 0;
0915 #if PCG_LITTLE_ENDIAN
0916 for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
0917 #else
0918 for (uint8_t out = 4-shiftdiv, in = 4; out != 0; ) {
0919 --out, --in;
0920 #endif
0921 r.wa[out] = (v.wa[in] << shiftmod) | carryover;
0922 carryover = (v.wa[in] >> (bits - shiftmod));
0923 }
0924 } else {
0925 #if PCG_LITTLE_ENDIAN
0926 for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
0927 #else
0928 for (uint8_t out = 4-shiftdiv, in = 4; out != 0; ) {
0929 --out, --in;
0930 #endif
0931 r.wa[out] = v.wa[in];
0932 }
0933 }
0934
0935 return r;
0936 }
0937
0938 template <typename UInt, typename UIntX2>
0939 uint_x4<UInt,UIntX2> operator>>(const uint_x4<UInt,UIntX2>& v,
0940 const bitcount_t shift)
0941 {
0942 uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
0943 const bitcount_t bits = uint_x4<UInt,UIntX2>::UINT_BITS;
0944 const bitcount_t bitmask = bits - 1;
0945 const bitcount_t shiftdiv = shift / bits;
0946 const bitcount_t shiftmod = shift & bitmask;
0947
0948 if (shiftmod) {
0949 UInt carryover = 0;
0950 #if PCG_LITTLE_ENDIAN
0951 for (uint8_t out = 4-shiftdiv, in = 4; out != 0; ) {
0952 --out, --in;
0953 #else
0954 for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
0955 #endif
0956 r.wa[out] = (v.wa[in] >> shiftmod) | carryover;
0957 carryover = (v.wa[in] << (bits - shiftmod));
0958 }
0959 } else {
0960 #if PCG_LITTLE_ENDIAN
0961 for (uint8_t out = 4-shiftdiv, in = 4; out != 0; ) {
0962 --out, --in;
0963 #else
0964 for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
0965 #endif
0966 r.wa[out] = v.wa[in];
0967 }
0968 }
0969
0970 return r;
0971 }
0972
0973 #if PCG_64BIT_SPECIALIZATIONS
0974 template <typename UInt32>
0975 uint_x4<UInt32,uint64_t> operator<<(const uint_x4<UInt32,uint64_t>& v,
0976 const bitcount_t shift)
0977 {
0978 constexpr bitcount_t bits2 = uint_x4<UInt32,uint64_t>::UINT_BITS * 2;
0979
0980 if (shift >= bits2) {
0981 return {v.d.v01 << (shift-bits2), uint64_t(0u)};
0982 } else {
0983 return {shift ? (v.d.v23 << shift) | (v.d.v01 >> (bits2-shift))
0984 : v.d.v23,
0985 v.d.v01 << shift};
0986 }
0987 }
0988
0989 template <typename UInt32>
0990 uint_x4<UInt32,uint64_t> operator>>(const uint_x4<UInt32,uint64_t>& v,
0991 const bitcount_t shift)
0992 {
0993 constexpr bitcount_t bits2 = uint_x4<UInt32,uint64_t>::UINT_BITS * 2;
0994
0995 if (shift >= bits2) {
0996 return {uint64_t(0u), v.d.v23 >> (shift-bits2)};
0997 } else {
0998 return {v.d.v23 >> shift,
0999 shift ? (v.d.v01 >> shift) | (v.d.v23 << (bits2-shift))
1000 : v.d.v01};
1001 }
1002 }
1003 #endif
1004
1005 }
1006 }
1007
1008 #endif