/* File indexing completed on 2025-01-17 09:55:23 */
0001 #pragma once
0002 #ifndef FXDIV_H
0003 #define FXDIV_H
0004
0005 #if defined(__cplusplus) && (__cplusplus >= 201103L)
0006 #include <cstddef>
0007 #include <cstdint>
0008 #include <climits>
0009 #elif !defined(__OPENCL_VERSION__)
0010 #include <stddef.h>
0011 #include <stdint.h>
0012 #include <limits.h>
0013 #endif
0014
0015 #if defined(_MSC_VER)
0016 #include <intrin.h>
0017 #if defined(_M_IX86) || defined(_M_X64)
0018 #include <immintrin.h>
0019 #endif
0020 #endif
0021
0022 #ifndef FXDIV_USE_INLINE_ASSEMBLY
0023 #define FXDIV_USE_INLINE_ASSEMBLY 0
0024 #endif
0025
/*
 * Widening multiply: returns the full 64-bit product of two 32-bit operands.
 */
static inline uint64_t fxdiv_mulext_uint32_t(uint32_t a, uint32_t b) {
#if defined(_MSC_VER) && defined(_M_IX86)
	/* 32-bit MSVC: __emulu emits a single widening MUL. */
	return (uint64_t) __emulu((unsigned int) a, (unsigned int) b);
#else
	const uint64_t wide_a = (uint64_t) a;
	const uint64_t wide_b = (uint64_t) b;
	return wide_a * wide_b;
#endif
}
0033
/*
 * Returns the high 32 bits of the 64-bit product a * b.
 * Dispatches to a platform intrinsic where one exists; otherwise widens
 * both operands to 64 bits, multiplies, and takes the upper half.
 */
static inline uint32_t fxdiv_mulhi_uint32_t(uint32_t a, uint32_t b) {
#if defined(__OPENCL_VERSION__)
	return mul_hi(a, b);
#elif defined(__CUDA_ARCH__)
	return (uint32_t) __umulhi((unsigned int) a, (unsigned int) b);
#elif defined(_MSC_VER) && defined(_M_IX86)
	return (uint32_t) (__emulu((unsigned int) a, (unsigned int) b) >> 32);
#elif defined(_MSC_VER) && defined(_M_ARM)
	return (uint32_t) _MulUnsignedHigh((unsigned long) a, (unsigned long) b);
#else
	/* Portable path: a full 64-bit product followed by a 32-bit shift. */
	return (uint32_t) (((uint64_t) a * (uint64_t) b) >> 32);
#endif
}
0047
/*
 * Returns the high 64 bits of the 128-bit product a * b.
 * Uses a native 64x64->128 multiply where available; the portable
 * fallback composes the result from four 32x32->64 partial products.
 */
static inline uint64_t fxdiv_mulhi_uint64_t(uint64_t a, uint64_t b) {
#if defined(__OPENCL_VERSION__)
	return mul_hi(a, b);
#elif defined(__CUDA_ARCH__)
	return (uint64_t) __umul64hi((unsigned long long) a, (unsigned long long) b);
#elif defined(_MSC_VER) && defined(_M_X64)
	return (uint64_t) __umulh((unsigned __int64) a, (unsigned __int64) b);
#elif defined(__GNUC__) && defined(__SIZEOF_INT128__)
	return (uint64_t) (((((unsigned __int128) a) * ((unsigned __int128) b))) >> 64);
#else
	/* Portable fallback: schoolbook multiplication on 32-bit halves. */
	const uint32_t a_lo = (uint32_t) a;
	const uint32_t a_hi = (uint32_t) (a >> 32);
	const uint32_t b_lo = (uint32_t) b;
	const uint32_t b_hi = (uint32_t) (b >> 32);

	/* t = a_hi*b_lo plus the carry out of the low 64 bits of the product
	 * (the high half of a_lo*b_lo); this sum cannot overflow 64 bits. */
	const uint64_t t = fxdiv_mulext_uint32_t(a_hi, b_lo) +
		(uint64_t) fxdiv_mulhi_uint32_t(a_lo, b_lo);
	/* High half = a_hi*b_hi + high half of t + the carry produced by adding
	 * the two middle partial products (a_lo*b_hi and the low half of t). */
	return fxdiv_mulext_uint32_t(a_hi, b_hi) + (t >> 32) +
		((fxdiv_mulext_uint32_t(a_lo, b_hi) + (uint64_t) (uint32_t) t) >> 32);
#endif
}
0069
/*
 * Returns the high half of the size_t x size_t product by delegating to
 * the helper that matches the platform's size_t width.
 */
static inline size_t fxdiv_mulhi_size_t(size_t a, size_t b) {
#if SIZE_MAX == UINT64_MAX
	return (size_t) fxdiv_mulhi_uint64_t((uint64_t) a, (uint64_t) b);
#elif SIZE_MAX == UINT32_MAX
	return (size_t) fxdiv_mulhi_uint32_t((uint32_t) a, (uint32_t) b);
#else
#error Unsupported platform
#endif
}
0079
/*
 * Precomputed constants for dividing by a fixed uint32_t value:
 * the original divisor, the magic multiplier m, and the two
 * post-multiply shift amounts s1/s2 used by fxdiv_quotient_uint32_t.
 */
struct fxdiv_divisor_uint32_t {
	uint32_t value;
	uint32_t m;
	uint8_t s1;
	uint8_t s2;
};

/* Quotient/remainder pair produced by fxdiv_divide_uint32_t. */
struct fxdiv_result_uint32_t {
	uint32_t quotient;
	uint32_t remainder;
};
0091
/*
 * Precomputed constants for dividing by a fixed uint64_t value:
 * the original divisor, the magic multiplier m, and the two
 * post-multiply shift amounts s1/s2 used by fxdiv_quotient_uint64_t.
 */
struct fxdiv_divisor_uint64_t {
	uint64_t value;
	uint64_t m;
	uint8_t s1;
	uint8_t s2;
};

/* Quotient/remainder pair produced by fxdiv_divide_uint64_t. */
struct fxdiv_result_uint64_t {
	uint64_t quotient;
	uint64_t remainder;
};
0103
/*
 * Precomputed constants for dividing by a fixed size_t value; mirrors the
 * fixed-width divisor structs and is used by fxdiv_quotient_size_t.
 */
struct fxdiv_divisor_size_t {
	size_t value;
	size_t m;
	uint8_t s1;
	uint8_t s2;
};

/* Quotient/remainder pair produced by fxdiv_divide_size_t. */
struct fxdiv_result_size_t {
	size_t quotient;
	size_t remainder;
};
0115
/*
 * Precomputes the magic constants for branch-free division by d, in the
 * style of Granlund & Montgomery ("Division by Invariant Integers using
 * Multiplication"): the quotient is later recovered by a high multiply
 * and two shifts in fxdiv_quotient_uint32_t.
 * NOTE(review): d == 0 is never checked; presumably the caller guarantees
 * d >= 1 — behavior for d == 0 is undefined.
 */
static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) {
	struct fxdiv_divisor_uint32_t result = { d };
	if (d == 1) {
		/* Division by 1: multiply by 1 and do not shift. */
		result.m = UINT32_C(1);
		result.s1 = 0;
		result.s2 = 0;
	} else {
		/* l_minus_1 = floor(log2(d - 1)) = ceil(log2(d)) - 1, i.e. the bit
		 * index of the highest set bit of d - 1 (d >= 2 here, so d - 1 != 0). */
#if defined(__OPENCL_VERSION__)
		const uint32_t l_minus_1 = 31 - clz(d - 1);
#elif defined(__CUDA_ARCH__)
		const uint32_t l_minus_1 = 31 - __clz((int) (d - 1));
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64))
		unsigned long l_minus_1;
		_BitScanReverse(&l_minus_1, (unsigned long) (d - 1));
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && FXDIV_USE_INLINE_ASSEMBLY
		uint32_t l_minus_1;
		__asm__("BSRL %[d_minus_1], %[l_minus_1]"
			: [l_minus_1] "=r" (l_minus_1)
			: [d_minus_1] "r" (d - 1)
			: "cc");
#elif defined(__GNUC__)
		const uint32_t l_minus_1 = 31 - __builtin_clz(d - 1);
#else
		/* Generic fallback: binary search for the position of the highest
		 * set bit of d - 1, narrowing 16/8/4/2/1 bits at a time. */
		uint32_t l_minus_1 = 0;
		uint32_t x = d - 1;
		uint32_t y = x >> 16;
		if (y != 0) {
			l_minus_1 += 16;
			x = y;
		}
		y = x >> 8;
		if (y != 0) {
			l_minus_1 += 8;
			x = y;
		}
		y = x >> 4;
		if (y != 0) {
			l_minus_1 += 4;
			x = y;
		}
		y = x >> 2;
		if (y != 0) {
			l_minus_1 += 2;
			x = y;
		}
		if ((x & 2) != 0) {
			l_minus_1 += 1;
		}
#endif
		/* u_hi = 2^l - d, where l = l_minus_1 + 1 = ceil(log2(d)). */
		uint32_t u_hi = (UINT32_C(2) << (uint32_t) l_minus_1) - d;

		/* q = floor((u_hi * 2^32) / d): divide the 64-bit value u_hi:0 by d.
		 * Since u_hi < d, the quotient fits in 32 bits. */
#if defined(__GNUC__) && defined(__i386__) && FXDIV_USE_INLINE_ASSEMBLY
		uint32_t q;
		__asm__("DIVL %[d]"
			: "=a" (q), "+d" (u_hi)
			: [d] "r" (d), "a" (0)
			: "cc");
#elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (defined(_M_IX86) || defined(_M_X64))
		/* _udiv64 (VS 2019+) performs the 64-by-32 division directly. */
		unsigned int remainder;
		const uint32_t q = (uint32_t) _udiv64((unsigned __int64) ((uint64_t) u_hi << 32), (unsigned int) d, &remainder);
#else
		const uint32_t q = ((uint64_t) u_hi << 32) / d;
#endif

		/* m = q + 1; with s1 = 1 and s2 = l - 1, fxdiv_quotient_uint32_t
		 * computes (t + ((n - t) >> 1)) >> (l - 1) where t = mulhi(n, m). */
		result.m = q + UINT32_C(1);
		result.s1 = 1;
		result.s2 = (uint8_t) l_minus_1;
	}
	return result;
}
0189
/*
 * Precomputes the magic constants for branch-free 64-bit division by d;
 * the same Granlund-Montgomery scheme as fxdiv_init_uint32_t, widened
 * to 64 bits.
 * NOTE(review): d == 0 is never checked; presumably the caller guarantees
 * d >= 1 — behavior for d == 0 is undefined.
 */
static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
	struct fxdiv_divisor_uint64_t result = { d };
	if (d == 1) {
		/* Division by 1: multiply by 1 and do not shift. */
		result.m = UINT64_C(1);
		result.s1 = 0;
		result.s2 = 0;
	} else {
		/*
		 * l_minus_1 = floor(log2(d - 1)) = ceil(log2(d)) - 1.
		 * nlz_d = leading-zero count of d; it is only consumed by the
		 * portable long-division fallback below, so branches that cannot
		 * reach that fallback do not compute it.
		 */
#if defined(__OPENCL_VERSION__)
		const uint32_t nlz_d = clz(d);
		const uint32_t l_minus_1 = 63 - clz(d - 1);
#elif defined(__CUDA_ARCH__)
		const uint32_t nlz_d = __clzll((long long) d);
		const uint32_t l_minus_1 = 63 - __clzll((long long) (d - 1));
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
		unsigned long l_minus_1;
		_BitScanReverse64(&l_minus_1, (unsigned __int64) (d - 1));
		unsigned long bsr_d;
		_BitScanReverse64(&bsr_d, (unsigned __int64) d);
		/* BSR yields the index of the highest set bit; XOR with 63 (0x3F)
		 * converts that index into a leading-zero count. */
		const uint32_t nlz_d = bsr_d ^ 0x3F;
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_ARM))
		/* 32-bit MSVC targets: scan the two 32-bit halves of d - 1. */
		const uint64_t d_minus_1 = d - 1;
		const uint8_t d_is_power_of_2 = (d & d_minus_1) == 0;
		unsigned long l_minus_1;
		if ((uint32_t) (d_minus_1 >> 32) == 0) {
			_BitScanReverse(&l_minus_1, (unsigned long) d_minus_1);
		} else {
			_BitScanReverse(&l_minus_1, (unsigned long) (uint32_t) (d_minus_1 >> 32));
			l_minus_1 += 32;
		}
		/* floor(log2(d)) equals l_minus_1 except when d is a power of 2
		 * (then it is l_minus_1 + 1), hence the correction term. */
		const uint32_t nlz_d = ((uint8_t) l_minus_1 ^ UINT8_C(0x3F)) - d_is_power_of_2;
#elif defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
		uint64_t l_minus_1;
		__asm__("BSRQ %[d_minus_1], %[l_minus_1]"
			: [l_minus_1] "=r" (l_minus_1)
			: [d_minus_1] "r" (d - 1)
			: "cc");
#elif defined(__GNUC__)
		const uint32_t l_minus_1 = 63 - __builtin_clzll(d - 1);
		const uint32_t nlz_d = __builtin_clzll(d);
#else
		/* Generic fallback: binary search for the position of the highest
		 * set bit of d - 1, narrowing 32/16/8/4/2/1 bits at a time. */
		const uint64_t d_minus_1 = d - 1;
		const uint32_t d_is_power_of_2 = (d & d_minus_1) == 0;
		uint32_t l_minus_1 = 0;
		uint32_t x = (uint32_t) d_minus_1;
		uint32_t y = d_minus_1 >> 32;
		if (y != 0) {
			l_minus_1 += 32;
			x = y;
		}
		y = x >> 16;
		if (y != 0) {
			l_minus_1 += 16;
			x = y;
		}
		y = x >> 8;
		if (y != 0) {
			l_minus_1 += 8;
			x = y;
		}
		y = x >> 4;
		if (y != 0) {
			l_minus_1 += 4;
			x = y;
		}
		y = x >> 2;
		if (y != 0) {
			l_minus_1 += 2;
			x = y;
		}
		if ((x & 2) != 0) {
			l_minus_1 += 1;
		}
		/* Same power-of-2 correction as the 32-bit MSVC branch above. */
		const uint32_t nlz_d = (l_minus_1 ^ UINT32_C(0x3F)) - d_is_power_of_2;
#endif
		/* u_hi = 2^l - d, where l = l_minus_1 + 1 = ceil(log2(d)). */
		uint64_t u_hi = (UINT64_C(2) << (uint32_t) l_minus_1) - d;

		/* q = floor((u_hi * 2^64) / d): divide the 128-bit value u_hi:0 by d.
		 * Since u_hi < d, the quotient fits in 64 bits. */
#if defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
		uint64_t q;
		__asm__("DIVQ %[d]"
			: "=a" (q), "+d" (u_hi)
			: [d] "r" (d), "a" (UINT64_C(0))
			: "cc");
#elif 0 && defined(__GNUC__) && defined(__SIZEOF_INT128__)
		/* Intentionally disabled (note the leading 0): presumably because
		 * compilers lower 128-by-64 division to a library call (__udivti3)
		 * that is slower than the manual long division below — confirm
		 * before re-enabling. */
		const uint64_t q = (uint64_t) (((unsigned __int128) u_hi << 64) / ((unsigned __int128) d));
#elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && defined(_M_X64)
		/* _udiv128 (VS 2019+) performs the 128-by-64 division directly. */
		unsigned __int64 remainder;
		const uint64_t q = (uint64_t) _udiv128((unsigned __int64) u_hi, 0, (unsigned __int64) d, &remainder);
#else
		/*
		 * Portable 128-by-64 long division (cf. Hacker's Delight "divlu"):
		 * normalize so the divisor's top bit is set, then produce the
		 * quotient as two 32-bit digits q1:q0, each with a downward
		 * correction loop on the estimate.
		 */
		d <<= nlz_d;
		u_hi <<= nlz_d;

		const uint64_t d_hi = (uint32_t) (d >> 32);
		const uint32_t d_lo = (uint32_t) d;

		/* First quotient-digit estimate and its partial remainder. */
		uint64_t q1 = u_hi / d_hi;
		uint64_t r1 = u_hi - q1 * d_hi;

		/* Correct the estimate downward while it is provably too large;
		 * stop once the remainder overflows 32 bits. */
		while ((q1 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q1, d_lo) > (r1 << 32)) {
			q1 -= 1;
			r1 += d_hi;
			if ((r1 >> 32) != 0) {
				break;
			}
		}

		/* Subtract q1*d and shift in the next 32 (implicitly zero) bits
		 * of the dividend. */
		u_hi = (u_hi << 32) - q1 * d;

		/* Second quotient-digit estimate and its partial remainder. */
		uint64_t q0 = u_hi / d_hi;
		uint64_t r0 = u_hi - q0 * d_hi;

		/* Same downward correction as for the first digit. */
		while ((q0 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q0, d_lo) > (r0 << 32)) {
			q0 -= 1;
			r0 += d_hi;
			if ((r0 >> 32) != 0) {
				break;
			}
		}
		const uint64_t q = (q1 << 32) | (uint32_t) q0;
#endif
		/* m = q + 1; with s1 = 1 and s2 = l - 1, fxdiv_quotient_uint64_t
		 * computes (t + ((n - t) >> 1)) >> (l - 1) where t = mulhi(n, m). */
		result.m = q + UINT64_C(1);
		result.s1 = 1;
		result.s2 = (uint8_t) l_minus_1;
	}
	return result;
}
0324
0325 static inline struct fxdiv_divisor_size_t fxdiv_init_size_t(size_t d) {
0326 #if SIZE_MAX == UINT32_MAX
0327 const struct fxdiv_divisor_uint32_t uint_result = fxdiv_init_uint32_t((uint32_t) d);
0328 #elif SIZE_MAX == UINT64_MAX
0329 const struct fxdiv_divisor_uint64_t uint_result = fxdiv_init_uint64_t((uint64_t) d);
0330 #else
0331 #error Unsupported platform
0332 #endif
0333 struct fxdiv_divisor_size_t size_result = {
0334 (size_t) uint_result.value,
0335 (size_t) uint_result.m,
0336 uint_result.s1,
0337 uint_result.s2
0338 };
0339 return size_result;
0340 }
0341
0342 static inline uint32_t fxdiv_quotient_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
0343 const uint32_t t = fxdiv_mulhi_uint32_t(n, divisor.m);
0344 return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
0345 }
0346
0347 static inline uint64_t fxdiv_quotient_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
0348 const uint64_t t = fxdiv_mulhi_uint64_t(n, divisor.m);
0349 return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
0350 }
0351
0352 static inline size_t fxdiv_quotient_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
0353 #if SIZE_MAX == UINT32_MAX
0354 const struct fxdiv_divisor_uint32_t uint32_divisor = {
0355 (uint32_t) divisor.value,
0356 (uint32_t) divisor.m,
0357 divisor.s1,
0358 divisor.s2
0359 };
0360 return fxdiv_quotient_uint32_t((uint32_t) n, uint32_divisor);
0361 #elif SIZE_MAX == UINT64_MAX
0362 const struct fxdiv_divisor_uint64_t uint64_divisor = {
0363 (uint64_t) divisor.value,
0364 (uint64_t) divisor.m,
0365 divisor.s1,
0366 divisor.s2
0367 };
0368 return fxdiv_quotient_uint64_t((uint64_t) n, uint64_divisor);
0369 #else
0370 #error Unsupported platform
0371 #endif
0372 }
0373
0374 static inline uint32_t fxdiv_remainder_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
0375 const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
0376 return n - quotient * divisor.value;
0377 }
0378
0379 static inline uint64_t fxdiv_remainder_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
0380 const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
0381 return n - quotient * divisor.value;
0382 }
0383
0384 static inline size_t fxdiv_remainder_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
0385 const size_t quotient = fxdiv_quotient_size_t(n, divisor);
0386 return n - quotient * divisor.value;
0387 }
0388
0389 static inline uint32_t fxdiv_round_down_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t granularity) {
0390 const uint32_t quotient = fxdiv_quotient_uint32_t(n, granularity);
0391 return quotient * granularity.value;
0392 }
0393
0394 static inline uint64_t fxdiv_round_down_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t granularity) {
0395 const uint64_t quotient = fxdiv_quotient_uint64_t(n, granularity);
0396 return quotient * granularity.value;
0397 }
0398
0399 static inline size_t fxdiv_round_down_size_t(size_t n, const struct fxdiv_divisor_size_t granularity) {
0400 const size_t quotient = fxdiv_quotient_size_t(n, granularity);
0401 return quotient * granularity.value;
0402 }
0403
0404 static inline struct fxdiv_result_uint32_t fxdiv_divide_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
0405 const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
0406 const uint32_t remainder = n - quotient * divisor.value;
0407 struct fxdiv_result_uint32_t result = { quotient, remainder };
0408 return result;
0409 }
0410
0411 static inline struct fxdiv_result_uint64_t fxdiv_divide_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
0412 const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
0413 const uint64_t remainder = n - quotient * divisor.value;
0414 struct fxdiv_result_uint64_t result = { quotient, remainder };
0415 return result;
0416 }
0417
0418 static inline struct fxdiv_result_size_t fxdiv_divide_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
0419 const size_t quotient = fxdiv_quotient_size_t(n, divisor);
0420 const size_t remainder = n - quotient * divisor.value;
0421 struct fxdiv_result_size_t result = { quotient, remainder };
0422 return result;
0423 }
0424
0425 #endif