File indexing completed on 2025-08-28 09:11:38
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012 #include <cmath>
0013 #include <cstdint>
0014 #include <cstring>
0015
0016 namespace xsimd
0017 {
0018 namespace detail
0019 {
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030 #if defined(_MSC_VER)
0031 #define ONCE0 \
0032 __pragma(warning(push)) \
0033 __pragma(warning(disable : 4127)) while (0) \
0034 __pragma(warning(pop))
0035 #else
0036 #define ONCE0 while (0)
0037 #endif
0038
0039
0040
0041
0042
0043
0044
0045
0046
0047
0048
0049
0050 #if defined(__GNUC__) && defined(__BYTE_ORDER__)
0051 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
0052 #define XSIMD_LITTLE_ENDIAN
0053 #endif
0054 #elif defined(_WIN32)
0055
0056 #define XSIMD_LITTLE_ENDIAN
0057 #elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
0058 #define XSIMD_LITTLE_ENDIAN
0059 #endif
0060
0061 #ifdef XSIMD_LITTLE_ENDIAN
0062 #define LOW_WORD_IDX 0
0063 #define HIGH_WORD_IDX sizeof(std::uint32_t)
0064 #else
0065 #define LOW_WORD_IDX sizeof(std::uint32_t)
0066 #define HIGH_WORD_IDX 0
0067 #endif
0068
0069 #define GET_HIGH_WORD(i, d) \
0070 do \
0071 { \
0072 double f = (d); \
0073 std::memcpy(&(i), reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
0074 sizeof(std::uint32_t)); \
0075 } \
0076 ONCE0 \
0077
0078
0079 #define GET_LOW_WORD(i, d) \
0080 do \
0081 { \
0082 double f = (d); \
0083 std::memcpy(&(i), reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
0084 sizeof(std::uint32_t)); \
0085 } \
0086 ONCE0 \
0087
0088
0089 #define SET_HIGH_WORD(d, v) \
0090 do \
0091 { \
0092 double f = (d); \
0093 std::uint32_t value = (v); \
0094 std::memcpy(reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
0095 &value, sizeof(std::uint32_t)); \
0096 (d) = f; \
0097 } \
0098 ONCE0 \
0099
0100
0101 #define SET_LOW_WORD(d, v) \
0102 do \
0103 { \
0104 double f = (d); \
0105 std::uint32_t value = (v); \
0106 std::memcpy(reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
0107 &value, sizeof(std::uint32_t)); \
0108 (d) = f; \
0109 } \
0110 ONCE0 \
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123
0124
0125
0126
0127
0128
0129
0130
0131
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162
0163
0164
0165
0166
0167
0168
0169
0170
0171
0172
0173
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187
0188
0189
0190
0191
0192
0193
0194
0195
0196
0197
0198
0199
0200
0201
0202
0203
0204
0205
0206
0207
0208
0209
0210
0211
0212
0213
0214
0215
0216
0217
0218
0219
0220 XSIMD_INLINE int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept
0221 {
0222 static const int32_t init_jk[] = { 2, 3, 4, 6 };
0223
0224 static const double PIo2[] = {
0225 1.57079625129699707031e+00,
0226 7.54978941586159635335e-08,
0227 5.39030252995776476554e-15,
0228 3.28200341580791294123e-22,
0229 1.27065575308067607349e-29,
0230 1.22933308981111328932e-36,
0231 2.73370053816464559624e-44,
0232 2.16741683877804819444e-51,
0233 };
0234
0235 static const double
0236 zero
0237 = 0.0,
0238 one = 1.0,
0239 two24 = 1.67772160000000000000e+07,
0240 twon24 = 5.96046447753906250000e-08;
0241
0242 int32_t jz, jx, jv, jp, jk, carry, n, iq[20], i, j, k, m, q0, ih;
0243 double z, fw, f[20], fq[20], q[20];
0244
0245
0246 jk = init_jk[prec];
0247 jp = jk;
0248
0249
0250 jx = nx - 1;
0251 jv = (e0 - 3) / 24;
0252 if (jv < 0)
0253 jv = 0;
0254 q0 = e0 - 24 * (jv + 1);
0255
0256
0257 j = jv - jx;
0258 m = jx + jk;
0259 for (i = 0; i <= m; i++, j++)
0260 f[i] = (j < 0) ? zero : (double)ipio2[j];
0261
0262
0263 for (i = 0; i <= jk; i++)
0264 {
0265 for (j = 0, fw = 0.0; j <= jx; j++)
0266 fw += x[j] * f[jx + i - j];
0267 q[i] = fw;
0268 }
0269
0270 jz = jk;
0271
0272 recompute:
0273
0274 for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--)
0275 {
0276 fw = (double)((int32_t)(twon24 * z));
0277 iq[i] = (int)(z - two24 * fw);
0278 z = q[j - 1] + fw;
0279 }
0280
0281
0282 z = std::scalbn(z, q0);
0283 z -= 8.0 * std::floor(z * 0.125);
0284 n = (int32_t)z;
0285 z -= (double)n;
0286 ih = 0;
0287 if (q0 > 0)
0288 {
0289 i = (iq[jz - 1] >> (24 - q0));
0290 n += i;
0291 iq[jz - 1] -= i << (24 - q0);
0292 ih = iq[jz - 1] >> (23 - q0);
0293 }
0294 else if (q0 == 0)
0295 ih = iq[jz - 1] >> 23;
0296 else if (z >= 0.5)
0297 ih = 2;
0298
0299 if (ih > 0)
0300 {
0301 n += 1;
0302 carry = 0;
0303 for (i = 0; i < jz; i++)
0304 {
0305 j = iq[i];
0306 if (carry == 0)
0307 {
0308 if (j != 0)
0309 {
0310 carry = 1;
0311 iq[i] = 0x1000000 - j;
0312 }
0313 }
0314 else
0315 iq[i] = 0xffffff - j;
0316 }
0317 if (q0 > 0)
0318 {
0319 switch (q0)
0320 {
0321 case 1:
0322 iq[jz - 1] &= 0x7fffff;
0323 break;
0324 case 2:
0325 iq[jz - 1] &= 0x3fffff;
0326 break;
0327 }
0328 }
0329 if (ih == 2)
0330 {
0331 z = one - z;
0332 if (carry != 0)
0333 z -= std::scalbn(one, q0);
0334 }
0335 }
0336
0337
0338 if (z == zero)
0339 {
0340 j = 0;
0341 for (i = jz - 1; i >= jk; i--)
0342 j |= iq[i];
0343 if (j == 0)
0344 {
0345 for (k = 1; iq[jk - k] == 0; k++)
0346 ;
0347
0348 for (i = jz + 1; i <= jz + k; i++)
0349 {
0350 f[jx + i] = (double)ipio2[jv + i];
0351 for (j = 0, fw = 0.0; j <= jx; j++)
0352 fw += x[j] * f[jx + i - j];
0353 q[i] = fw;
0354 }
0355 jz += k;
0356 goto recompute;
0357 }
0358 }
0359
0360
0361 if (z == 0.0)
0362 {
0363 jz -= 1;
0364 q0 -= 24;
0365 while (iq[jz] == 0)
0366 {
0367 jz--;
0368 q0 -= 24;
0369 }
0370 }
0371 else
0372 {
0373 z = std::scalbn(z, -q0);
0374 if (z >= two24)
0375 {
0376 fw = (double)((int32_t)(twon24 * z));
0377 iq[jz] = (int32_t)(z - two24 * fw);
0378 jz += 1;
0379 q0 += 24;
0380 iq[jz] = (int32_t)fw;
0381 }
0382 else
0383 iq[jz] = (int32_t)z;
0384 }
0385
0386
0387 fw = scalbn(one, q0);
0388 for (i = jz; i >= 0; i--)
0389 {
0390 q[i] = fw * (double)iq[i];
0391 fw *= twon24;
0392 }
0393
0394
0395 for (i = jz; i >= 0; i--)
0396 {
0397 for (fw = 0.0, k = 0; k <= jp && k <= jz - i; k++)
0398 fw += PIo2[k] * q[i + k];
0399 fq[jz - i] = fw;
0400 }
0401
0402
0403 switch (prec)
0404 {
0405 case 0:
0406 fw = 0.0;
0407 for (i = jz; i >= 0; i--)
0408 fw += fq[i];
0409 y[0] = (ih == 0) ? fw : -fw;
0410 break;
0411 case 1:
0412 case 2:
0413 fw = 0.0;
0414 for (i = jz; i >= 0; i--)
0415 fw += fq[i];
0416 y[0] = (ih == 0) ? fw : -fw;
0417 fw = fq[0] - fw;
0418 for (i = 1; i <= jz; i++)
0419 fw += fq[i];
0420 y[1] = (ih == 0) ? fw : -fw;
0421 break;
0422 case 3:
0423 for (i = jz; i > 0; i--)
0424 {
0425 fw = fq[i - 1] + fq[i];
0426 fq[i] += fq[i - 1] - fw;
0427 fq[i - 1] = fw;
0428 }
0429 for (i = jz; i > 1; i--)
0430 {
0431 fw = fq[i - 1] + fq[i];
0432 fq[i] += fq[i - 1] - fw;
0433 fq[i - 1] = fw;
0434 }
0435 for (fw = 0.0, i = jz; i >= 2; i--)
0436 fw += fq[i];
0437 if (ih == 0)
0438 {
0439 y[0] = fq[0];
0440 y[1] = fq[1];
0441 y[2] = fw;
0442 }
0443 else
0444 {
0445 y[0] = -fq[0];
0446 y[1] = -fq[1];
0447 y[2] = -fw;
0448 }
0449 }
0450 return n & 7;
0451 }
0452
0453 XSIMD_INLINE std::int32_t __ieee754_rem_pio2(double x, double* y) noexcept
0454 {
0455 static const std::int32_t two_over_pi[] = {
0456 0xA2F983,
0457 0x6E4E44,
0458 0x1529FC,
0459 0x2757D1,
0460 0xF534DD,
0461 0xC0DB62,
0462 0x95993C,
0463 0x439041,
0464 0xFE5163,
0465 0xABDEBB,
0466 0xC561B7,
0467 0x246E3A,
0468 0x424DD2,
0469 0xE00649,
0470 0x2EEA09,
0471 0xD1921C,
0472 0xFE1DEB,
0473 0x1CB129,
0474 0xA73EE8,
0475 0x8235F5,
0476 0x2EBB44,
0477 0x84E99C,
0478 0x7026B4,
0479 0x5F7E41,
0480 0x3991D6,
0481 0x398353,
0482 0x39F49C,
0483 0x845F8B,
0484 0xBDF928,
0485 0x3B1FF8,
0486 0x97FFDE,
0487 0x05980F,
0488 0xEF2F11,
0489 0x8B5A0A,
0490 0x6D1F6D,
0491 0x367ECF,
0492 0x27CB09,
0493 0xB74F46,
0494 0x3F669E,
0495 0x5FEA2D,
0496 0x7527BA,
0497 0xC7EBE5,
0498 0xF17B3D,
0499 0x0739F7,
0500 0x8A5292,
0501 0xEA6BFB,
0502 0x5FB11F,
0503 0x8D5D08,
0504 0x560330,
0505 0x46FC7B,
0506 0x6BABF0,
0507 0xCFBC20,
0508 0x9AF436,
0509 0x1DA9E3,
0510 0x91615E,
0511 0xE61B08,
0512 0x659985,
0513 0x5F14A0,
0514 0x68408D,
0515 0xFFD880,
0516 0x4D7327,
0517 0x310606,
0518 0x1556CA,
0519 0x73A8C9,
0520 0x60E27B,
0521 0xC08C6B,
0522 };
0523
0524 static const std::int32_t npio2_hw[] = {
0525 0x3FF921FB,
0526 0x400921FB,
0527 0x4012D97C,
0528 0x401921FB,
0529 0x401F6A7A,
0530 0x4022D97C,
0531 0x4025FDBB,
0532 0x402921FB,
0533 0x402C463A,
0534 0x402F6A7A,
0535 0x4031475C,
0536 0x4032D97C,
0537 0x40346B9C,
0538 0x4035FDBB,
0539 0x40378FDB,
0540 0x403921FB,
0541 0x403AB41B,
0542 0x403C463A,
0543 0x403DD85A,
0544 0x403F6A7A,
0545 0x40407E4C,
0546 0x4041475C,
0547 0x4042106C,
0548 0x4042D97C,
0549 0x4043A28C,
0550 0x40446B9C,
0551 0x404534AC,
0552 0x4045FDBB,
0553 0x4046C6CB,
0554 0x40478FDB,
0555 0x404858EB,
0556 0x404921FB,
0557 };
0558
0559
0560
0561
0562
0563
0564
0565
0566
0567
0568
0569 static const double
0570 zero
0571 = 0.00000000000000000000e+00,
0572 half = 5.00000000000000000000e-01,
0573 two24 = 1.67772160000000000000e+07,
0574 invpio2 = 6.36619772367581382433e-01,
0575 pio2_1 = 1.57079632673412561417e+00,
0576 pio2_1t = 6.07710050650619224932e-11,
0577 pio2_2 = 6.07710050630396597660e-11,
0578 pio2_2t = 2.02226624879595063154e-21,
0579 pio2_3 = 2.02226624871116645580e-21,
0580 pio2_3t = 8.47842766036889956997e-32;
0581
0582 double z = 0., w, t, r, fn;
0583 double tx[3];
0584 std::int32_t e0, i, j, nx, n, ix, hx;
0585 std::uint32_t low;
0586
0587 GET_HIGH_WORD(hx, x);
0588 ix = hx & 0x7fffffff;
0589 if (ix <= 0x3fe921fb)
0590 {
0591 y[0] = x;
0592 y[1] = 0;
0593 return 0;
0594 }
0595 if (ix < 0x4002d97c)
0596 {
0597 if (hx > 0)
0598 {
0599 z = x - pio2_1;
0600 if (ix != 0x3ff921fb)
0601 {
0602 y[0] = z - pio2_1t;
0603 y[1] = (z - y[0]) - pio2_1t;
0604 }
0605 else
0606 {
0607 z -= pio2_2;
0608 y[0] = z - pio2_2t;
0609 y[1] = (z - y[0]) - pio2_2t;
0610 }
0611 return 1;
0612 }
0613 else
0614 {
0615 z = x + pio2_1;
0616 if (ix != 0x3ff921fb)
0617 {
0618 y[0] = z + pio2_1t;
0619 y[1] = (z - y[0]) + pio2_1t;
0620 }
0621 else
0622 {
0623 z += pio2_2;
0624 y[0] = z + pio2_2t;
0625 y[1] = (z - y[0]) + pio2_2t;
0626 }
0627
0628 return -1;
0629 }
0630 }
0631 if (ix <= 0x413921fb)
0632 {
0633 t = std::fabs(x);
0634 n = (std::int32_t)(t * invpio2 + half);
0635 fn = (double)n;
0636 r = t - fn * pio2_1;
0637 w = fn * pio2_1t;
0638 if ((n < 32) && (n > 0) && (ix != npio2_hw[n - 1]))
0639 {
0640 y[0] = r - w;
0641 }
0642 else
0643 {
0644 std::uint32_t high;
0645 j = ix >> 20;
0646 y[0] = r - w;
0647 GET_HIGH_WORD(high, y[0]);
0648 i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
0649 if (i > 16)
0650 {
0651 t = r;
0652 w = fn * pio2_2;
0653 r = t - w;
0654 w = fn * pio2_2t - ((t - r) - w);
0655 y[0] = r - w;
0656 GET_HIGH_WORD(high, y[0]);
0657 i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
0658 if (i > 49)
0659 {
0660 t = r;
0661 w = fn * pio2_3;
0662 r = t - w;
0663 w = fn * pio2_3t - ((t - r) - w);
0664 y[0] = r - w;
0665 }
0666 }
0667 }
0668 y[1] = (r - y[0]) - w;
0669 if (hx < 0)
0670 {
0671 y[0] = -y[0];
0672 y[1] = -y[1];
0673 return -n;
0674 }
0675 else
0676 return n;
0677 }
0678
0679
0680
0681 if (ix >= 0x7ff00000)
0682 {
0683 y[0] = y[1] = x - x;
0684 return 0;
0685 }
0686
0687 GET_LOW_WORD(low, x);
0688 SET_LOW_WORD(z, low);
0689 e0 = (ix >> 20) - 1046;
0690 SET_HIGH_WORD(z, static_cast<uint32_t>(ix - (e0 << 20)));
0691 for (i = 0; i < 2; i++)
0692 {
0693 tx[i] = (double)((std::int32_t)(z));
0694 z = (z - tx[i]) * two24;
0695 }
0696 tx[2] = z;
0697 nx = 3;
0698 while (tx[nx - 1] == zero)
0699 nx--;
0700 n = __kernel_rem_pio2(tx, y, e0, nx, 2, two_over_pi);
0701 if (hx < 0)
0702 {
0703 y[0] = -y[0];
0704 y[1] = -y[1];
0705 return -n;
0706 }
0707 return n;
0708 }
0709 }
0710
0711 #undef XSIMD_LITTLE_ENDIAN
0712 #undef SET_LOW_WORD
0713 #undef SET_HIGH_WORD
0714 #undef GET_LOW_WORD
0715 #undef GET_HIGH_WORD
0716 #undef HIGH_WORD_IDX
0717 #undef LOW_WORD_IDX
0718 #undef ONCE0
0719 }