xsimd/math/xsimd_rem_pio2.hpp

0001 /***************************************************************************
0002  * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
0003  * Martin Renou                                                             *
0004  * Copyright (c) QuantStack                                                 *
0005  * Copyright (c) Serge Guelton                                              *
0006  *                                                                          *
0007  * Distributed under the terms of the BSD 3-Clause License.                 *
0008  *                                                                          *
0009  * The full license is in the file LICENSE, distributed with this software. *
0010  ****************************************************************************/
0011
0012 #include <cmath>
0013 #include <cstdint>
0014 #include <cstring>
0015
0016 namespace xsimd
0017 {
0018     namespace detail
0019     {
0020
0021         /* origin: boost/simd/arch/common/scalar/function/rem_pio2.hpp */
0022         /*
0023          * ====================================================
0024          * copyright 2016 NumScale SAS
0025          *
0026          * Distributed under the Boost Software License, Version 1.0.
0027          * (See copy at http://boost.org/LICENSE_1_0.txt)
0028          * ====================================================
0029          */
0030 #if defined(_MSC_VER)
0031 #define ONCE0                                       \
0032     __pragma(warning(push))                         \
0033         __pragma(warning(disable : 4127)) while (0) \
0034             __pragma(warning(pop)) /**/
0035 #else
0036 #define ONCE0 while (0)
0037 #endif
0038
0039         /*
0040          * ====================================================
0041          * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
0042          *
0043          * Developed at SunPro, a Sun Microsystems, Inc. business.
0044          * Permission to use, copy, modify, and distribute this
0045          * software is freely granted, provided that this notice
0046          * is preserved.
0047          * ====================================================
0048          */
0049
0050 #if defined(__GNUC__) && defined(__BYTE_ORDER__)
0051 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
0052 #define XSIMD_LITTLE_ENDIAN
0053 #endif
0054 #elif defined(_WIN32)
0055         // We can safely assume that Windows is always little endian
0056 #define XSIMD_LITTLE_ENDIAN
0057 #elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
0058 #define XSIMD_LITTLE_ENDIAN
0059 #endif
0060
0061 #ifdef XSIMD_LITTLE_ENDIAN
0062 #define LOW_WORD_IDX 0
0063 #define HIGH_WORD_IDX sizeof(std::uint32_t)
0064 #else
0065 #define LOW_WORD_IDX sizeof(std::uint32_t)
0066 #define HIGH_WORD_IDX 0
0067 #endif
0068
0069 #define GET_HIGH_WORD(i, d)                                            \
0070     do                                                                 \
0071     {                                                                  \
0072         double f = (d);                                                \
0073         std::memcpy(&(i), reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
0074                     sizeof(std::uint32_t));                            \
0075     }                                                                  \
0076     ONCE0                                                              \
0077     /**/
0078
0079 #define GET_LOW_WORD(i, d)                                            \
0080     do                                                                \
0081     {                                                                 \
0082         double f = (d);                                               \
0083         std::memcpy(&(i), reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
0084                     sizeof(std::uint32_t));                           \
0085     }                                                                 \
0086     ONCE0                                                             \
0087     /**/
0088
0089 #define SET_HIGH_WORD(d, v)                                      \
0090     do                                                           \
0091     {                                                            \
0092         double f = (d);                                          \
0093         std::uint32_t value = (v);                               \
0094         std::memcpy(reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
0095                     &value, sizeof(std::uint32_t));              \
0096         (d) = f;                                                 \
0097     }                                                            \
0098     ONCE0                                                        \
0099     /**/
0100
0101 #define SET_LOW_WORD(d, v)                                      \
0102     do                                                          \
0103     {                                                           \
0104         double f = (d);                                         \
0105         std::uint32_t value = (v);                              \
0106         std::memcpy(reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
0107                     &value, sizeof(std::uint32_t));             \
0108         (d) = f;                                                \
0109     }                                                           \
0110     ONCE0                                                       \
0111     /**/
0112
0113         /*
0114          * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
0115          * double x[],y[]; int e0,nx,prec; int ipio2[];
0116          *
0117          * __kernel_rem_pio2 return the last three digits of N with
0118          *      y = x - N*pi/2
0119          * so that |y| < pi/2.
0120          *
0121          * The method is to compute the integer (mod 8) and fraction parts of
0122          * (2/pi)*x without doing the full multiplication. In general we
0123          * skip the part of the product that are known to be a huge integer (
0124          * more accurately, = 0 mod 8 ). Thus the number of operations are
0125          * independent of the exponent of the input.
0126          *
0127          * (2/pi) is represented by an array of 24-bit integers in ipio2[].
0128          *
0129          * Input parameters:
0130          *  x[] The input value (must be positive) is broken into nx
0131          *      pieces of 24-bit integers in double precision format.
0132          *      x[i] will be the i-th 24 bit of x. The scaled exponent
0133          *      of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
0134          *      match x's up to 24 bits.
0135          *
0136          *      Example of breaking a double positive z into x[0]+x[1]+x[2]:
0137          *          e0 = ilogb(z)-23
0138          *          z  = scalbn(z,-e0)
0139          *      for i = 0,1,2
0140          *          x[i] = floor(z)
0141          *          z    = (z-x[i])*2**24
0142          *
0143          *
0144          *  y[] ouput result in an array of double precision numbers.
0145          *      The dimension of y[] is:
0146          *          24-bit  precision   1
0147          *          53-bit  precision   2
0148          *          64-bit  precision   2
0149          *          113-bit precision   3
0150          *      The actual value is the sum of them. Thus for 113-bit
0151          *      precison, one may have to do something like:
0152          *
0153          *      long double t,w,r_head, r_tail;
0154          *      t = (long double)y[2] + (long double)y[1];
0155          *      w = (long double)y[0];
0156          *      r_head = t+w;
0157          *      r_tail = w - (r_head - t);
0158          *
0159          *  e0  The exponent of x[0]
0160          *
0161          *  nx  dimension of x[]
0162          *
0163          *      prec    an integer indicating the precision:
0164          *          0   24  bits (single)
0165          *          1   53  bits (double)
0166          *          2   64  bits (extended)
0167          *          3   113 bits (quad)
0168          *
0169          *  ipio2[]
0170          *      integer array, contains the (24*i)-th to (24*i+23)-th
0171          *      bit of 2/pi after binary point. The corresponding
0172          *      floating value is
0173          *
0174          *          ipio2[i] * 2^(-24(i+1)).
0175          *
0176          * External function:
0177          *  double scalbn(), floor();
0178          *
0179          *
0180          * Here is the description of some local variables:
0181          *
0182          *  jk  jk+1 is the initial number of terms of ipio2[] needed
0183          *      in the computation. The recommended value is 2,3,4,
0184          *      6 for single, double, extended,and quad.
0185          *
0186          *  jz  local integer variable indicating the number of
0187          *      terms of ipio2[] used.
0188          *
0189          *  jx  nx - 1
0190          *
0191          *  jv  index for pointing to the suitable ipio2[] for the
0192          *      computation. In general, we want
0193          *          ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
0194          *      is an integer. Thus
0195          *          e0-3-24*jv >= 0 or (e0-3)/24 >= jv
0196          *      Hence jv = max(0,(e0-3)/24).
0197          *
0198          *  jp  jp+1 is the number of terms in PIo2[] needed, jp = jk.
0199          *
0200          *  q[] double array with integral value, representing the
0201          *      24-bits chunk of the product of x and 2/pi.
0202          *
0203          *  q0  the corresponding exponent of q[0]. Note that the
0204          *      exponent for q[i] would be q0-24*i.
0205          *
0206          *  PIo2[]  double precision array, obtained by cutting pi/2
0207          *      into 24 bits chunks.
0208          *
0209          *  f[] ipio2[] in floating point
0210          *
0211          *  iq[]    integer array by breaking up q[] in 24-bits chunk.
0212          *
0213          *  fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
0214          *
0215          *  ih  integer. If >0 it indicates q[] is >= 0.5, hence
0216          *      it also indicates the *sign* of the result.
0217          *
0218          */
0219
0220         XSIMD_INLINE int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept
0221         {
0222             static const int32_t init_jk[] = { 2, 3, 4, 6 }; /* initial value for jk */
0223
0224             static const double PIo2[] = {
0225                 1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
0226                 7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
0227                 5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
0228                 3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
0229                 1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
0230                 1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
0231                 2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
0232                 2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
0233             };
0234
0235             static const double
0236                 zero
0237                 = 0.0,
0238                 one = 1.0,
0239                 two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
0240                 twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
0241
0242             int32_t jz, jx, jv, jp, jk, carry, n, iq[20], i, j, k, m, q0, ih;
0243             double z, fw, f[20], fq[20], q[20];
0244
0245             /* initialize jk*/
0246             jk = init_jk[prec];
0247             jp = jk;
0248
0249             /* determine jx,jv,q0, note that 3>q0 */
0250             jx = nx - 1;
0251             jv = (e0 - 3) / 24;
0252             if (jv < 0)
0253                 jv = 0;
0254             q0 = e0 - 24 * (jv + 1);
0255
0256             /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
0257             j = jv - jx;
0258             m = jx + jk;
0259             for (i = 0; i <= m; i++, j++)
0260                 f[i] = (j < 0) ? zero : (double)ipio2[j];
0261
0262             /* compute q[0],q[1],...q[jk] */
0263             for (i = 0; i <= jk; i++)
0264             {
0265                 for (j = 0, fw = 0.0; j <= jx; j++)
0266                     fw += x[j] * f[jx + i - j];
0267                 q[i] = fw;
0268             }
0269
0270             jz = jk;
0271
0272         recompute:
0273             /* distill q[] into iq[] reversingly */
0274             for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--)
0275             {
0276                 fw = (double)((int32_t)(twon24 * z));
0277                 iq[i] = (int)(z - two24 * fw);
0278                 z = q[j - 1] + fw;
0279             }
0280
0281             /* compute n */
0282             z = std::scalbn(z, q0); /* actual value of z */
0283             z -= 8.0 * std::floor(z * 0.125); /* trim off integer >= 8 */
0284             n = (int32_t)z;
0285             z -= (double)n;
0286             ih = 0;
0287             if (q0 > 0)
0288             { /* need iq[jz-1] to determine n */
0289                 i = (iq[jz - 1] >> (24 - q0));
0290                 n += i;
0291                 iq[jz - 1] -= i << (24 - q0);
0292                 ih = iq[jz - 1] >> (23 - q0);
0293             }
0294             else if (q0 == 0)
0295                 ih = iq[jz - 1] >> 23;
0296             else if (z >= 0.5)
0297                 ih = 2;
0298
0299             if (ih > 0)
0300             { /* q > 0.5 */
0301                 n += 1;
0302                 carry = 0;
0303                 for (i = 0; i < jz; i++)
0304                 { /* compute 1-q */
0305                     j = iq[i];
0306                     if (carry == 0)
0307                     {
0308                         if (j != 0)
0309                         {
0310                             carry = 1;
0311                             iq[i] = 0x1000000 - j;
0312                         }
0313                     }
0314                     else
0315                         iq[i] = 0xffffff - j;
0316                 }
0317                 if (q0 > 0)
0318                 { /* rare case: chance is 1 in 12 */
0319                     switch (q0)
0320                     {
0321                     case 1:
0322                         iq[jz - 1] &= 0x7fffff;
0323                         break;
0324                     case 2:
0325                         iq[jz - 1] &= 0x3fffff;
0326                         break;
0327                     }
0328                 }
0329                 if (ih == 2)
0330                 {
0331                     z = one - z;
0332                     if (carry != 0)
0333                         z -= std::scalbn(one, q0);
0334                 }
0335             }
0336
0337             /* check if recomputation is needed */
0338             if (z == zero)
0339             {
0340                 j = 0;
0341                 for (i = jz - 1; i >= jk; i--)
0342                     j |= iq[i];
0343                 if (j == 0)
0344                 { /* need recomputation */
0345                     for (k = 1; iq[jk - k] == 0; k++)
0346                         ; /* k = no. of terms needed */
0347
0348                     for (i = jz + 1; i <= jz + k; i++)
0349                     { /* add q[jz+1] to q[jz+k] */
0350                         f[jx + i] = (double)ipio2[jv + i];
0351                         for (j = 0, fw = 0.0; j <= jx; j++)
0352                             fw += x[j] * f[jx + i - j];
0353                         q[i] = fw;
0354                     }
0355                     jz += k;
0356                     goto recompute;
0357                 }
0358             }
0359
0360             /* chop off zero terms */
0361             if (z == 0.0)
0362             {
0363                 jz -= 1;
0364                 q0 -= 24;
0365                 while (iq[jz] == 0)
0366                 {
0367                     jz--;
0368                     q0 -= 24;
0369                 }
0370             }
0371             else
0372             { /* break z into 24-bit if necessary */
0373                 z = std::scalbn(z, -q0);
0374                 if (z >= two24)
0375                 {
0376                     fw = (double)((int32_t)(twon24 * z));
0377                     iq[jz] = (int32_t)(z - two24 * fw);
0378                     jz += 1;
0379                     q0 += 24;
0380                     iq[jz] = (int32_t)fw;
0381                 }
0382                 else
0383                     iq[jz] = (int32_t)z;
0384             }
0385
0386             /* convert integer "bit" chunk to floating-point value */
0387             fw = scalbn(one, q0);
0388             for (i = jz; i >= 0; i--)
0389             {
0390                 q[i] = fw * (double)iq[i];
0391                 fw *= twon24;
0392             }
0393
0394             /* compute PIo2[0,...,jp]*q[jz,...,0] */
0395             for (i = jz; i >= 0; i--)
0396             {
0397                 for (fw = 0.0, k = 0; k <= jp && k <= jz - i; k++)
0398                     fw += PIo2[k] * q[i + k];
0399                 fq[jz - i] = fw;
0400             }
0401
0402             /* compress fq[] into y[] */
0403             switch (prec)
0404             {
0405             case 0:
0406                 fw = 0.0;
0407                 for (i = jz; i >= 0; i--)
0408                     fw += fq[i];
0409                 y[0] = (ih == 0) ? fw : -fw;
0410                 break;
0411             case 1:
0412             case 2:
0413                 fw = 0.0;
0414                 for (i = jz; i >= 0; i--)
0415                     fw += fq[i];
0416                 y[0] = (ih == 0) ? fw : -fw;
0417                 fw = fq[0] - fw;
0418                 for (i = 1; i <= jz; i++)
0419                     fw += fq[i];
0420                 y[1] = (ih == 0) ? fw : -fw;
0421                 break;
0422             case 3: /* painful */
0423                 for (i = jz; i > 0; i--)
0424                 {
0425                     fw = fq[i - 1] + fq[i];
0426                     fq[i] += fq[i - 1] - fw;
0427                     fq[i - 1] = fw;
0428                 }
0429                 for (i = jz; i > 1; i--)
0430                 {
0431                     fw = fq[i - 1] + fq[i];
0432                     fq[i] += fq[i - 1] - fw;
0433                     fq[i - 1] = fw;
0434                 }
0435                 for (fw = 0.0, i = jz; i >= 2; i--)
0436                     fw += fq[i];
0437                 if (ih == 0)
0438                 {
0439                     y[0] = fq[0];
0440                     y[1] = fq[1];
0441                     y[2] = fw;
0442                 }
0443                 else
0444                 {
0445                     y[0] = -fq[0];
0446                     y[1] = -fq[1];
0447                     y[2] = -fw;
0448                 }
0449             }
0450             return n & 7;
0451         }
0452
0453         XSIMD_INLINE std::int32_t __ieee754_rem_pio2(double x, double* y) noexcept
0454         {
0455             static const std::int32_t two_over_pi[] = {
0456                 0xA2F983,
0457                 0x6E4E44,
0458                 0x1529FC,
0459                 0x2757D1,
0460                 0xF534DD,
0461                 0xC0DB62,
0462                 0x95993C,
0463                 0x439041,
0464                 0xFE5163,
0465                 0xABDEBB,
0466                 0xC561B7,
0467                 0x246E3A,
0468                 0x424DD2,
0469                 0xE00649,
0470                 0x2EEA09,
0471                 0xD1921C,
0472                 0xFE1DEB,
0473                 0x1CB129,
0474                 0xA73EE8,
0475                 0x8235F5,
0476                 0x2EBB44,
0477                 0x84E99C,
0478                 0x7026B4,
0479                 0x5F7E41,
0480                 0x3991D6,
0481                 0x398353,
0482                 0x39F49C,
0483                 0x845F8B,
0484                 0xBDF928,
0485                 0x3B1FF8,
0486                 0x97FFDE,
0487                 0x05980F,
0488                 0xEF2F11,
0489                 0x8B5A0A,
0490                 0x6D1F6D,
0491                 0x367ECF,
0492                 0x27CB09,
0493                 0xB74F46,
0494                 0x3F669E,
0495                 0x5FEA2D,
0496                 0x7527BA,
0497                 0xC7EBE5,
0498                 0xF17B3D,
0499                 0x0739F7,
0500                 0x8A5292,
0501                 0xEA6BFB,
0502                 0x5FB11F,
0503                 0x8D5D08,
0504                 0x560330,
0505                 0x46FC7B,
0506                 0x6BABF0,
0507                 0xCFBC20,
0508                 0x9AF436,
0509                 0x1DA9E3,
0510                 0x91615E,
0511                 0xE61B08,
0512                 0x659985,
0513                 0x5F14A0,
0514                 0x68408D,
0515                 0xFFD880,
0516                 0x4D7327,
0517                 0x310606,
0518                 0x1556CA,
0519                 0x73A8C9,
0520                 0x60E27B,
0521                 0xC08C6B,
0522             };
0523
0524             static const std::int32_t npio2_hw[] = {
0525                 0x3FF921FB,
0526                 0x400921FB,
0527                 0x4012D97C,
0528                 0x401921FB,
0529                 0x401F6A7A,
0530                 0x4022D97C,
0531                 0x4025FDBB,
0532                 0x402921FB,
0533                 0x402C463A,
0534                 0x402F6A7A,
0535                 0x4031475C,
0536                 0x4032D97C,
0537                 0x40346B9C,
0538                 0x4035FDBB,
0539                 0x40378FDB,
0540                 0x403921FB,
0541                 0x403AB41B,
0542                 0x403C463A,
0543                 0x403DD85A,
0544                 0x403F6A7A,
0545                 0x40407E4C,
0546                 0x4041475C,
0547                 0x4042106C,
0548                 0x4042D97C,
0549                 0x4043A28C,
0550                 0x40446B9C,
0551                 0x404534AC,
0552                 0x4045FDBB,
0553                 0x4046C6CB,
0554                 0x40478FDB,
0555                 0x404858EB,
0556                 0x404921FB,
0557             };
0558
0559             /*
0560              * invpio2:  53 bits of 2/pi
0561              * pio2_1:   first  33 bit of pi/2
0562              * pio2_1t:  pi/2 - pio2_1
0563              * pio2_2:   second 33 bit of pi/2
0564              * pio2_2t:  pi/2 - (pio2_1+pio2_2)
0565              * pio2_3:   third  33 bit of pi/2
0566              * pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
0567              */
0568
0569             static const double
0570                 zero
0571                 = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
0572                 half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
0573                 two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
0574                 invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
0575                 pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
0576                 pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
0577                 pio2_2 = 6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
0578                 pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
0579                 pio2_3 = 2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
0580                 pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
0581
0582             double z = 0., w, t, r, fn;
0583             double tx[3];
0584             std::int32_t e0, i, j, nx, n, ix, hx;
0585             std::uint32_t low;
0586
0587             GET_HIGH_WORD(hx, x); /* high word of x */
0588             ix = hx & 0x7fffffff;
0589             if (ix <= 0x3fe921fb) /* |x| ~<= pi/4 , no need for reduction */
0590             {
0591                 y[0] = x;
0592                 y[1] = 0;
0593                 return 0;
0594             }
0595             if (ix < 0x4002d97c)
0596             { /* |x| < 3pi/4, special case with n=+-1 */
0597                 if (hx > 0)
0598                 {
0599                     z = x - pio2_1;
0600                     if (ix != 0x3ff921fb)
0601                     { /* 33+53 bit pi is good enough */
0602                         y[0] = z - pio2_1t;
0603                         y[1] = (z - y[0]) - pio2_1t;
0604                     }
0605                     else
0606                     { /* near pi/2, use 33+33+53 bit pi */
0607                         z -= pio2_2;
0608                         y[0] = z - pio2_2t;
0609                         y[1] = (z - y[0]) - pio2_2t;
0610                     }
0611                     return 1;
0612                 }
0613                 else
0614                 { /* negative x */
0615                     z = x + pio2_1;
0616                     if (ix != 0x3ff921fb)
0617                     { /* 33+53 bit pi is good enough */
0618                         y[0] = z + pio2_1t;
0619                         y[1] = (z - y[0]) + pio2_1t;
0620                     }
0621                     else
0622                     { /* near pi/2, use 33+33+53 bit pi */
0623                         z += pio2_2;
0624                         y[0] = z + pio2_2t;
0625                         y[1] = (z - y[0]) + pio2_2t;
0626                     }
0627
0628                     return -1;
0629                 }
0630             }
0631             if (ix <= 0x413921fb)
0632             { /* |x| ~<= 2^19*(pi/2), medium_ size */
0633                 t = std::fabs(x);
0634                 n = (std::int32_t)(t * invpio2 + half);
0635                 fn = (double)n;
0636                 r = t - fn * pio2_1;
0637                 w = fn * pio2_1t; /* 1st round good to 85 bit */
0638                 if ((n < 32) && (n > 0) && (ix != npio2_hw[n - 1]))
0639                 {
0640                     y[0] = r - w; /* quick check no cancellation */
0641                 }
0642                 else
0643                 {
0644                     std::uint32_t high;
0645                     j = ix >> 20;
0646                     y[0] = r - w;
0647                     GET_HIGH_WORD(high, y[0]);
0648                     i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
0649                     if (i > 16)
0650                     { /* 2nd iteration needed, good to 118 */
0651                         t = r;
0652                         w = fn * pio2_2;
0653                         r = t - w;
0654                         w = fn * pio2_2t - ((t - r) - w);
0655                         y[0] = r - w;
0656                         GET_HIGH_WORD(high, y[0]);
0657                         i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
0658                         if (i > 49)
0659                         { /* 3rd iteration need, 151 bits acc */
0660                             t = r; /* will cover all possible cases */
0661                             w = fn * pio2_3;
0662                             r = t - w;
0663                             w = fn * pio2_3t - ((t - r) - w);
0664                             y[0] = r - w;
0665                         }
0666                     }
0667                 }
0668                 y[1] = (r - y[0]) - w;
0669                 if (hx < 0)
0670                 {
0671                     y[0] = -y[0];
0672                     y[1] = -y[1];
0673                     return -n;
0674                 }
0675                 else
0676                     return n;
0677             }
0678             /*
0679              * all other (large) arguments
0680              */
0681             if (ix >= 0x7ff00000)
0682             { /* x is inf or NaN */
0683                 y[0] = y[1] = x - x;
0684                 return 0;
0685             }
0686             /* set z = scalbn(|x|,ilogb(x)-23) */
0687             GET_LOW_WORD(low, x);
0688             SET_LOW_WORD(z, low);
0689             e0 = (ix >> 20) - 1046; /* e0 = ilogb(z)-23; */
0690             SET_HIGH_WORD(z, static_cast<uint32_t>(ix - (e0 << 20)));
0691             for (i = 0; i < 2; i++)
0692             {
0693                 tx[i] = (double)((std::int32_t)(z));
0694                 z = (z - tx[i]) * two24;
0695             }
0696             tx[2] = z;
0697             nx = 3;
0698             while (tx[nx - 1] == zero)
0699                 nx--; /* skip zero term */
0700             n = __kernel_rem_pio2(tx, y, e0, nx, 2, two_over_pi);
0701             if (hx < 0)
0702             {
0703                 y[0] = -y[0];
0704                 y[1] = -y[1];
0705                 return -n;
0706             }
0707             return n;
0708         }
0709     }
0710
0711 #undef XSIMD_LITTLE_ENDIAN
0712 #undef SET_LOW_WORD
0713 #undef SET_HIGH_WORD
0714 #undef GET_LOW_WORD
0715 #undef GET_HIGH_WORD
0716 #undef HIGH_WORD_IDX
0717 #undef LOW_WORD_IDX
0718 #undef ONCE0
0719 }