File indexing completed on 2025-01-18 09:40:07
0001
0002
0003
0004
0005
0006 #ifndef BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
0007 #define BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
0008
0009 #ifdef _MSC_VER
0010 #pragma once
0011 #endif
0012
0013 #include <emmintrin.h>
0014
0015 #if defined(__GNUC__) || defined(__PGI) || defined(__SUNPRO_CC)
0016 #define ALIGN16 __attribute__((__aligned__(16)))
0017 #else
0018 #define ALIGN16 __declspec(align(16))
0019 #endif
0020
0021 namespace boost{ namespace math{ namespace lanczos{
0022
0023 template <>
0024 inline double lanczos13m53::lanczos_sum<double>(const double& x)
0025 {
0026 static const ALIGN16 double coeff[26] = {
0027 static_cast<double>(2.506628274631000270164908177133837338626L),
0028 static_cast<double>(1u),
0029 static_cast<double>(210.8242777515793458725097339207133627117L),
0030 static_cast<double>(66u),
0031 static_cast<double>(8071.672002365816210638002902272250613822L),
0032 static_cast<double>(1925u),
0033 static_cast<double>(186056.2653952234950402949897160456992822L),
0034 static_cast<double>(32670u),
0035 static_cast<double>(2876370.628935372441225409051620849613599L),
0036 static_cast<double>(357423u),
0037 static_cast<double>(31426415.58540019438061423162831820536287L),
0038 static_cast<double>(2637558u),
0039 static_cast<double>(248874557.8620541565114603864132294232163L),
0040 static_cast<double>(13339535u),
0041 static_cast<double>(1439720407.311721673663223072794912393972L),
0042 static_cast<double>(45995730u),
0043 static_cast<double>(6039542586.35202800506429164430729792107L),
0044 static_cast<double>(105258076u),
0045 static_cast<double>(17921034426.03720969991975575445893111267L),
0046 static_cast<double>(150917976u),
0047 static_cast<double>(35711959237.35566804944018545154716670596L),
0048 static_cast<double>(120543840u),
0049 static_cast<double>(42919803642.64909876895789904700198885093L),
0050 static_cast<double>(39916800u),
0051 static_cast<double>(23531376880.41075968857200767445163675473L),
0052 static_cast<double>(0u)
0053 };
0054
0055 static const double lim = 4.31965e+25;
0056
0057 if (x > lim)
0058 {
0059 double z = 1 / x;
0060 return ((((((((((((coeff[24] * z + coeff[22]) * z + coeff[20]) * z + coeff[18]) * z + coeff[16]) * z + coeff[14]) * z + coeff[12]) * z + coeff[10]) * z + coeff[8]) * z + coeff[6]) * z + coeff[4]) * z + coeff[2]) * z + coeff[0]) / ((((((((((((coeff[25] * z + coeff[23]) * z + coeff[21]) * z + coeff[19]) * z + coeff[17]) * z + coeff[15]) * z + coeff[13]) * z + coeff[11]) * z + coeff[9]) * z + coeff[7]) * z + coeff[5]) * z + coeff[3]) * z + coeff[1]);
0061 }
0062
0063 __m128d vx = _mm_load1_pd(&x);
0064 __m128d sum_even = _mm_load_pd(coeff);
0065 __m128d sum_odd = _mm_load_pd(coeff+2);
0066 __m128d nc_odd, nc_even;
0067 __m128d vx2 = _mm_mul_pd(vx, vx);
0068
0069 sum_even = _mm_mul_pd(sum_even, vx2);
0070 nc_even = _mm_load_pd(coeff + 4);
0071 sum_odd = _mm_mul_pd(sum_odd, vx2);
0072 nc_odd = _mm_load_pd(coeff + 6);
0073 sum_even = _mm_add_pd(sum_even, nc_even);
0074 sum_odd = _mm_add_pd(sum_odd, nc_odd);
0075
0076 sum_even = _mm_mul_pd(sum_even, vx2);
0077 nc_even = _mm_load_pd(coeff + 8);
0078 sum_odd = _mm_mul_pd(sum_odd, vx2);
0079 nc_odd = _mm_load_pd(coeff + 10);
0080 sum_even = _mm_add_pd(sum_even, nc_even);
0081 sum_odd = _mm_add_pd(sum_odd, nc_odd);
0082
0083 sum_even = _mm_mul_pd(sum_even, vx2);
0084 nc_even = _mm_load_pd(coeff + 12);
0085 sum_odd = _mm_mul_pd(sum_odd, vx2);
0086 nc_odd = _mm_load_pd(coeff + 14);
0087 sum_even = _mm_add_pd(sum_even, nc_even);
0088 sum_odd = _mm_add_pd(sum_odd, nc_odd);
0089
0090 sum_even = _mm_mul_pd(sum_even, vx2);
0091 nc_even = _mm_load_pd(coeff + 16);
0092 sum_odd = _mm_mul_pd(sum_odd, vx2);
0093 nc_odd = _mm_load_pd(coeff + 18);
0094 sum_even = _mm_add_pd(sum_even, nc_even);
0095 sum_odd = _mm_add_pd(sum_odd, nc_odd);
0096
0097 sum_even = _mm_mul_pd(sum_even, vx2);
0098 nc_even = _mm_load_pd(coeff + 20);
0099 sum_odd = _mm_mul_pd(sum_odd, vx2);
0100 nc_odd = _mm_load_pd(coeff + 22);
0101 sum_even = _mm_add_pd(sum_even, nc_even);
0102 sum_odd = _mm_add_pd(sum_odd, nc_odd);
0103
0104 sum_even = _mm_mul_pd(sum_even, vx2);
0105 nc_even = _mm_load_pd(coeff + 24);
0106 sum_odd = _mm_mul_pd(sum_odd, vx);
0107 sum_even = _mm_add_pd(sum_even, nc_even);
0108 sum_even = _mm_add_pd(sum_even, sum_odd);
0109
0110
0111 double ALIGN16 t[2];
0112 _mm_store_pd(t, sum_even);
0113
0114 return t[0] / t[1];
0115 }
0116
0117 template <>
0118 inline double lanczos13m53::lanczos_sum_expG_scaled<double>(const double& x)
0119 {
0120 static const ALIGN16 double coeff[26] = {
0121 static_cast<double>(0.006061842346248906525783753964555936883222L),
0122 static_cast<double>(1u),
0123 static_cast<double>(0.5098416655656676188125178644804694509993L),
0124 static_cast<double>(66u),
0125 static_cast<double>(19.51992788247617482847860966235652136208L),
0126 static_cast<double>(1925u),
0127 static_cast<double>(449.9445569063168119446858607650988409623L),
0128 static_cast<double>(32670u),
0129 static_cast<double>(6955.999602515376140356310115515198987526L),
0130 static_cast<double>(357423u),
0131 static_cast<double>(75999.29304014542649875303443598909137092L),
0132 static_cast<double>(2637558u),
0133 static_cast<double>(601859.6171681098786670226533699352302507L),
0134 static_cast<double>(13339535u),
0135 static_cast<double>(3481712.15498064590882071018964774556468L),
0136 static_cast<double>(45995730u),
0137 static_cast<double>(14605578.08768506808414169982791359218571L),
0138 static_cast<double>(105258076u),
0139 static_cast<double>(43338889.32467613834773723740590533316085L),
0140 static_cast<double>(150917976u),
0141 static_cast<double>(86363131.28813859145546927288977868422342L),
0142 static_cast<double>(120543840u),
0143 static_cast<double>(103794043.1163445451906271053616070238554L),
0144 static_cast<double>(39916800u),
0145 static_cast<double>(56906521.91347156388090791033559122686859L),
0146 static_cast<double>(0u)
0147 };
0148
0149 static const double lim = 4.76886e+25;
0150
0151 if (x > lim)
0152 {
0153 double z = 1 / x;
0154 return ((((((((((((coeff[24] * z + coeff[22]) * z + coeff[20]) * z + coeff[18]) * z + coeff[16]) * z + coeff[14]) * z + coeff[12]) * z + coeff[10]) * z + coeff[8]) * z + coeff[6]) * z + coeff[4]) * z + coeff[2]) * z + coeff[0]) / ((((((((((((coeff[25] * z + coeff[23]) * z + coeff[21]) * z + coeff[19]) * z + coeff[17]) * z + coeff[15]) * z + coeff[13]) * z + coeff[11]) * z + coeff[9]) * z + coeff[7]) * z + coeff[5]) * z + coeff[3]) * z + coeff[1]);
0155 }
0156
0157 __m128d vx = _mm_load1_pd(&x);
0158 __m128d sum_even = _mm_load_pd(coeff);
0159 __m128d sum_odd = _mm_load_pd(coeff+2);
0160 __m128d nc_odd, nc_even;
0161 __m128d vx2 = _mm_mul_pd(vx, vx);
0162
0163 sum_even = _mm_mul_pd(sum_even, vx2);
0164 nc_even = _mm_load_pd(coeff + 4);
0165 sum_odd = _mm_mul_pd(sum_odd, vx2);
0166 nc_odd = _mm_load_pd(coeff + 6);
0167 sum_even = _mm_add_pd(sum_even, nc_even);
0168 sum_odd = _mm_add_pd(sum_odd, nc_odd);
0169
0170 sum_even = _mm_mul_pd(sum_even, vx2);
0171 nc_even = _mm_load_pd(coeff + 8);
0172 sum_odd = _mm_mul_pd(sum_odd, vx2);
0173 nc_odd = _mm_load_pd(coeff + 10);
0174 sum_even = _mm_add_pd(sum_even, nc_even);
0175 sum_odd = _mm_add_pd(sum_odd, nc_odd);
0176
0177 sum_even = _mm_mul_pd(sum_even, vx2);
0178 nc_even = _mm_load_pd(coeff + 12);
0179 sum_odd = _mm_mul_pd(sum_odd, vx2);
0180 nc_odd = _mm_load_pd(coeff + 14);
0181 sum_even = _mm_add_pd(sum_even, nc_even);
0182 sum_odd = _mm_add_pd(sum_odd, nc_odd);
0183
0184 sum_even = _mm_mul_pd(sum_even, vx2);
0185 nc_even = _mm_load_pd(coeff + 16);
0186 sum_odd = _mm_mul_pd(sum_odd, vx2);
0187 nc_odd = _mm_load_pd(coeff + 18);
0188 sum_even = _mm_add_pd(sum_even, nc_even);
0189 sum_odd = _mm_add_pd(sum_odd, nc_odd);
0190
0191 sum_even = _mm_mul_pd(sum_even, vx2);
0192 nc_even = _mm_load_pd(coeff + 20);
0193 sum_odd = _mm_mul_pd(sum_odd, vx2);
0194 nc_odd = _mm_load_pd(coeff + 22);
0195 sum_even = _mm_add_pd(sum_even, nc_even);
0196 sum_odd = _mm_add_pd(sum_odd, nc_odd);
0197
0198 sum_even = _mm_mul_pd(sum_even, vx2);
0199 nc_even = _mm_load_pd(coeff + 24);
0200 sum_odd = _mm_mul_pd(sum_odd, vx);
0201 sum_even = _mm_add_pd(sum_even, nc_even);
0202 sum_even = _mm_add_pd(sum_even, sum_odd);
0203
0204
0205 double ALIGN16 t[2];
0206 _mm_store_pd(t, sum_even);
0207
0208 return t[0] / t[1];
0209 }
0210
0211 #ifdef _MSC_VER
0212
0213 static_assert(sizeof(double) == sizeof(long double), "sizeof(long double) != sizeof(double) is not supported");
0214
0215 template <>
0216 inline long double lanczos13m53::lanczos_sum<long double>(const long double& x)
0217 {
0218 return lanczos_sum<double>(static_cast<double>(x));
0219 }
0220 template <>
0221 inline long double lanczos13m53::lanczos_sum_expG_scaled<long double>(const long double& x)
0222 {
0223 return lanczos_sum_expG_scaled<double>(static_cast<double>(x));
0224 }
0225 #endif
0226
0227 }
0228 }
0229 }
0230
0231 #undef ALIGN16
0232
0233 #endif
0234
0235
0236
0237
0238