// File indexing completed on 2025-01-31 10:25:44
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028 #ifndef VC_SSE_MATH_H_
0029 #define VC_SSE_MATH_H_
0030
0031 #include "const.h"
0032 #include "macros.h"
0033
0034 namespace Vc_VERSIONED_NAMESPACE
0035 {
0036
0037 Vc_INTRINSIC Vc_CONST SSE::float_v copysign(SSE::float_v mag, SSE::float_v sign)
0038 {
0039 return _mm_or_ps(_mm_and_ps(sign.data(), SSE::_mm_setsignmask_ps()),
0040 _mm_and_ps(mag.data(), SSE::_mm_setabsmask_ps()));
0041 }
0042 Vc_INTRINSIC Vc_CONST SSE::double_v copysign(SSE::double_v mag, SSE::double_v sign)
0043 {
0044 return _mm_or_pd(_mm_and_pd(sign.data(), SSE::_mm_setsignmask_pd()),
0045 _mm_and_pd(mag.data(), SSE::_mm_setabsmask_pd()));
0046 }
0047
0048
0049
0050
0051
0052
0053
0054
0055
// Decomposes each element of v into a fraction with magnitude in [0.5, 1)
// and an integral power of two, so that v == ret * 2^(*e) element-wise
// (like std::frexp). NaN, infinities and zeros are returned unchanged; the
// stored exponent is zeroed only for zero inputs.
inline SSE::double_v frexp(const SSE::double_v &v,
                           SimdArray<int, 2, Scalar::int_v, 1> *e)
{
    // Isolate the 11-bit biased exponent field of each double.
    const __m128i exponentBits = SSE::Const<double>::exponentMask().dataI();
    const __m128i exponentPart = _mm_and_si128(_mm_castpd_si128(v.data()), exponentBits);
    // Shift the biased exponent down to bit 0 of each 64-bit lane and subtract
    // 0x3fe (bias - 1) so that a fraction in [0.5, 1) gets the right exponent.
    SSE::int_v exponent =
        _mm_sub_epi32(_mm_srli_epi64(exponentPart, 52), _mm_set1_epi32(0x3fe));
    // Set the exponent field to all-ones, then AND with frexpMask, which
    // rewrites it to the fixed exponent of 0.5 while keeping sign + mantissa.
    const __m128d exponentMaximized = _mm_or_pd(v.data(), _mm_castsi128_pd(exponentBits));
    SSE::double_v ret = _mm_and_pd(
        exponentMaximized,
        _mm_load_pd(reinterpret_cast<const double *>(&SSE::c_general::frexpMask[0])));
    SSE::double_m zeroMask = v == SSE::double_v::Zero();
    // Special cases pass the input through unchanged (matches std::frexp,
    // whose exponent output is unspecified for NaN/inf).
    ret(isnan(v) || !isfinite(v) || zeroMask) = v;
    exponent.setZero(zeroMask.data());
    // After the 64-bit shift the two results live in 32-bit lanes 0 and 2.
    (*e)[0] = exponent[0];
    (*e)[1] = exponent[2];
    return ret;
}
// Decomposes each element of v into a fraction with magnitude in [0.5, 1)
// and an integral power of two, so that v == ret * 2^(*e) element-wise
// (like std::frexp). NaN, infinities and zeros are returned unchanged; the
// stored exponents are zeroed for zero inputs.
inline SSE::float_v frexp(const SSE::float_v &v, SimdArray<int, 4, SSE::int_v, 4> *e)
{
    // Isolate the 8-bit biased exponent field of each float.
    const __m128i exponentBits = SSE::Const<float>::exponentMask().dataI();
    const __m128i exponentPart = _mm_and_si128(_mm_castps_si128(v.data()), exponentBits);
    // Shift the biased exponent down to bit 0 and subtract 0x7e (bias - 1),
    // writing the result directly into the SimdArray's internal SSE vector.
    internal_data(*e) =
        _mm_sub_epi32(_mm_srli_epi32(exponentPart, 23), _mm_set1_epi32(0x7e));
    // Set the exponent field to all-ones, then AND with 0xbf7fffff, which
    // rewrites it to the fixed exponent of 0.5 while keeping sign + mantissa.
    const __m128 exponentMaximized = _mm_or_ps(v.data(), _mm_castsi128_ps(exponentBits));
    SSE::float_v ret =
        _mm_and_ps(exponentMaximized, _mm_castsi128_ps(_mm_set1_epi32(0xbf7fffffu)));
    // Special cases pass the input through unchanged (matches std::frexp,
    // whose exponent output is unspecified for NaN/inf).
    ret(isnan(v) || !isfinite(v) || v == SSE::float_v::Zero()) = v;
    e->setZero(v == SSE::float_v::Zero());
    return ret;
}
0087
0088
0089
0090
0091
// Computes v * 2^e element-wise (like std::ldexp) by adding e directly into
// the biased exponent field of each double.
// NOTE(review): there is no clamping — exponent overflow/underflow or
// denormal inputs will not produce the IEEE-correct result; presumably the
// caller guarantees inputs stay in range (matches the frexp output above).
inline SSE::double_v ldexp(SSE::double_v::AsArg v,
                           const SimdArray<int, 2, Scalar::int_v, 1> &_e)
{
    // Place the two exponents into 32-bit lanes 0 and 2 so each aligns with
    // the low half of its 64-bit element.
    SSE::int_v e = _mm_setr_epi32(_e[0], 0, _e[1], 0);
    // ldexp(±0, e) must remain ±0: zero the exponent delta for zero inputs.
    e.setZero((v == SSE::double_v::Zero()).dataI());
    // Adding e << 52 to the bit pattern increments the biased exponent by e.
    const __m128i exponentBits = _mm_slli_epi64(e.data(), 52);
    return _mm_castsi128_pd(_mm_add_epi64(_mm_castpd_si128(v.data()), exponentBits));
}
// Computes v * 2^e element-wise (like std::ldexp) by adding e directly into
// the biased exponent field of each float.
// NOTE(review): no clamping — exponent overflow/underflow or denormal inputs
// will not produce the IEEE-correct result (same caveat as the double
// overload above).
inline SSE::float_v ldexp(SSE::float_v::AsArg v,
                          const SimdArray<int, 4, SSE::int_v, 4> &_e)
{
    SSE::int_v e = internal_data(_e);
    // ldexp(±0, e) must remain ±0: zero the exponent delta for zero inputs.
    e.setZero(simd_cast<SSE::int_m>(v == SSE::float_v::Zero()));
    // Adding e << 23 to the bit pattern increments the biased exponent by e.
    return reinterpret_components_cast<SSE::float_v>(
        reinterpret_components_cast<SSE::int_v>(v) + (e << 23));
}
0108
#ifdef Vc_IMPL_SSE4_1
// With SSE4.1 available, trunc/floor/ceil map directly onto the hardware
// rounding intrinsics. 0x3 == _MM_FROUND_TO_ZERO (truncate toward zero).
inline SSE::double_v trunc(SSE::double_v::AsArg v) { return _mm_round_pd(v.data(), 0x3); }
inline SSE::float_v trunc(SSE::float_v::AsArg v) { return _mm_round_ps(v.data(), 0x3); }

inline SSE::double_v floor(SSE::double_v::AsArg v) { return _mm_floor_pd(v.data()); }
inline SSE::float_v floor(SSE::float_v::AsArg v) { return _mm_floor_ps(v.data()); }

inline SSE::double_v ceil(SSE::double_v::AsArg v) { return _mm_ceil_pd(v.data()); }
inline SSE::float_v ceil(SSE::float_v::AsArg v) { return _mm_ceil_ps(v.data()); }
0118 #else
// Pre-SSE4.1 fallback: element-wise round toward zero (like std::trunc).
inline SSE::Vector<float> trunc(SSE::Vector<float> x)
{
    // Truncate via int32 conversion; only valid where |x| fits in int32.
    const auto truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(x.data()));
    // Mask is true where the exponent field is below that of 2^23
    // (0x4b000000), i.e. |x| < 2^23 — the lanes that may still carry
    // fractional bits AND fit in int32. Lanes with |x| >= 2^23, inf and NaN
    // compare false and are passed through unchanged (already integral or
    // special). (The name reads inverted relative to that semantics.)
    const auto no_fractional_values = _mm_castsi128_ps(_mm_cmplt_epi32(
        _mm_and_si128(_mm_castps_si128(x.data()), _mm_set1_epi32(0x7f800000u)),
        _mm_set1_epi32(0x4b000000)));

    // Blend: truncated result where the conversion was valid, original x
    // elsewhere.
    return _mm_or_ps(_mm_andnot_ps(no_fractional_values, x.data()),
                     _mm_and_ps(no_fractional_values, truncated));
}
0129
0130 inline SSE::Vector<double> trunc(SSE::Vector<double> x)
0131 {
0132 const auto abs_x = Vc::abs(x).data();
0133 const auto min_no_fractional_bits =
0134 _mm_castsi128_pd(_mm_set1_epi64x(0x4330000000000000ull));
0135 __m128d truncated =
0136 _mm_sub_pd(_mm_add_pd(abs_x, min_no_fractional_bits), min_no_fractional_bits);
0137
0138
0139 truncated = _mm_sub_pd(truncated,
0140 _mm_and_pd(_mm_cmplt_pd(abs_x, truncated), _mm_set1_pd(1.)));
0141
0142 return _mm_or_pd(
0143 _mm_and_pd(_mm_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ull)), x.data()),
0144 truncated);
0145 }
0146
0147 template <typename T> inline SSE::Vector<T> floor(SSE::Vector<T> x)
0148 {
0149 auto y = trunc(x);
0150 y(!(y == x) && x < 0) -= 1;
0151 return y;
0152 }
0153
0154 template <typename T> inline SSE::Vector<T> ceil(SSE::Vector<T> x)
0155 {
0156 auto y = trunc(x);
0157 y(!(y == x || x < 0)) += 1;
0158 return y;
0159 }
0160 #endif
0161
// Element-wise fused multiply-add: returns a * b + c.
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Sse> fma(Vector<T, VectorAbi::Sse> a,
                                               Vector<T, VectorAbi::Sse> b,
                                               Vector<T, VectorAbi::Sse> c)
{
    // NOTE(review): this relies on VectorHelper<T>::fma writing the result
    // into its first argument through the reference a.data() returns —
    // confirm against VectorHelper's declaration. `a` is a by-value copy, so
    // the caller's vector is not modified; the updated copy is returned.
    SSE::VectorHelper<T>::fma(a.data(), b.data(), c.data());
    return a;
}
0170
0171 }
0172
0173 #endif
0174
0175