/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_MATH_H_
#define VC_AVX_MATH_H_

#include "const.h"
#include "limits.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
// min & max {{{1
#ifdef Vc_IMPL_AVX2
Vc_ALWAYS_INLINE AVX2::int_v    min(const AVX2::int_v    &x, const AVX2::int_v    &y) { return _mm256_min_epi32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::uint_v   min(const AVX2::uint_v   &x, const AVX2::uint_v   &y) { return _mm256_min_epu32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::short_v  min(const AVX2::short_v  &x, const AVX2::short_v  &y) { return _mm256_min_epi16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::ushort_v min(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_min_epu16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::int_v    max(const AVX2::int_v    &x, const AVX2::int_v    &y) { return _mm256_max_epi32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::uint_v   max(const AVX2::uint_v   &x, const AVX2::uint_v   &y) { return _mm256_max_epu32(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::short_v  max(const AVX2::short_v  &x, const AVX2::short_v  &y) { return _mm256_max_epi16(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::ushort_v max(const AVX2::ushort_v &x, const AVX2::ushort_v &y) { return _mm256_max_epu16(x.data(), y.data()); }
#endif
Vc_ALWAYS_INLINE AVX2::float_v  min(const AVX2::float_v  &x, const AVX2::float_v  &y) { return _mm256_min_ps(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::double_v min(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_min_pd(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::float_v  max(const AVX2::float_v  &x, const AVX2::float_v  &y) { return _mm256_max_ps(x.data(), y.data()); }
Vc_ALWAYS_INLINE AVX2::double_v max(const AVX2::double_v &x, const AVX2::double_v &y) { return _mm256_max_pd(x.data(), y.data()); }
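
// Usage sketch (added commentary, not part of the original header): these
// overloads compose directly, so an element-wise clamp can be written as,
// e.g., assuming x, lo, hi are AVX2::float_v values:
//   AVX2::float_v clamped = min(max(x, lo), hi);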

// sqrt {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> sqrt(const AVX2::Vector<T> &x)
{
    return AVX::VectorHelper<T>::sqrt(x.data());
}

// rsqrt {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> rsqrt(const AVX2::Vector<T> &x)
{
    return AVX::VectorHelper<T>::rsqrt(x.data());
}

// reciprocal {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> reciprocal(const AVX2::Vector<T> &x)
{
    return AVX::VectorHelper<T>::reciprocal(x.data());
}
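
// Note (added commentary): for float, rsqrt and reciprocal dispatch to the
// hardware approximation instructions (vrsqrtps, vrcpps), trading precision
// for speed, while sqrt is the fully accurate IEEE square root. A hypothetical
// use, assuming x is an AVX2::float_v:
//   AVX2::float_v inv_norm = rsqrt(x);  // fast approximate 1/sqrt(x)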

// round {{{1
template <typename T>
Vc_ALWAYS_INLINE Vc_PURE AVX2::Vector<T> round(const AVX2::Vector<T> &x)
{
    return AVX::VectorHelper<T>::round(x.data());
}

// abs {{{1
Vc_INTRINSIC Vc_CONST AVX2::double_v abs(AVX2::double_v x)
{
    return Detail::and_(x.data(), AVX::setabsmask_pd());
}
Vc_INTRINSIC Vc_CONST AVX2::float_v abs(AVX2::float_v x)
{
    return Detail::and_(x.data(), AVX::setabsmask_ps());
}
#ifdef Vc_IMPL_AVX2
Vc_INTRINSIC Vc_CONST AVX2::int_v abs(AVX2::int_v x)
{
    return _mm256_abs_epi32(x.data());
}
Vc_INTRINSIC Vc_CONST AVX2::short_v abs(AVX2::short_v x)
{
    return _mm256_abs_epi16(x.data());
}
#endif
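
// Caveat (added commentary): like the underlying _mm256_abs_epi32/epi16
// intrinsics, integer abs wraps on the most negative value, e.g.
// abs(AVX2::int_v(INT_MIN)) yields INT_MIN again in every lane.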

// isfinite {{{1
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isfinite(const AVX2::double_v &x)
{
    return AVX::cmpord_pd(x.data(), _mm256_mul_pd(Detail::zero<__m256d>(), x.data()));
}

Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isfinite(const AVX2::float_v &x)
{
    return AVX::cmpord_ps(x.data(), _mm256_mul_ps(Detail::zero<__m256>(), x.data()));
}
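
// How this works (added commentary): 0 * x is ±0 for every finite x but NaN
// for x = ±inf or NaN, and an ordered compare is true only when neither
// operand is NaN, so the mask above is set exactly for the finite lanes.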

// isinf {{{1
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isinf(const AVX2::double_v &x)
{
    return _mm256_castsi256_pd(AVX::cmpeq_epi64(
        _mm256_castpd_si256(abs(x).data()),
        _mm256_castpd_si256(Detail::avx_broadcast(AVX::c_log<double>::d(1)))));
}

Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isinf(const AVX2::float_v &x)
{
    return _mm256_castsi256_ps(
        AVX::cmpeq_epi32(_mm256_castps_si256(abs(x).data()),
                         _mm256_castps_si256(Detail::avx_broadcast(AVX::c_log<float>::d(1)))));
}
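
// How this works (added commentary): after abs() clears the sign bit, x is
// ±inf iff its bit pattern equals that of +inf (the constant broadcast here,
// taken from the c_log table), so a lane-wise integer compare against that
// pattern detects the infinities.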

// isnan {{{1
Vc_ALWAYS_INLINE Vc_PURE AVX2::double_m isnan(const AVX2::double_v &x)
{
    return AVX::cmpunord_pd(x.data(), x.data());
}

Vc_ALWAYS_INLINE Vc_PURE AVX2::float_m isnan(const AVX2::float_v &x)
{
    return AVX::cmpunord_ps(x.data(), x.data());
}
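
// How this works (added commentary): NaN is the only value that compares
// unordered with itself, so cmpunord(x, x) flags exactly the NaN lanes.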

// copysign {{{1
Vc_INTRINSIC Vc_CONST AVX2::float_v copysign(AVX2::float_v mag, AVX2::float_v sign)
{
    return _mm256_or_ps(_mm256_and_ps(sign.data(), AVX::setsignmask_ps()),
                        _mm256_and_ps(mag.data(), AVX::setabsmask_ps()));
}
Vc_INTRINSIC Vc_CONST AVX2::double_v copysign(AVX2::double_v::AsArg mag,
                                              AVX2::double_v::AsArg sign)
{
    return _mm256_or_pd(_mm256_and_pd(sign.data(), AVX::setsignmask_pd()),
                        _mm256_and_pd(mag.data(), AVX::setabsmask_pd()));
}
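
// How this works (added commentary): the sign bit of `sign` is masked out and
// OR-ed onto the magnitude bits of `mag`, mirroring std::copysign; e.g. a
// hypothetical call copysign(AVX2::float_v(2.f), AVX2::float_v(-1.f)) yields
// -2.f in every lane.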

//}}}1
// frexp {{{1
/**
 * Splits \p v into exponent and mantissa; the sign is kept with the mantissa.
 *
 * The return value will be in the range [0.5, 1.0).
 * The \p e value will be an integer defining the power-of-two exponent.
 */
inline AVX2::double_v frexp(AVX2::double_v::AsArg v, SimdArray<int, 4> *e)
{
    const __m256d exponentBits = AVX::Const<double>::exponentMask().dataD();
    const __m256d exponentPart = _mm256_and_pd(v.data(), exponentBits);
    auto lo = AVX::avx_cast<__m128i>(AVX::lo128(exponentPart));
    auto hi = AVX::avx_cast<__m128i>(AVX::hi128(exponentPart));
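    // Added commentary: shifting right by 52 isolates the biased exponent in
    // the low dword of each 64-bit lane; subtracting 0x3fe (bias 0x3ff minus
    // one) converts it to the frexp convention, putting the mantissa into
    // [0.5, 1.0).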
    lo = _mm_sub_epi32(_mm_srli_epi64(lo, 52), _mm_set1_epi64x(0x3fe));
    hi = _mm_sub_epi32(_mm_srli_epi64(hi, 52), _mm_set1_epi64x(0x3fe));
    SSE::int_v exponent = Mem::shuffle<X0, X2, Y0, Y2>(lo, hi);
    const __m256d exponentMaximized = _mm256_or_pd(v.data(), exponentBits);
    AVX2::double_v ret =
        _mm256_and_pd(exponentMaximized,
                      _mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask)));
    const double_m zeroMask = v == AVX2::double_v::Zero();
    ret(isnan(v) || !isfinite(v) || zeroMask) = v;
    exponent.setZero(simd_cast<SSE::int_m>(zeroMask));
    internal_data(*e) = exponent;
    return ret;
}
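
// Usage sketch (added commentary, illustrative only):
//   Vc::SimdArray<int, 4> e;
//   AVX2::double_v m = frexp(AVX2::double_v(8.), &e);
//   // in every lane: m == 0.5 and e == 4, since 8.0 == 0.5 * 2^4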

#ifdef Vc_IMPL_AVX2
inline SimdArray<double, 8> frexp(const SimdArray<double, 8> &v, SimdArray<int, 8> *e)
{
    const __m256d exponentBits = AVX::Const<double>::exponentMask().dataD();
    const __m256d w[2] = {internal_data(internal_data0(v)).data(),
                          internal_data(internal_data1(v)).data()};
    const __m256i exponentPart[2] = {
        _mm256_castpd_si256(_mm256_and_pd(w[0], exponentBits)),
        _mm256_castpd_si256(_mm256_and_pd(w[1], exponentBits))};
    const __m256i lo = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[0], 52),
                                        _mm256_set1_epi32(0x3fe));   // 0.1. 2.3.
    const __m256i hi = _mm256_sub_epi32(_mm256_srli_epi64(exponentPart[1], 52),
                                        _mm256_set1_epi32(0x3fe));   // 4.5. 6.7.
    const __m256i a = _mm256_unpacklo_epi32(lo, hi);                 // 04.. 26..
    const __m256i b = _mm256_unpackhi_epi32(lo, hi);                 // 15.. 37..
    const __m256i tmp = _mm256_unpacklo_epi32(a, b);                 // 0145 2367
    const __m256i exponent =
        AVX::concat(_mm_unpacklo_epi64(AVX::lo128(tmp), AVX::hi128(tmp)),
                    _mm_unpackhi_epi64(AVX::lo128(tmp), AVX::hi128(tmp)));  // 0123 4567
    const __m256d exponentMaximized[2] = {_mm256_or_pd(w[0], exponentBits),
                                          _mm256_or_pd(w[1], exponentBits)};
    const auto frexpMask =
        _mm256_broadcast_sd(reinterpret_cast<const double *>(&AVX::c_general::frexpMask));
    fixed_size_simd<double, 8> ret = {
        fixed_size_simd<double, 4>(
            AVX::double_v(_mm256_and_pd(exponentMaximized[0], frexpMask))),
        fixed_size_simd<double, 4>(
            AVX::double_v(_mm256_and_pd(exponentMaximized[1], frexpMask)))};
    const auto zeroMask = v == v.Zero();
    ret(isnan(v) || !isfinite(v) || zeroMask) = v;
    internal_data(*e) =
        Detail::andnot_(simd_cast<AVX2::int_m>(zeroMask).dataI(), exponent);
    return ret;
}
#endif  // Vc_IMPL_AVX2

namespace Detail
{
Vc_INTRINSIC AVX2::float_v::IndexType extractExponent(__m256 e)
{
    SimdArray<uint, float_v::Size> exponentPart;
    const auto ee = AVX::avx_cast<__m256i>(e);
#ifdef Vc_IMPL_AVX2
    exponentPart = AVX2::uint_v(ee);
#else
    internal_data(internal_data0(exponentPart)) = AVX::lo128(ee);
    internal_data(internal_data1(exponentPart)) = AVX::hi128(ee);
#endif
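    // Added commentary: bits 30..23 hold the biased float exponent; after the
    // shift, subtracting 0x7e (bias 0x7f minus one) yields the frexp
    // convention with the mantissa in [0.5, 1.0).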
    return (exponentPart >> 23) - 0x7e;
}
}  // namespace Detail
inline AVX2::float_v frexp(AVX2::float_v::AsArg v, SimdArray<int, 8> *e)
{
    using namespace Detail;
    using namespace AVX2;
    const __m256 exponentBits = Const<float>::exponentMask().data();
    *e = extractExponent(and_(v.data(), exponentBits));
    const __m256 exponentMaximized = or_(v.data(), exponentBits);
    AVX2::float_v ret = _mm256_and_ps(exponentMaximized, avx_cast<__m256>(set1_epi32(0xbf7fffffu)));
    ret(isnan(v) || !isfinite(v) || v == AVX2::float_v::Zero()) = v;
    e->setZero(simd_cast<decltype(*e == *e)>(v == AVX2::float_v::Zero()));
    return ret;
}

// ldexp {{{1
/* ldexp(x, e) -> x * 2^e
 * x == NaN    -> NaN
 * x == (-)inf -> (-)inf
 */
inline AVX2::double_v ldexp(AVX2::double_v::AsArg v, const SimdArray<int, 4> &_e)
{
    SSE::int_v e = internal_data(_e);
    e.setZero(simd_cast<SSE::int_m>(v == AVX2::double_v::Zero()));
    const __m256i exponentBits =
        AVX::concat(_mm_slli_epi64(_mm_unpacklo_epi32(e.data(), e.data()), 52),
                    _mm_slli_epi64(_mm_unpackhi_epi32(e.data(), e.data()), 52));
    return AVX::avx_cast<__m256d>(
        AVX::add_epi64(AVX::avx_cast<__m256i>(v.data()), exponentBits));
}
inline AVX2::float_v ldexp(AVX2::float_v::AsArg v, SimdArray<int, 8> e)
{
    e.setZero(simd_cast<decltype(e == e)>(v == AVX2::float_v::Zero()));
    e <<= 23;
#ifdef Vc_IMPL_AVX2
    return {AVX::avx_cast<__m256>(
        AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())),
                                  AVX::lo128(internal_data(e).data())),
                    _mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())),
                                  AVX::hi128(internal_data(e).data()))))};
#else
    return {AVX::avx_cast<__m256>(
        AVX::concat(_mm_add_epi32(AVX::avx_cast<__m128i>(AVX::lo128(v.data())),
                                  internal_data(internal_data0(e)).data()),
                    _mm_add_epi32(AVX::avx_cast<__m128i>(AVX::hi128(v.data())),
                                  internal_data(internal_data1(e)).data())))};
#endif
}
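
// Usage sketch (added commentary, illustrative only): ldexp inverts frexp for
// finite, non-zero values, e.g. for an AVX2::float_v v:
//   Vc::SimdArray<int, 8> e;
//   AVX2::float_v m = frexp(v, &e);
//   AVX2::float_v w = ldexp(m, e);  // w reconstructs v lane by lane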

// trunc {{{1
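// Added commentary: rounding-control immediate 0x3 is _MM_FROUND_TO_ZERO,
// i.e. round toward zero (truncate).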
Vc_ALWAYS_INLINE AVX2::float_v trunc(AVX2::float_v::AsArg v)
{
    return _mm256_round_ps(v.data(), 0x3);
}
Vc_ALWAYS_INLINE AVX2::double_v trunc(AVX2::double_v::AsArg v)
{
    return _mm256_round_pd(v.data(), 0x3);
}

// floor {{{1
Vc_ALWAYS_INLINE AVX2::float_v floor(AVX2::float_v::AsArg v)
{
    return _mm256_floor_ps(v.data());
}
Vc_ALWAYS_INLINE AVX2::double_v floor(AVX2::double_v::AsArg v)
{
    return _mm256_floor_pd(v.data());
}

// ceil {{{1
Vc_ALWAYS_INLINE AVX2::float_v ceil(AVX2::float_v::AsArg v)
{
    return _mm256_ceil_ps(v.data());
}
Vc_ALWAYS_INLINE AVX2::double_v ceil(AVX2::double_v::AsArg v)
{
    return _mm256_ceil_pd(v.data());
}

// fma {{{1
template <typename T>
Vc_ALWAYS_INLINE Vector<T, VectorAbi::Avx> fma(Vector<T, VectorAbi::Avx> a,
                                               Vector<T, VectorAbi::Avx> b,
                                               Vector<T, VectorAbi::Avx> c)
{
    return Detail::fma(a.data(), b.data(), c.data(), T());
}
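
// Note (added commentary, an assumption about Detail::fma): this overload
// computes a * b + c, dispatching to a hardware fused multiply-add where the
// build enables it and otherwise falling back to a software path in Detail.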

// }}}1
}  // namespace Vc

#endif // VC_AVX_MATH_H_

// vim: foldmethod=marker