/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_VECTORHELPER_H_
#define VC_AVX_VECTORHELPER_H_

#include <limits>
#include "types.h"
#include "intrinsics.h"
#include "casts.h"
#include "../common/loadstoreflags.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
        template<> struct VectorHelper<__m256>
        {
            typedef __m256 VectorType;
            typedef const VectorType VTArg;

            template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfAligned               = nullptr) { _mm256_store_ps(mem, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_ps(mem, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfStreaming             = nullptr) { _mm256_stream_ps(mem, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_ps()); }

            template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
        };
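
        // Usage sketch (illustrative only): the Flags template parameter comes from
        // ../common/loadstoreflags.h, and SFINAE on its nested EnableIf* typedefs picks
        // exactly one of the store overloads above.  The flag type names below are
        // placeholders, not necessarily the spelling used by this Vc version:
        //
        //   alignas(32) float out[8];
        //   const __m256 v = _mm256_set1_ps(1.f);
        //   VectorHelper<__m256>::store<SomeAlignedFlags>(out, v);   // -> _mm256_store_ps
        //   VectorHelper<__m256>::store<SomeStreamingFlags>(out, v); // -> _mm256_stream_ps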

        template<> struct VectorHelper<__m256d>
        {
            typedef __m256d VectorType;
            typedef const VectorType VTArg;

            template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfAligned               = nullptr) { _mm256_store_pd(mem, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_pd(mem, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfStreaming             = nullptr) { _mm256_stream_pd(mem, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_pd()); }

            template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
            template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
        };

        template<> struct VectorHelper<__m256i>
        {
            typedef __m256i VectorType;
            typedef const VectorType VTArg;

            template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfAligned               = nullptr) { _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x); }
            template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x); }
            template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfStreaming             = nullptr) { _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x); }
            template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_si256()); }

            template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
            template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
        };

#define Vc_OP1(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return Vc_CAT2(_mm256_##op##_, Vc_SUFFIX)(a); }
#define Vc_OP(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(op##_ , Vc_SUFFIX)(a, b); }
#define Vc_OP_(op) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op    , Vc_SUFFIX)(a, b); }
#define Vc_OPx(op, op2) \
        static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op2##_, Vc_SUFFIX)(a, b); }
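
// Example expansion: inside VectorHelper<double> below, where Vc_SUFFIX is defined
// as pd, Vc_OP1(sqrt) generates
//   static Vc_INTRINSIC VectorType Vc_CONST sqrt(VTArg a) { return _mm256_sqrt_pd(a); }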

        template<> struct VectorHelper<double> {
            typedef __m256d VectorType;
            typedef const VectorType VTArg;
            typedef double EntryType;
#define Vc_SUFFIX pd

            static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(_mm256_castps_pd(mask), a); }
            static Vc_ALWAYS_INLINE VectorType set(const double a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
            static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) {
                return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d);
            }
            static Vc_ALWAYS_INLINE VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
            static Vc_ALWAYS_INLINE VectorType one()  { return Vc_CAT2(setone_, Vc_SUFFIX)(); }// set(1.); }

            static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
                v1 = _mm256_macc_pd(v1, v2, v3);
#else
                VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
                VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(Vc_GCC) && Vc_GCC < 0x40703
                // GCC before 4.7.3 uses an incorrect optimization where it replaces the subtraction with an andnot
                // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=54703
                asm("":"+x"(h1), "+x"(h2));
#endif
                const VectorType l1 = _mm256_sub_pd(v1, h1);
                const VectorType l2 = _mm256_sub_pd(v2, h2);
                const VectorType ll = mul(l1, l2);
                const VectorType lh = add(mul(l1, h2), mul(h1, l2));
                const VectorType hh = mul(h1, h2);
                // ll < lh < hh for all entries is certain
                const VectorType lh_lt_v3 = cmplt_pd(abs(lh), abs(v3)); // |lh| < |v3|
                const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3);
                const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3);
                v1 = add(add(ll, b), add(c, hh));
#endif
            }
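            // The #else branch above emulates fma(v1, v2, v3) without an FMA instruction:
            // c_general::highMaskDouble keeps only the upper mantissa bits, splitting each
            // factor into a high part plus a low remainder so the partial products
            // h1*h2, h1*l2 + l1*h2 and l1*l2 carry little rounding error, and the final
            // sums are ordered by magnitude to limit cancellation.  A rough scalar sketch
            // of the same idea (illustrative only, not part of Vc):
            //
            //   split a = ha + la and b = hb + lb, then with cross = la*hb + ha*lb,
            //   a*b + c  ~=  (la*lb + smaller(c, cross)) + (larger(c, cross) + ha*hb),
            //   where smaller/larger pick by absolute value, as the blendv_pd calls do.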

            static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_pd(a,b); }
            static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_pd(a,b); }
            static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_pd(a,b); }

            Vc_OP1(sqrt)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) {
                return _mm256_div_pd(one(), sqrt(x));
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
                return _mm256_div_pd(one(), x);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
                return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_pd());
            }

            static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_pd(a, b); }
            static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_pd(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
                __m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
                b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
                return _mm_cvtsd_f64(b);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
                __m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
                b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
                return _mm_cvtsd_f64(b);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
                __m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
                b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
                return _mm_cvtsd_f64(b);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
                __m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
                b = _mm_hadd_pd(b, b); // or: b = _mm_add_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
                return _mm_cvtsd_f64(b);
            }
#undef Vc_SUFFIX
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
                return _mm256_round_pd(a, _MM_FROUND_NINT);
            }
        };
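
        // The horizontal reductions above follow the usual pattern: fold the upper
        // 128-bit lane onto the lower one, then combine the remaining two entries.
        // Scalar equivalent of add(a) for the four doubles a0..a3 (illustrative only):
        //
        //   (a0 + a2) + (a1 + a3)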

        template<> struct VectorHelper<float> {
            typedef float EntryType;
            typedef __m256 VectorType;
            typedef const VectorType VTArg;
#define Vc_SUFFIX ps

            static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(mask, a); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,
                    const float e, const float f, const float g, const float h) {
                return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType one()  { return Vc_CAT2(setone_, Vc_SUFFIX)(); }// set(1.f); }
            static Vc_ALWAYS_INLINE Vc_CONST __m256 concat(__m256d a, __m256d b) { return _mm256_insertf128_ps(avx_cast<__m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }

            static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
                v1 = _mm256_macc_ps(v1, v2, v3);
#else
                // Without FMA4: widen each 128-bit half to double precision (where the
                // products of single-precision values are exact), multiply-add there,
                // and narrow the two halves back into a single __m256.
                __m256d v1_0 = _mm256_cvtps_pd(lo128(v1));
                __m256d v1_1 = _mm256_cvtps_pd(hi128(v1));
                __m256d v2_0 = _mm256_cvtps_pd(lo128(v2));
                __m256d v2_1 = _mm256_cvtps_pd(hi128(v2));
                __m256d v3_0 = _mm256_cvtps_pd(lo128(v3));
                __m256d v3_1 = _mm256_cvtps_pd(hi128(v3));
                v1 = AVX::concat(
                        _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
                        _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
#endif
            }

            static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_ps(a, b); }
            static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_ps(a, b); }
            static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_ps(a, b); }

            Vc_OP1(sqrt) Vc_OP1(rsqrt)
            static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
                return _mm256_rcp_ps(x);
            }
            static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
                return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_ps());
            }

            static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_ps(a, b); }
            static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_ps(a, b); }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
                __m128 b = _mm_min_ps(lo128(a), hi128(a));
                b = _mm_min_ps(b, _mm_movehl_ps(b, b));   // b = min(a0, a2), min(a1, a3), min(a2, a2), min(a3, a3)
                b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = min(a0, a1), a1, a2, a3
                return _mm_cvtss_f32(b);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
                __m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
                b = _mm_max_ps(b, _mm_movehl_ps(b, b));   // b = max(a0, a2), max(a1, a3), max(a2, a2), max(a3, a3)
                b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1))); // b = max(a0, a1), a1, a2, a3
                return _mm_cvtss_f32(b);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
                __m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
                b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
                b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
                return _mm_cvtss_f32(b);
            }
            static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
                __m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
                b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
                b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
                return _mm_cvtss_f32(b);
            }
#undef Vc_SUFFIX
            static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
                return _mm256_round_ps(a, _MM_FROUND_NINT);
            }
        };
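
        // Usage sketch for the float helper (illustrative only):
        //
        //   const __m256 v = VectorHelper<float>::set(1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f);
        //   const float  s = VectorHelper<float>::add(v);   // horizontal sum: 36.f
        //   const __m256 r = VectorHelper<float>::rsqrt(v); // approximate 1/sqrt per entry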

#undef Vc_OP1
#undef Vc_OP
#undef Vc_OP_
#undef Vc_OPx

}  // namespace AVX(2)
}  // namespace Vc

#endif // VC_AVX_VECTORHELPER_H_