#ifndef VC_AVX_VECTORHELPER_H_
#define VC_AVX_VECTORHELPER_H_

#include <limits>
#include "types.h"
#include "intrinsics.h"
#include "casts.h"
#include "../common/loadstoreflags.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
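// The VectorHelper<VectorType> specializations below select the store instruction at
// compile time from the load/store flags: aligned, unaligned, streaming (non-temporal),
// and the unaligned-and-streaming combination. The overloads taking an extra mask
// argument only write the lanes selected by that mask.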
template<> struct VectorHelper<__m256>
{
    typedef __m256 VectorType;
    typedef const VectorType VTArg;

    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_ps(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_ps(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_ps(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_ps()); }

    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
};

template<> struct VectorHelper<__m256d>
{
    typedef __m256d VectorType;
    typedef const VectorType VTArg;

    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_pd(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_pd(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_pd(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_pd()); }

    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
};

template<> struct VectorHelper<__m256i>
{
    typedef __m256i VectorType;
    typedef const VectorType VTArg;

    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfAligned = nullptr) { _mm256_store_si256(reinterpret_cast<__m256i *>(mem), x); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem), x); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfStreaming = nullptr) { _mm256_stream_si256(reinterpret_cast<__m256i *>(mem), x); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { AvxIntrinsics::stream_store(mem, x, setallone_si256()); }

    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if<!Flags::IsStreaming, void *>::type = nullptr) { _mm256_maskstore(mem, m, x); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VTArg x, VTArg m, typename std::enable_if< Flags::IsStreaming, void *>::type = nullptr) { AvxIntrinsics::stream_store(mem, x, m); }
};

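// Helper macros: Vc_CAT2 pastes the operation name together with the current Vc_SUFFIX
// (pd for double, ps for float) to form the matching intrinsic, e.g. Vc_OP1(sqrt) with
// Vc_SUFFIX == pd expands to a call to _mm256_sqrt_pd.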
#define Vc_OP1(op) \
    static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a) { return Vc_CAT2(_mm256_##op##_, Vc_SUFFIX)(a); }
#define Vc_OP(op) \
    static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(op##_ , Vc_SUFFIX)(a, b); }
#define Vc_OP_(op) \
    static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op , Vc_SUFFIX)(a, b); }
#define Vc_OPx(op, op2) \
    static Vc_INTRINSIC VectorType Vc_CONST op(VTArg a, VTArg b) { return Vc_CAT2(_mm256_##op2##_, Vc_SUFFIX)(a, b); }

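// Per-EntryType arithmetic helpers: broadcast/set, zero/one constants, element-wise
// add/sub/mul/min/max, an fma fallback, abs, round, and horizontal reductions.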
template<> struct VectorHelper<double> {
    typedef __m256d VectorType;
    typedef const VectorType VTArg;
    typedef double EntryType;
#define Vc_SUFFIX pd

    static Vc_ALWAYS_INLINE VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(_mm256_castps_pd(mask), a); }
    static Vc_ALWAYS_INLINE VectorType set(const double a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE VectorType set(const double a, const double b, const double c, const double d) {
        return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d);
    }
    static Vc_ALWAYS_INLINE VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }

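    // fma fallback without FMA4: split v1 and v2 into high and low mantissa halves so the
    // partial products are exact, then accumulate the partial products and v3 from the
    // smallest magnitude upwards to stay close to a single-rounding fused multiply-add.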
    static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
        v1 = _mm256_macc_pd(v1, v2, v3);
#else
        VectorType h1 = _mm256_and_pd(v1, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
        VectorType h2 = _mm256_and_pd(v2, _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(Vc_GCC) && Vc_GCC < 0x40703
        // Work around a miscompilation in GCC before 4.7.3: the empty asm acts as an
        // optimization barrier so h1 and h2 keep their exactly masked values.
        asm("":"+x"(h1), "+x"(h2));
#endif
        const VectorType l1 = _mm256_sub_pd(v1, h1);
        const VectorType l2 = _mm256_sub_pd(v2, h2);
        const VectorType ll = mul(l1, l2);
        const VectorType lh = add(mul(l1, h2), mul(h1, l2));
        const VectorType hh = mul(h1, h2);
        // b is the smaller (in magnitude) of lh and v3, c the larger; add the small terms first
        const VectorType lh_lt_v3 = cmplt_pd(abs(lh), abs(v3));
        const VectorType b = _mm256_blendv_pd(v3, lh, lh_lt_v3);
        const VectorType c = _mm256_blendv_pd(lh, v3, lh_lt_v3);
        v1 = add(add(ll, b), add(c, hh));
#endif
    }
0130
0131 static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_pd(a,b); }
0132 static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_pd(a,b); }
0133 static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_pd(a,b); }
0134
0135 Vc_OP1(sqrt)
0136 static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VTArg x) {
0137 return _mm256_div_pd(one(), sqrt(x));
0138 }
0139 static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
0140 return _mm256_div_pd(one(), x);
0141 }
0142 static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
0143 return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_pd());
0144 }
0145
0146 static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_pd(a, b); }
0147 static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_pd(a, b); }
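    // Horizontal reductions: fold the upper 128-bit lane into the lower one, then reduce
    // within the remaining SSE register and extract the scalar result.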
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
        __m128d b = _mm_min_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
        b = _mm_min_sd(b, _mm_unpackhi_pd(b, b));
        return _mm_cvtsd_f64(b);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
        __m128d b = _mm_max_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
        b = _mm_max_sd(b, _mm_unpackhi_pd(b, b));
        return _mm_cvtsd_f64(b);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
        __m128d b = _mm_mul_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
        b = _mm_mul_sd(b, _mm_shuffle_pd(b, b, _MM_SHUFFLE2(0, 1)));
        return _mm_cvtsd_f64(b);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
        __m128d b = _mm_add_pd(avx_cast<__m128d>(a), _mm256_extractf128_pd(a, 1));
        b = _mm_hadd_pd(b, b);
        return _mm_cvtsd_f64(b);
    }
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
        return _mm256_round_pd(a, _MM_FROUND_NINT);
    }
};

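// Same interface for float. rsqrt and reciprocal use the approximate AVX instructions;
// the fma fallback widens to double so the multiply-add is performed in higher precision.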
template<> struct VectorHelper<float> {
    typedef float EntryType;
    typedef __m256 VectorType;
    typedef const VectorType VTArg;
#define Vc_SUFFIX ps

    static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VTArg a, __m256 mask) { return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(mask, a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm256_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d,
                                                    const float e, const float f, const float g, const float h) {
        return Vc_CAT2(_mm256_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm256_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(setone_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST __m256 concat(__m256d a, __m256d b) { return _mm256_insertf128_ps(avx_cast<__m256>(_mm256_cvtpd_ps(a)), _mm256_cvtpd_ps(b), 1); }

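    // fma fallback without FMA4: convert each 128-bit half to double, do the multiply-add
    // in double precision, and convert the results back to single precision.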
    static inline void fma(VectorType &v1, VTArg v2, VTArg v3) {
#ifdef Vc_IMPL_FMA4
        v1 = _mm256_macc_ps(v1, v2, v3);
#else
        __m256d v1_0 = _mm256_cvtps_pd(lo128(v1));
        __m256d v1_1 = _mm256_cvtps_pd(hi128(v1));
        __m256d v2_0 = _mm256_cvtps_pd(lo128(v2));
        __m256d v2_1 = _mm256_cvtps_pd(hi128(v2));
        __m256d v3_0 = _mm256_cvtps_pd(lo128(v3));
        __m256d v3_1 = _mm256_cvtps_pd(hi128(v3));
        v1 = AVX::concat(
            _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
            _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
#endif
    }

    static Vc_INTRINSIC VectorType Vc_CONST add(VTArg a, VTArg b) { return _mm256_add_ps(a, b); }
    static Vc_INTRINSIC VectorType Vc_CONST sub(VTArg a, VTArg b) { return _mm256_sub_ps(a, b); }
    static Vc_INTRINSIC VectorType Vc_CONST mul(VTArg a, VTArg b) { return _mm256_mul_ps(a, b); }

    Vc_OP1(sqrt) Vc_OP1(rsqrt)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VTArg x) {
        return _mm256_rcp_ps(x);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(VTArg a) {
        return Vc_CAT2(_mm256_and_, Vc_SUFFIX)(a, setabsmask_ps());
    }

    static Vc_INTRINSIC VectorType Vc_CONST min(VTArg a, VTArg b) { return _mm256_min_ps(a, b); }
    static Vc_INTRINSIC VectorType Vc_CONST max(VTArg a, VTArg b) { return _mm256_max_ps(a, b); }
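    // Horizontal reductions for the eight float lanes: combine the two 128-bit halves,
    // then reduce within the SSE register down to a single scalar.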
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VTArg a) {
        __m128 b = _mm_min_ps(lo128(a), hi128(a));
        b = _mm_min_ps(b, _mm_movehl_ps(b, b));
        b = _mm_min_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtss_f32(b);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VTArg a) {
        __m128 b = _mm_max_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
        b = _mm_max_ps(b, _mm_movehl_ps(b, b));
        b = _mm_max_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtss_f32(b);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VTArg a) {
        __m128 b = _mm_mul_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
        b = _mm_mul_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
        b = _mm_mul_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
        return _mm_cvtss_f32(b);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VTArg a) {
        __m128 b = _mm_add_ps(avx_cast<__m128>(a), _mm256_extractf128_ps(a, 1));
        b = _mm_add_ps(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 1, 2, 3)));
        b = _mm_add_ss(b, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 2, 0, 1)));
        return _mm_cvtss_f32(b);
    }
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VTArg a) {
        return _mm256_round_ps(a, _MM_FROUND_NINT);
    }
};

#undef Vc_OP1
#undef Vc_OP
#undef Vc_OP_
#undef Vc_OPx

}  // namespace AVX
}  // namespace Vc_VERSIONED_NAMESPACE

#endif  // VC_AVX_VECTORHELPER_H_