// File indexing completed on 2025-01-31 10:25:46
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028 #ifndef VC_SSE_VECTORHELPER_H_
0029 #define VC_SSE_VECTORHELPER_H_
0030
0031 #include "types.h"
0032 #include "../common/loadstoreflags.h"
0033 #include <limits>
0034 #include "const_data.h"
0035 #include "macros.h"
0036
0037 namespace Vc_VERSIONED_NAMESPACE
0038 {
0039 namespace SSE
0040 {
0041 #define Vc_OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
0042 #define Vc_OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a) { return code; }
0043 #define Vc_OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b) { return code; }
0044 #define Vc_OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b, const VectorType c) { return code; }
0045
0046 template<> struct VectorHelper<__m128>
0047 {
0048 typedef __m128 VectorType;
0049
0050 template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_ps(x); }
0051 template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_ps(x); }
0052 template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
0053
0054 template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_ps(mem, x); }
0055 template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_ps(mem, x); }
0056 template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_ps(mem, x); }
0057 template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
0058
0059
0060 template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast<char *>(mem)); }
0061
0062 Vc_OP0(allone, _mm_setallone_ps())
0063 Vc_OP0(zero, _mm_setzero_ps())
0064 Vc_OP3(blend, blendv_ps(a, b, c))
0065 };
0066
0067
0068 template<> struct VectorHelper<__m128d>
0069 {
0070 typedef __m128d VectorType;
0071
0072 template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_pd(x); }
0073 template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_pd(x); }
0074 template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
0075
0076 template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_pd(mem, x); }
0077 template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_pd(mem, x); }
0078 template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_pd(mem, x); }
0079 template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
0080
0081
0082 template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast<char *>(mem)); }
0083
0084 Vc_OP0(allone, _mm_setallone_pd())
0085 Vc_OP0(zero, _mm_setzero_pd())
0086 Vc_OP3(blend, blendv_pd(a, b, c))
0087 };
0088
0089 template<> struct VectorHelper<__m128i>
0090 {
0091 typedef __m128i VectorType;
0092
0093 template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_si128(reinterpret_cast<const VectorType *>(x)); }
0094 template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_si128(reinterpret_cast<const VectorType *>(x)); }
0095 template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
0096
0097 template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_si128(reinterpret_cast<VectorType *>(mem), x); }
0098 template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_si128(reinterpret_cast<VectorType *>(mem), x); }
0099 template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_si128(reinterpret_cast<VectorType *>(mem), x); }
0100 template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
0101
0102
0103 template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(x, m, reinterpret_cast<char *>(mem)); }
0104
0105 Vc_OP0(allone, _mm_setallone_si128())
0106 Vc_OP0(zero, _mm_setzero_si128())
0107 Vc_OP3(blend, blendv_epi8(a, b, c))
0108 };
0109
0110 #undef Vc_OP1
0111 #undef Vc_OP2
0112 #undef Vc_OP3
0113
0114 #define Vc_OP1(op) \
0115 static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a) { return Vc_CAT2(_mm_##op##_, Vc_SUFFIX)(a); }
0116 #define Vc_OP(op) \
0117 static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op##_ , Vc_SUFFIX)(a, b); }
0118 #define Vc_OP_(op) \
0119 static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op , Vc_SUFFIX)(a, b); }
0120 #define Vc_OPx(op, op2) \
0121 static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op2##_, Vc_SUFFIX)(a, b); }
0122 #define Vc_OP_CAST_(op) \
0123 static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_castps_, Vc_SUFFIX)( \
0124 _mm_##op##ps(Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(a), \
0125 Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(b))); \
0126 }
0127 #define Vc_MINMAX \
0128 static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return Vc_CAT2(_mm_min_, Vc_SUFFIX)(a, b); } \
0129 static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return Vc_CAT2(_mm_max_, Vc_SUFFIX)(a, b); }
0130
0131 template<> struct VectorHelper<double> {
0132 typedef __m128d VectorType;
0133 typedef double EntryType;
0134 #define Vc_SUFFIX pd
0135
0136 Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
0137 static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_pd(mask), a); }
0138 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
0139 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b); }
0140 static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
0141 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
0142
0143 #ifdef Vc_IMPL_FMA4
0144 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
0145 v1 = _mm_macc_pd(v1, v2, v3);
0146 }
0147 #else
0148 static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
0149 VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
0150 VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
0151 #if defined(Vc_GCC) && Vc_GCC < 0x40703
0152
0153
0154 asm("":"+x"(h1), "+x"(h2));
0155 #endif
0156 const VectorType l1 = _mm_sub_pd(v1, h1);
0157 const VectorType l2 = _mm_sub_pd(v2, h2);
0158 const VectorType ll = mul(l1, l2);
0159 const VectorType lh = add(mul(l1, h2), mul(h1, l2));
0160 const VectorType hh = mul(h1, h2);
0161
0162 const VectorType lh_lt_v3 = _mm_cmplt_pd(abs(lh), abs(v3));
0163 const VectorType b = blendv_pd(v3, lh, lh_lt_v3);
0164 const VectorType c = blendv_pd(lh, v3, lh_lt_v3);
0165 v1 = add(add(ll, b), add(c, hh));
0166 }
0167 #endif
0168
0169 Vc_OP(add) Vc_OP(sub) Vc_OP(mul)
0170
0171 Vc_OP1(sqrt)
0172 static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) {
0173 return _mm_div_pd(one(), sqrt(x));
0174 }
0175 static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
0176 return _mm_div_pd(one(), x);
0177 }
0178 static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
0179 return _mm_cmpunord_pd(x, x);
0180 }
0181 static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
0182 return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x));
0183 }
0184 static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
0185 return _mm_castsi128_pd(cmpeq_epi64(_mm_castpd_si128(abs(x)), _mm_castpd_si128(_mm_load_pd(c_log<double>::d(1)))));
0186 }
0187 static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
0188 return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_pd());
0189 }
0190
0191 Vc_MINMAX
0192 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
0193 a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
0194 return _mm_cvtsd_f64(a);
0195 }
0196 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
0197 a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
0198 return _mm_cvtsd_f64(a);
0199 }
0200 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
0201 a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
0202 return _mm_cvtsd_f64(a);
0203 }
0204 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
0205 a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
0206 return _mm_cvtsd_f64(a);
0207 }
0208 #undef Vc_SUFFIX
0209 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
0210 #ifdef Vc_IMPL_SSE4_1
0211 return _mm_round_pd(a, _MM_FROUND_NINT);
0212 #else
0213
0214 return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
0215 #endif
0216 }
0217 };
0218
0219 template<> struct VectorHelper<float> {
0220 typedef float EntryType;
0221 typedef __m128 VectorType;
0222 #define Vc_SUFFIX ps
0223
0224 Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
0225 static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(mask, a); }
0226 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
0227 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
0228 static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
0229 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
0230 static Vc_ALWAYS_INLINE Vc_CONST __m128 concat(__m128d a, __m128d b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); }
0231
0232 #ifdef Vc_IMPL_FMA4
0233 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
0234 v1 = _mm_macc_ps(v1, v2, v3);
0235 }
0236 #else
0237 static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
0238 __m128d v1_0 = _mm_cvtps_pd(v1);
0239 __m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1));
0240 __m128d v2_0 = _mm_cvtps_pd(v2);
0241 __m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2));
0242 __m128d v3_0 = _mm_cvtps_pd(v3);
0243 __m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3));
0244 v1 = _mm_movelh_ps(
0245 _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)),
0246 _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1)));
0247 }
0248 #endif
0249
0250 Vc_OP(add) Vc_OP(sub) Vc_OP(mul)
0251
0252 Vc_OP1(sqrt) Vc_OP1(rsqrt)
0253 static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
0254 return _mm_cmpunord_ps(x, x);
0255 }
0256 static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
0257 return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x));
0258 }
0259 static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
0260 return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(abs(x)), _mm_castps_si128(_mm_load_ps(c_log<float>::d(1)))));
0261 }
0262 static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
0263 return _mm_rcp_ps(x);
0264 }
0265 static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
0266 return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_ps());
0267 }
0268
0269 Vc_MINMAX
0270 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
0271 a = _mm_min_ps(a, _mm_movehl_ps(a, a));
0272 a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
0273 return _mm_cvtss_f32(a);
0274 }
0275 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
0276 a = _mm_max_ps(a, _mm_movehl_ps(a, a));
0277 a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
0278 return _mm_cvtss_f32(a);
0279 }
0280 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
0281 a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
0282 a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
0283 return _mm_cvtss_f32(a);
0284 }
0285 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
0286 a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
0287 a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
0288 return _mm_cvtss_f32(a);
0289 }
0290 #undef Vc_SUFFIX
0291 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
0292 #ifdef Vc_IMPL_SSE4_1
0293 return _mm_round_ps(a, _MM_FROUND_NINT);
0294 #else
0295
0296 return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
0297 #endif
0298 }
0299 };
0300
0301 template<> struct VectorHelper<int> {
0302 typedef int EntryType;
0303 typedef __m128i VectorType;
0304 #define Vc_SUFFIX si128
0305
0306 Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
0307 static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
0308 static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
0309 #undef Vc_SUFFIX
0310 #define Vc_SUFFIX epi32
0311 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
0312
0313 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
0314 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
0315
0316 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
0317
0318 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
0319 return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
0320 }
0321 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
0322 return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
0323 }
0324 static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi32(a); }
0325 static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epi32(a, b); }
0326 static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epi32(a, b); }
0327 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
0328 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0329
0330 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0331 return _mm_cvtsi128_si32(a);
0332 }
0333 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
0334 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0335
0336 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0337 return _mm_cvtsi128_si32(a);
0338 }
0339 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
0340 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0341 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0342 return _mm_cvtsi128_si32(a);
0343 }
0344 #ifdef Vc_IMPL_SSE4_1
0345 static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); }
0346 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
0347 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0348 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0349 return _mm_cvtsi128_si32(a);
0350 }
0351 #else
0352 static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
0353 const VectorType aShift = _mm_srli_si128(a, 4);
0354 const VectorType ab02 = _mm_mul_epu32(a, b);
0355 const VectorType bShift = _mm_srli_si128(b, 4);
0356 const VectorType ab13 = _mm_mul_epu32(aShift, bShift);
0357 return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
0358 }
0359 #endif
0360
0361 Vc_OP(add) Vc_OP(sub)
0362 #undef Vc_SUFFIX
0363 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
0364 };
0365
0366 template<> struct VectorHelper<unsigned int> {
0367 typedef unsigned int EntryType;
0368 typedef __m128i VectorType;
0369 #define Vc_SUFFIX si128
0370 Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
0371 static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
0372 static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
0373
0374 #undef Vc_SUFFIX
0375 #define Vc_SUFFIX epu32
0376 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
0377
0378 static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu32(a, b); }
0379 static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu32(a, b); }
0380 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
0381 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0382
0383 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0384 return _mm_cvtsi128_si32(a);
0385 }
0386 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
0387 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0388
0389 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0390 return _mm_cvtsi128_si32(a);
0391 }
0392 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
0393 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0394
0395 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0396 return _mm_cvtsi128_si32(a);
0397 }
0398 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
0399 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0400
0401 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0402 return _mm_cvtsi128_si32(a);
0403 }
0404
0405 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
0406
0407 static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
0408 return VectorHelper<int>::mul(a, b);
0409 }
0410
0411
0412
0413
0414
0415
0416
0417
0418
0419
0420
0421
0422
0423
0424
0425
0426
0427
0428
0429 #undef Vc_SUFFIX
0430 #define Vc_SUFFIX epi32
0431 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
0432 return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
0433 }
0434 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
0435 return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
0436 }
0437 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
0438 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
0439
0440 Vc_OP(add) Vc_OP(sub)
0441 #undef Vc_SUFFIX
0442 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
0443 };
0444
0445 template<> struct VectorHelper<signed short> {
0446 typedef __m128i VectorType;
0447 typedef signed short EntryType;
0448 #define Vc_SUFFIX si128
0449
0450 Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
0451 static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
0452 static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
0453 static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packs_epi32(a, b); }
0454 static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); }
0455 static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); }
0456
0457 #undef Vc_SUFFIX
0458 #define Vc_SUFFIX epi16
0459 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
0460
0461 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
0462 return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
0463 }
0464 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
0465 return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
0466 }
0467 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
0468 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
0469 const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
0470 return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
0471 }
0472
0473 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
0474 v1 = add(mul(v1, v2), v3); }
0475
0476 static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi16(a); }
0477
0478 Vc_OPx(mul, mullo)
0479 Vc_OP(min) Vc_OP(max)
0480 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
0481
0482 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0483 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0484 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0485 return _mm_cvtsi128_si32(a);
0486 }
0487 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
0488
0489 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0490 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0491 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0492 return _mm_cvtsi128_si32(a);
0493 }
0494 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
0495 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0496 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0497 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0498 return _mm_cvtsi128_si32(a);
0499 }
0500 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
0501 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0502 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0503 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0504 return _mm_cvtsi128_si32(a);
0505 }
0506
0507 Vc_OP(add) Vc_OP(sub)
0508 #undef Vc_SUFFIX
0509 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
0510 };
0511
0512 template<> struct VectorHelper<unsigned short> {
0513 typedef __m128i VectorType;
0514 typedef unsigned short EntryType;
0515 #define Vc_SUFFIX si128
0516 Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
0517 static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
0518 static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
0519 #ifdef Vc_IMPL_SSE4_1
0520 static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); }
0521 #else
0522
0523 static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) {
0524 auto tmp0 = _mm_unpacklo_epi16(a, b);
0525 auto tmp1 = _mm_unpackhi_epi16(a, b);
0526 auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
0527 auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
0528 return _mm_unpacklo_epi16(tmp2, tmp3);
0529 }
0530 #endif
0531 static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_unpacklo_epi16(x, _mm_setzero_si128()); }
0532 static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); }
0533
0534 #undef Vc_SUFFIX
0535 #define Vc_SUFFIX epu16
0536 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
0537
0538
0539
0540
0541
0542
0543
0544
0545
0546
0547
0548
0549
0550
0551
0552
0553
0554
0555
0556 #if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || Vc_IMPL_SSE4_1
0557 static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu16(a, b); }
0558 static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu16(a, b); }
0559 #endif
0560 #undef Vc_SUFFIX
0561 #define Vc_SUFFIX epi16
0562 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
0563 return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
0564 }
0565 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
0566 return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
0567 }
0568
0569 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
0570
0571 Vc_OPx(mul, mullo)
0572 #if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(Vc_IMPL_SSE4_1)
0573 Vc_OP(min) Vc_OP(max)
0574 #endif
0575 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
0576
0577 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0578 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0579 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0580 return _mm_cvtsi128_si32(a);
0581 }
0582 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
0583
0584 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0585 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0586 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0587 return _mm_cvtsi128_si32(a);
0588 }
0589 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
0590
0591 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0592 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0593 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0594 return _mm_cvtsi128_si32(a);
0595 }
0596 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
0597
0598 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0599 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0600 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0601 return _mm_cvtsi128_si32(a);
0602 }
0603 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
0604 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c,
0605 const EntryType d, const EntryType e, const EntryType f,
0606 const EntryType g, const EntryType h) {
0607 return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
0608 }
0609
0610 Vc_OP(add) Vc_OP(sub)
0611 #undef Vc_SUFFIX
0612 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
0613 };
0614 #undef Vc_OP1
0615 #undef Vc_OP
0616 #undef Vc_OP_
0617 #undef Vc_OPx
0618 #undef Vc_OP_CAST_
0619 #undef Vc_MINMAX
0620
0621 }
0622 }
0623
0624 #include "vectorhelper.tcc"
0625
0626 #endif