#ifndef VC_SSE_VECTORHELPER_H_
#define VC_SSE_VECTORHELPER_H_

#include "types.h"
#include "../common/loadstoreflags.h"
#include <limits>
#include "const_data.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
#define Vc_OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
#define Vc_OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a) { return code; }
#define Vc_OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b) { return code; }
#define Vc_OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b, const VectorType c) { return code; }
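// Vc_OP0 through Vc_OP3 stamp out static, always-inlined member functions with
// zero to three VectorType parameters. For example,
//   Vc_OP3(blend, blendv_ps(a, b, c))
// expands to
//   static Vc_ALWAYS_INLINE Vc_CONST VectorType blend(const VectorType a,
//       const VectorType b, const VectorType c) { return blendv_ps(a, b, c); }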

template<> struct VectorHelper<__m128>
{
    typedef __m128 VectorType;

    template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_ps(x); }
    template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_ps(x); }
    template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }

    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_ps(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_ps(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_ps(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }

    // Masked store: _mm_maskmoveu_si128 writes only the bytes whose mask byte
    // has its MSB set, so elements deselected by m are left untouched in memory.
    template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast<char *>(mem)); }

    Vc_OP0(allone, _mm_setallone_ps())
    Vc_OP0(zero, _mm_setzero_ps())
    Vc_OP3(blend, blendv_ps(a, b, c))
};
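
// Minimal usage sketch (hypothetical caller, not part of this header): the
// Flags template parameter selects exactly one load/store overload at compile
// time via the EnableIf* member typedefs from common/loadstoreflags.h, so an
// aligned load compiles down to a single _mm_load_ps:
//
//   using Flags = decltype(Vc::Aligned);
//   __m128 v = VectorHelper<__m128>::load<Flags>(mem);   // -> _mm_load_ps
//   VectorHelper<__m128>::store<Flags>(mem, v);          // -> _mm_store_ps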

template<> struct VectorHelper<__m128d>
{
    typedef __m128d VectorType;

    template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_pd(x); }
    template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_pd(x); }
    template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }

    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_pd(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_pd(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_pd(mem, x); }
    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }

    // Masked store, the same byte-granular maskmove as the float variant above.
    template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast<char *>(mem)); }

    Vc_OP0(allone, _mm_setallone_pd())
    Vc_OP0(zero, _mm_setzero_pd())
    Vc_OP3(blend, blendv_pd(a, b, c))
};

template<> struct VectorHelper<__m128i>
{
    typedef __m128i VectorType;

    template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_si128(reinterpret_cast<const VectorType *>(x)); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_si128(reinterpret_cast<const VectorType *>(x)); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }

    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_si128(reinterpret_cast<VectorType *>(mem), x); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_si128(reinterpret_cast<VectorType *>(mem), x); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_si128(reinterpret_cast<VectorType *>(mem), x); }
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }

    // Masked store, byte-granular as above.
    template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(x, m, reinterpret_cast<char *>(mem)); }

    Vc_OP0(allone, _mm_setallone_si128())
    Vc_OP0(zero, _mm_setzero_si128())
    Vc_OP3(blend, blendv_epi8(a, b, c))
};

#undef Vc_OP1
#undef Vc_OP2
#undef Vc_OP3

#define Vc_OP1(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a) { return Vc_CAT2(_mm_##op##_, Vc_SUFFIX)(a); }
#define Vc_OP(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op##_, Vc_SUFFIX)(a, b); }
#define Vc_OP_(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op, Vc_SUFFIX)(a, b); }
#define Vc_OPx(op, op2) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op2##_, Vc_SUFFIX)(a, b); }
#define Vc_OP_CAST_(op) \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_castps_, Vc_SUFFIX)( \
            _mm_##op##ps(Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(a), \
                         Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(b))); \
        }
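// Vc_OP_CAST_ performs the bitwise operation in the float domain: the operands
// are cast to __m128, combined with _mm_and_ps/_mm_or_ps/_mm_xor_ps, and cast
// back. The casts compile to no instructions; only the execution domain of the
// bitwise op changes.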
#define Vc_MINMAX \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return Vc_CAT2(_mm_min_, Vc_SUFFIX)(a, b); } \
        static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return Vc_CAT2(_mm_max_, Vc_SUFFIX)(a, b); }

template<> struct VectorHelper<double> {
    typedef __m128d VectorType;
    typedef double EntryType;
#define Vc_SUFFIX pd

    Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_pd(mask), a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

#ifdef Vc_IMPL_FMA4
    static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
        v1 = _mm_macc_pd(v1, v2, v3);
    }
#else
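    // No FMA hardware: emulate fma(v1, v2, v3) = v1 * v2 + v3. Each factor is
    // split into a "high" part (upper mantissa bits, selected by
    // c_general::highMaskDouble) and the low remainder, so the partial products
    // hh, lh, and ll are computed (nearly) exactly; summing them together with
    // v3 in ascending magnitude keeps the rounding error close to a true FMA.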
    static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
        VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
        VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
#if defined(Vc_GCC) && Vc_GCC < 0x40703
        // GCC before 4.7.3 mis-optimizes the masking above; the empty asm makes
        // h1/h2 opaque so the subtractions below cannot be "simplified" away.
        asm("":"+x"(h1), "+x"(h2));
#endif
        const VectorType l1 = _mm_sub_pd(v1, h1);
        const VectorType l2 = _mm_sub_pd(v2, h2);
        const VectorType ll = mul(l1, l2);
        const VectorType lh = add(mul(l1, h2), mul(h1, l2));
        const VectorType hh = mul(h1, h2);
        // add the smaller of lh and v3 to ll first, the larger to hh, to reduce
        // cancellation and rounding error
        const VectorType lh_lt_v3 = _mm_cmplt_pd(abs(lh), abs(v3));
        const VectorType b = blendv_pd(v3, lh, lh_lt_v3);
        const VectorType c = blendv_pd(lh, v3, lh_lt_v3);
        v1 = add(add(ll, b), add(c, hh));
    }
#endif

    Vc_OP(add) Vc_OP(sub) Vc_OP(mul)

    Vc_OP1(sqrt)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) {
        return _mm_div_pd(one(), sqrt(x));
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
        return _mm_div_pd(one(), x);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
        return _mm_cmpunord_pd(x, x);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
        return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x));
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
        return _mm_castsi128_pd(cmpeq_epi64(_mm_castpd_si128(abs(x)), _mm_castpd_si128(_mm_load_pd(c_log<double>::d(1)))));
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
        return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_pd());
    }

    Vc_MINMAX
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
        a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
        return _mm_cvtsd_f64(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
        a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
        return _mm_cvtsd_f64(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
        a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
        return _mm_cvtsd_f64(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
        a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
        return _mm_cvtsd_f64(a);
    }
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
#ifdef Vc_IMPL_SSE4_1
        return _mm_round_pd(a, _MM_FROUND_NINT);
#else
        // SSE2 fallback: round-trip through int32. Uses the current rounding
        // mode (round-to-nearest-even by default), but is only valid while the
        // values fit into the int32 range.
        return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
#endif
    }
};

template<> struct VectorHelper<float> {
    typedef float EntryType;
    typedef __m128 VectorType;
#define Vc_SUFFIX ps

    Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(mask, a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST __m128 concat(__m128d a, __m128d b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); }

#ifdef Vc_IMPL_FMA4
    static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
        v1 = _mm_macc_ps(v1, v2, v3);
    }
#else
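    // No FMA hardware: widen to double per half. Two 24-bit float mantissas
    // multiply exactly into 53 bits, so mul+add in double followed by one
    // conversion back to float matches a true FMA except in rare
    // double-rounding cases.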
    static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
        __m128d v1_0 = _mm_cvtps_pd(v1);
        __m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1));
        __m128d v2_0 = _mm_cvtps_pd(v2);
        __m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2));
        __m128d v3_0 = _mm_cvtps_pd(v3);
        __m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3));
        v1 = _mm_movelh_ps(
                _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)),
                _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1)));
    }
#endif

    Vc_OP(add) Vc_OP(sub) Vc_OP(mul)

    Vc_OP1(sqrt) Vc_OP1(rsqrt)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
        return _mm_cmpunord_ps(x, x);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
        return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x));
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
        return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(abs(x)), _mm_castps_si128(_mm_load_ps(c_log<float>::d(1)))));
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
        return _mm_rcp_ps(x);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
        return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_ps());
    }

    Vc_MINMAX
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
        a = _mm_min_ps(a, _mm_movehl_ps(a, a));
        a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtss_f32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
        a = _mm_max_ps(a, _mm_movehl_ps(a, a));
        a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtss_f32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
        a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
        a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
        return _mm_cvtss_f32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
        a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
        a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
        return _mm_cvtss_f32(a);
    }
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
#ifdef Vc_IMPL_SSE4_1
        return _mm_round_ps(a, _MM_FROUND_NINT);
#else
        // SSE2 fallback: round-trip through int32, same caveats as the double
        // version above (current rounding mode, int32 range only).
        return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
#endif
    }
};

template<> struct VectorHelper<int> {
    typedef int EntryType;
    typedef __m128i VectorType;
#define Vc_SUFFIX si128

    Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
#undef Vc_SUFFIX
#define Vc_SUFFIX epi32
    static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }

    static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
        return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
        return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi32(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epi32(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epi32(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
        a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        // shufflelo swaps the two low 32-bit lanes more cheaply than a full shuffle
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
        a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        // shufflelo swaps the two low 32-bit lanes more cheaply than a full shuffle
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
        a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
#ifdef Vc_IMPL_SSE4_1
    static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
        a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
#else
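    // SSE2 has no 32-bit low multiply (_mm_mullo_epi32 is SSE4.1). The fallback
    // below uses _mm_mul_epu32, which multiplies the even 32-bit lanes into
    // 64-bit results: ab02 holds a0*b0 and a2*b2; ab13, after shifting the odd
    // lanes into even position, holds a1*b1 and a3*b3. The shuffles with
    // constant 8 (= _MM_SHUFFLE(0, 0, 2, 0)) compact the low 32 bits of each
    // product, and the final unpack restores the original lane order.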
    static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
        const VectorType aShift = _mm_srli_si128(a, 4);
        const VectorType ab02 = _mm_mul_epu32(a, b);
        const VectorType bShift = _mm_srli_si128(b, 4);
        const VectorType ab13 = _mm_mul_epu32(aShift, bShift);
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
    }
#endif

    Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};

template<> struct VectorHelper<unsigned int> {
    typedef unsigned int EntryType;
    typedef __m128i VectorType;
#define Vc_SUFFIX si128
    Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }

#undef Vc_SUFFIX
#define Vc_SUFFIX epu32
    static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

    static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu32(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu32(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
        // here and below: shufflelo swaps the two low 32-bit lanes more cheaply
        // than a full _mm_shuffle_epi32
        a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
        a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
        a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
        a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        return _mm_cvtsi128_si32(a);
    }

    static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

    // multiplication modulo 2^32 is identical for signed and unsigned operands
    static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
        return VectorHelper<int>::mul(a, b);
    }
#undef Vc_SUFFIX
#define Vc_SUFFIX epi32
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
        return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
        return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }

    Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};

template<> struct VectorHelper<signed short> {
    typedef __m128i VectorType;
    typedef signed short EntryType;
#define Vc_SUFFIX si128

    Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
    static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packs_epi32(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); }
    static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); }

#undef Vc_SUFFIX
#define Vc_SUFFIX epi16
    static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }

    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
        return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
        return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
                                                    const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
        return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
    }

    static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
        v1 = add(mul(v1, v2), v3);
    }

    static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi16(a); }

    Vc_OPx(mul, mullo)
    Vc_OP(min) Vc_OP(max)
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
        // fold 8 lanes -> 4 -> 2 -> 1; only the low lane matters at each step
        a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
        a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
        a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
        a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a);
    }

    Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};

template<> struct VectorHelper<unsigned short> {
    typedef __m128i VectorType;
    typedef unsigned short EntryType;
#define Vc_SUFFIX si128
    Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
    static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
#ifdef Vc_IMPL_SSE4_1
    static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); }
#else
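    // SSE2 fallback: a truncating pack. The unpack network below gathers the
    // low 16 bits of each 32-bit lane; unlike _mm_packus_epi32 it does not
    // saturate, which is fine as long as the inputs already fit into 16 bits.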
    static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) {
        auto tmp0 = _mm_unpacklo_epi16(a, b); // 0 4 X X 1 5 X X
        auto tmp1 = _mm_unpackhi_epi16(a, b); // 2 6 X X 3 7 X X
        auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1); // 0 2 4 6 X X X X
        auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1); // 1 3 5 7 X X X X
        return _mm_unpacklo_epi16(tmp2, tmp3); // 0 1 2 3 4 5 6 7
    }
#endif
    static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_unpacklo_epi16(x, _mm_setzero_si128()); }
    static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); }

#undef Vc_SUFFIX
#define Vc_SUFFIX epu16
    static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
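    // min_epu16/max_epu16 come from Vc's intrinsics layer: the plain
    // _mm_min_epu16/_mm_max_epu16 instructions only exist with SSE4.1, so
    // pre-SSE4.1 the wrappers emulate the unsigned compare. Defining
    // USE_INCORRECT_UNSIGNED_COMPARE instead selects the cheaper signed epi16
    // min/max below, which is wrong for values >= 0x8000.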
#if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || Vc_IMPL_SSE4_1
    static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu16(a, b); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu16(a, b); }
#endif
#undef Vc_SUFFIX
#define Vc_SUFFIX epi16
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
        return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
        return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
    }

    static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }

    Vc_OPx(mul, mullo)
#if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(Vc_IMPL_SSE4_1)
    Vc_OP(min) Vc_OP(max)
#endif
    static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
        // fold 8 lanes -> 4 -> 2 -> 1, as in the signed short reductions above
        a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
        a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
        a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
        a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
        a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
        return _mm_cvtsi128_si32(a);
    }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
    static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c,
                                                    const EntryType d, const EntryType e, const EntryType f,
                                                    const EntryType g, const EntryType h) {
        return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
    }

    Vc_OP(add) Vc_OP(sub)
#undef Vc_SUFFIX
    static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
};
#undef Vc_OP1
#undef Vc_OP
#undef Vc_OP_
#undef Vc_OPx
#undef Vc_OP_CAST_
#undef Vc_MINMAX

}  // namespace SSE
}  // namespace Vc_VERSIONED_NAMESPACE

#include "vectorhelper.tcc"

#endif // VC_SSE_VECTORHELPER_H_