#ifndef VC_SSE_INTRINSICS_H_
#define VC_SSE_INTRINSICS_H_

#ifdef Vc_MSVC
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

#include "../common/storage.h"
#include "const_data.h"
#include <cstdlib>
#include "types.h"
#include "debug.h"

#if defined(Vc_GCC) && !defined(__OPTIMIZE__)
// Unoptimized GCC builds expand the intrinsics macros with old-style casts; silence the
// resulting warnings while this header is parsed.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
using SSE::c_general;

constexpr std::size_t VectorAlignment = 16;

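// Re-implement the vector-count shift intrinsics via inline asm for old GCC, presumably
// to dodge a codegen bug in GCC < 4.6 (the version check is the only documentation we
// have). Each function simply emits the corresponding native instruction; define
// Vc_DONT_FIX_SSE_SHIFT to keep the compiler's own intrinsics instead.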
#if defined(Vc_GCC) && Vc_GCC < 0x40600 && !defined(Vc_DONT_FIX_SSE_SHIFT)
static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; }
static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; }
#endif

#ifdef Vc_GCC
// With GCC, express the basic float/double arithmetic through the builtin vector types
// so the optimizer can see through these operations (constant folding, reassociation)
// instead of treating the intrinsics as opaque.
static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); }
static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); }
static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); }
static Vc_INTRINSIC Vc_CONST __m128  _mm_mul_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); }
static Vc_INTRINSIC Vc_CONST __m128  _mm_add_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); }
static Vc_INTRINSIC Vc_CONST __m128  _mm_sub_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); }
#endif

static Vc_INTRINSIC Vc_CONST __m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST __m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
static Vc_INTRINSIC Vc_CONST __m128  _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }

static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16() { return _mm_setone_epi16(); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); }
static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32() { return _mm_setone_epi32(); }

static Vc_INTRINSIC __m128  Vc_CONST _mm_setone_ps() { return _mm_load_ps(c_general::oneFloat); }
static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd() { return _mm_load_pd(c_general::oneDouble); }

static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::absMaskDouble)); }
static Vc_INTRINSIC __m128  Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::absMaskFloat)); }
static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::signMaskDouble)); }
static Vc_INTRINSIC __m128  Vc_CONST _mm_setsignmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::signMaskFloat)); }

static Vc_INTRINSIC __m128i Vc_CONST setmin_epi8 () { return _mm_set1_epi8(-0x80); }
static Vc_INTRINSIC __m128i Vc_CONST setmin_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::minShort)); }
// signMaskFloat is the 0x80000000 pattern, which is also the INT_MIN bit pattern.
static Vc_INTRINSIC __m128i Vc_CONST setmin_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::signMaskFloat)); }

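// Unsigned integer compares: XOP provides them natively; otherwise flip the sign bit of
// both operands (x ^ 0x80... maps unsigned order onto signed order) and use the signed
// compare.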
#if defined(Vc_IMPL_XOP)
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8 (__m128i a, __m128i b) { return _mm_comgt_epu8 (a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b) { return _mm_comlt_epu16(a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b) { return _mm_comgt_epu16(a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b) { return _mm_comlt_epu32(a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b) { return _mm_comgt_epu32(a, b); }
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu64(__m128i a, __m128i b) { return _mm_comlt_epu64(a, b); }
#else
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi8(_mm_xor_si128(a, setmin_epi8()),
                          _mm_xor_si128(b, setmin_epi8()));
}
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b)
{
    return _mm_cmplt_epi16(_mm_xor_si128(a, setmin_epi16()),
                           _mm_xor_si128(b, setmin_epi16()));
}
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi16(_mm_xor_si128(a, setmin_epi16()),
                           _mm_xor_si128(b, setmin_epi16()));
}
static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b)
{
    return _mm_cmplt_epi32(_mm_xor_si128(a, setmin_epi32()),
                           _mm_xor_si128(b, setmin_epi32()));
}
static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b)
{
    return _mm_cmpgt_epi32(_mm_xor_si128(a, setmin_epi32()),
                           _mm_xor_si128(b, setmin_epi32()));
}
Vc_INTRINSIC __m128i Vc_CONST cmpgt_epi64(__m128i a, __m128i b)
{
#ifdef Vc_IMPL_SSE4_2
    return _mm_cmpgt_epi64(a, b);
#else
    // Emulate a signed 64-bit compare with 32-bit compares. The xor flips the sign bit
    // of the low dword of each qword (the mask is 0x0000000080000000 per qword), which
    // turns the signed 32-bit compare on the low dwords into an unsigned one; the high
    // dwords keep their signed interpretation.
    const auto aa = _mm_xor_si128(a, _mm_srli_epi64(setmin_epi32(), 32));
    const auto bb = _mm_xor_si128(b, _mm_srli_epi64(setmin_epi32(), 32));
    const auto gt = _mm_cmpgt_epi32(aa, bb);
    const auto eq = _mm_cmpeq_epi32(aa, bb);
    // a > b iff the high dwords compare greater, or the high dwords compare equal and
    // the low dwords compare greater (unsigned). gt2 broadcasts the high-dword result
    // across each qword (shuffle 0xf5 selects elements [1,1,3,3]).
    const auto gt2 =
        _mm_shuffle_epi32(gt, 0xf5);
    // lo computes (high dwords equal) & (low dwords greater) and broadcasts it across
    // each qword (shuffle 0xa0 selects elements [0,0,2,2]).
    const auto lo =
        _mm_shuffle_epi32(_mm_and_si128(_mm_srli_epi64(eq, 32), gt), 0xa0);
    return _mm_or_si128(gt2, lo);
#endif
}
#endif
}  // namespace SseIntrinsics
}  // namespace Vc_VERSIONED_NAMESPACE

#ifdef Vc_IMPL_SSSE3
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
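// SSSE3 provides abs and alignr natively (pabsb/pabsw/pabsd, palignr).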
Vc_INTRINSIC Vc_CONST __m128i abs_epi8 (__m128i a) { return _mm_abs_epi8 (a); }
Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) { return _mm_abs_epi16(a); }
Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) { return _mm_abs_epi32(a); }
template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
{
    return _mm_alignr_epi8(a, b, s & 0x1fu);
}
}  // namespace SseIntrinsics
}  // namespace Vc_VERSIONED_NAMESPACE

#else

namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
Vc_INTRINSIC Vc_CONST __m128i abs_epi8(__m128i a) {
    __m128i negative = _mm_cmplt_epi8(a, _mm_setzero_si128());
    // Two's complement abs: for negative a, a ^ -1 == ~a == -a - 1, so adding 1 yields
    // -a; for non-negative a both the xor and the addend are no-ops. The addend is
    // (negative & 1) because there is no 8-bit shift to extract the sign bit the way
    // the 16- and 32-bit variants below do.
    return _mm_add_epi8(_mm_xor_si128(a, negative), _mm_and_si128(negative, _mm_set1_epi8(1)));
}

Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) {
    __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128());
    return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15));
}
Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) {
    __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128());
    return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31));
}
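// Fallback for SSSE3 palignr: shift the 32-byte concatenation [a:b] right by s bytes.
// Every constant shift count gets its own case because the byte-shift intrinsics
// require compile-time immediates.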
template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
{
    switch (s & 0x1fu) {
    case  0: return b;
    case  1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b,  1));
    case  2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b,  2));
    case  3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b,  3));
    case  4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b,  4));
    case  5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b,  5));
    case  6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b,  6));
    case  7: return _mm_or_si128(_mm_slli_si128(a,  9), _mm_srli_si128(b,  7));
    case  8: return _mm_or_si128(_mm_slli_si128(a,  8), _mm_srli_si128(b,  8));
    case  9: return _mm_or_si128(_mm_slli_si128(a,  7), _mm_srli_si128(b,  9));
    case 10: return _mm_or_si128(_mm_slli_si128(a,  6), _mm_srli_si128(b, 10));
    case 11: return _mm_or_si128(_mm_slli_si128(a,  5), _mm_srli_si128(b, 11));
    case 12: return _mm_or_si128(_mm_slli_si128(a,  4), _mm_srli_si128(b, 12));
    case 13: return _mm_or_si128(_mm_slli_si128(a,  3), _mm_srli_si128(b, 13));
    case 14: return _mm_or_si128(_mm_slli_si128(a,  2), _mm_srli_si128(b, 14));
    case 15: return _mm_or_si128(_mm_slli_si128(a,  1), _mm_srli_si128(b, 15));
    case 16: return a;
    case 17: return _mm_srli_si128(a,  1);
    case 18: return _mm_srli_si128(a,  2);
    case 19: return _mm_srli_si128(a,  3);
    case 20: return _mm_srli_si128(a,  4);
    case 21: return _mm_srli_si128(a,  5);
    case 22: return _mm_srli_si128(a,  6);
    case 23: return _mm_srli_si128(a,  7);
    case 24: return _mm_srli_si128(a,  8);
    case 25: return _mm_srli_si128(a,  9);
    case 26: return _mm_srli_si128(a, 10);
    case 27: return _mm_srli_si128(a, 11);
    case 28: return _mm_srli_si128(a, 12);
    case 29: return _mm_srli_si128(a, 13);
    case 30: return _mm_srli_si128(a, 14);
    case 31: return _mm_srli_si128(a, 15);
    }
    return _mm_setzero_si128();  // unreachable; silences missing-return warnings
}
}  // namespace SseIntrinsics
}  // namespace Vc_VERSIONED_NAMESPACE
#endif

#ifdef Vc_IMPL_SSE4_1
namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
// Thin wrappers over the SSE4.1 intrinsics; the non-SSE4.1 branch below emulates them.
Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b) { return _mm_cmpeq_epi64(a, b); }
template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v) { return _mm_extract_epi32(v, index); }
Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c) { return _mm_blendv_pd(a, b, c); }
Vc_INTRINSIC Vc_CONST __m128  blendv_ps(__m128  a, __m128  b, __m128  c) { return _mm_blendv_ps(a, b, c); }
Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c) { return _mm_blendv_epi8(a, b, c); }
template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b) { return _mm_blend_pd(a, b, mask); }
template <int mask> Vc_INTRINSIC Vc_CONST __m128  blend_ps(__m128  a, __m128  b) { return _mm_blend_ps(a, b, mask); }
template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b) { return _mm_blend_epi16(a, b, mask); }
Vc_INTRINSIC Vc_CONST __m128i max_epi8 (__m128i a, __m128i b) { return _mm_max_epi8 (a, b); }
Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b) { return _mm_max_epi32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b) { return _mm_max_epu16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b) { return _mm_max_epu32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b) { return _mm_min_epu16(a, b); }
Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b) { return _mm_min_epu32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i min_epi8 (__m128i a, __m128i b) { return _mm_min_epi8 (a, b); }
Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b) { return _mm_min_epi32(a, b); }
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16 (__m128i epu8 ) { return _mm_cvtepu8_epi16 (epu8 ); }
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16 (__m128i epi8 ) { return _mm_cvtepi8_epi16 (epi8 ); }
Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16) { return _mm_cvtepu16_epi32(epu16); }
Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epi16) { return _mm_cvtepi16_epi32(epi16); }
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32 (__m128i epu8 ) { return _mm_cvtepu8_epi32 (epu8 ); }
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32 (__m128i epi8 ) { return _mm_cvtepi8_epi32 (epi8 ); }
}  // namespace SseIntrinsics
}  // namespace Vc_VERSIONED_NAMESPACE
#else

namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
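// SSE2 fallback for pcmpeqq: compare per 32-bit lane, then AND each dword with its
// neighbour (shuffle 0xB1 swaps the dwords within each qword), so a 64-bit lane is
// all-ones only if both of its dwords compared equal.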
Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b) {
    auto tmp = _mm_cmpeq_epi32(a, b);
    return _mm_and_si128(tmp, _mm_shuffle_epi32(tmp, 1*1 + 0*4 + 3*16 + 2*64));
}
template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
{
#ifdef Vc_USE_BUILTIN_VECTOR_TYPES
    typedef int int32v4 __attribute__((__vector_size__(16)));
    return aliasing_cast<int32v4>(v)[index];
#else
    return _mm_cvtsi128_si32(_mm_srli_si128(v, index * 4));
#endif
}
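// blendv fallbacks: bitwise select, taking b where the mask c is set and a elsewhere.
// Unlike SSE4.1 pblendvb, which only reads the top bit per element, this uses every
// mask bit, so c must be a full true/false mask per element.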
Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c) {
#ifdef Vc_GCC
    return reinterpret_cast<__m128d>(
        (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
        (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
#else
    return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b));
#endif
}
Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c) {
#ifdef Vc_GCC
    return reinterpret_cast<__m128>(
        (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
        (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
#else
    return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b));
#endif
}
Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c) {
#ifdef Vc_GCC
    return (~c & a) | (c & b);
#else
    return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
#endif
}

// The blend fallbacks below are only meaningful with a compile-time-constant mask;
// under optimization each switch collapses to the single selected case.
template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
{
    switch (mask) {
    case 0x0: return a;
    case 0x1: return _mm_shuffle_pd(b, a, 2);
    case 0x2: return _mm_shuffle_pd(a, b, 2);
    case 0x3: return b;
    default:
        abort();
        return a;  // unreachable
    }
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
{
    __m128i c;
    switch (mask) {
    case 0x0: return a;
    case 0x1: c = _mm_srli_si128(_mm_setallone_si128(), 12); break;
    case 0x2: c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4); break;
    case 0x3: c = _mm_srli_si128(_mm_setallone_si128(), 8); break;
    case 0x4: c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8); break;
    case 0x5: c = _mm_set_epi32(0, -1, 0, -1); break;
    case 0x6: c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4); break;
    case 0x7: c = _mm_srli_si128(_mm_setallone_si128(), 4); break;
    case 0x8: c = _mm_slli_si128(_mm_setallone_si128(), 12); break;
    case 0x9: c = _mm_set_epi32(-1, 0, 0, -1); break;
    case 0xa: c = _mm_set_epi32(-1, 0, -1, 0); break;
    case 0xb: c = _mm_set_epi32(-1, 0, -1, -1); break;
    case 0xc: c = _mm_slli_si128(_mm_setallone_si128(), 8); break;
    case 0xd: c = _mm_set_epi32(-1, -1, 0, -1); break;
    case 0xe: c = _mm_slli_si128(_mm_setallone_si128(), 4); break;
    case 0xf: return b;
    default:
        abort();
        c = _mm_setzero_si128();
        break;
    }
    __m128 _c = _mm_castsi128_ps(c);
    return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
}
template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
{
    __m128i c;
    switch (mask) {
    case 0x00: return a;
    case 0x01: c = _mm_srli_si128(_mm_setallone_si128(), 14); break;
    case 0x03: c = _mm_srli_si128(_mm_setallone_si128(), 12); break;
    case 0x07: c = _mm_srli_si128(_mm_setallone_si128(), 10); break;
    case 0x0f: return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
    case 0x1f: c = _mm_srli_si128(_mm_setallone_si128(), 6); break;
    case 0x3f: c = _mm_srli_si128(_mm_setallone_si128(), 4); break;
    case 0x7f: c = _mm_srli_si128(_mm_setallone_si128(), 2); break;
    case 0x80: c = _mm_slli_si128(_mm_setallone_si128(), 14); break;
    case 0xc0: c = _mm_slli_si128(_mm_setallone_si128(), 12); break;
    case 0xe0: c = _mm_slli_si128(_mm_setallone_si128(), 10); break;
    case 0xf0: c = _mm_slli_si128(_mm_setallone_si128(), 8); break;
    case 0xf8: c = _mm_slli_si128(_mm_setallone_si128(), 6); break;
    case 0xfc: c = _mm_slli_si128(_mm_setallone_si128(), 4); break;
    case 0xfe: c = _mm_slli_si128(_mm_setallone_si128(), 2); break;
    case 0xff: return b;
    case 0xcc: return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
    case 0x33: return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
    default:
        // Generic case: broadcast each mask bit to a full 16-bit lane by moving it into
        // the sign position (mullo by a power of two) and arithmetic-shifting it down.
        const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
        c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
        break;
    }
    return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
}

Vc_INTRINSIC Vc_CONST __m128i max_epi8 (__m128i a, __m128i b) { return blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b)); }
Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b) { return blendv_epi8(b, a, _mm_cmpgt_epi32(a, b)); }
Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b) { return blendv_epi8(b, a, cmpgt_epu16(a, b)); }
Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b) { return blendv_epi8(b, a, cmpgt_epu32(a, b)); }
Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b) { return blendv_epi8(a, b, cmpgt_epu16(a, b)); }
Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b) { return blendv_epi8(a, b, cmpgt_epu32(a, b)); }
Vc_INTRINSIC Vc_CONST __m128i min_epi8 (__m128i a, __m128i b) { return blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b)); }
Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b) { return blendv_epi8(a, b, _mm_cmpgt_epi32(a, b)); }
// Widening conversions: zero-extend by unpacking with zeros, sign-extend by unpacking
// with the comparison mask (a < 0), which is exactly the sign extension of each lane.
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8) {
    return _mm_unpacklo_epi8(epu8, _mm_setzero_si128());
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8) {
    return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128()));
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16) {
    return _mm_unpacklo_epi16(epu16, _mm_setzero_si128());
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epi16) {
    return _mm_unpacklo_epi16(epi16, _mm_cmplt_epi16(epi16, _mm_setzero_si128()));
}
Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8) {
    return cvtepu16_epi32(cvtepu8_epi16(epu8));
}
Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8) {
    const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128());
    const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg);
    return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg));
}
}  // namespace SseIntrinsics
}  // namespace Vc_VERSIONED_NAMESPACE
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace SseIntrinsics
{
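// Streaming loads: with SSE4.1 use movntdqa (requires a 16-byte aligned address; the
// non-temporal hint only takes effect on write-combining memory). Without SSE4.1 these
// degrade to ordinary aligned loads.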
static Vc_INTRINSIC Vc_PURE __m128 _mm_stream_load(const float *mem) {
#ifdef Vc_IMPL_SSE4_1
    return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
#else
    return _mm_load_ps(mem);
#endif
}
static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) {
#ifdef Vc_IMPL_SSE4_1
    return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
#else
    return _mm_load_pd(mem);
#endif
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) {
#ifdef Vc_IMPL_SSE4_1
    return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem)));
#else
    return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
#endif
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}
static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) {
    return _mm_stream_load(reinterpret_cast<const int *>(mem));
}

#ifndef __x86_64__
// _mm_cvtsi64_si128 is only available in 64-bit mode; on 32-bit targets emulate it with
// a 64-bit scalar load into the low qword.
Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
    return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
}
#endif

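// AVX2 adds 128-bit gather instructions; expose the plain and masked forms so SSE-width
// code can use the gather hardware directly.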
#ifdef Vc_IMPL_AVX2
template <int Scale> __m128 gather(const float *addr, __m128i idx)
{
    return _mm_i32gather_ps(addr, idx, Scale);
}
template <int Scale> __m128d gather(const double *addr, __m128i idx)
{
    return _mm_i32gather_pd(addr, idx, Scale);
}
template <int Scale> __m128i gather(const int *addr, __m128i idx)
{
    return _mm_i32gather_epi32(addr, idx, Scale);
}
template <int Scale> __m128i gather(const unsigned *addr, __m128i idx)
{
    return _mm_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
}

// Masked forms: elements whose mask bit is unset keep the value from src.
template <int Scale> __m128 gather(__m128 src, __m128 k, const float *addr, __m128i idx)
{
    return _mm_mask_i32gather_ps(src, addr, idx, k, Scale);
}
template <int Scale>
__m128d gather(__m128d src, __m128d k, const double *addr, __m128i idx)
{
    return _mm_mask_i32gather_pd(src, addr, idx, k, Scale);
}
template <int Scale> __m128i gather(__m128i src, __m128i k, const int *addr, __m128i idx)
{
    return _mm_mask_i32gather_epi32(src, addr, idx, k, Scale);
}
template <int Scale>
__m128i gather(__m128i src, __m128i k, const unsigned *addr, __m128i idx)
{
    return _mm_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
}
#endif

}  // namespace SseIntrinsics
}  // namespace Vc_VERSIONED_NAMESPACE

namespace Vc_VERSIONED_NAMESPACE
{
namespace SSE
{
using namespace SseIntrinsics;

template <typename T> struct ParameterHelper
{
    typedef T ByValue;
    typedef T &Reference;
    typedef const T &ConstRef;
};

template <typename T> struct VectorHelper
{
};

// Map the entry type to the SSE register type: __m128d for double, __m128 for float,
// and __m128i for all integral types.
template <typename T> struct VectorTypeHelper
{
    typedef __m128i Type;
};
template <> struct VectorTypeHelper<double>
{
    typedef __m128d Type;
};
template <> struct VectorTypeHelper<float>
{
    typedef __m128 Type;
};

template <typename T> struct DetermineGatherMask
{
    typedef T Type;
};

template <typename T> struct VectorTraits
{
    typedef typename VectorTypeHelper<T>::Type VectorType;
    using EntryType = T;
    static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
    typedef Mask<T> MaskType;
    typedef typename DetermineGatherMask<MaskType>::Type GatherMaskType;
    typedef Common::VectorMemoryUnion<VectorType, EntryType> StorageType;
};

template <typename T> struct VectorHelperSize;
}  // namespace SSE
}  // namespace Vc_VERSIONED_NAMESPACE

#if defined(Vc_GCC) && !defined(__OPTIMIZE__)
#pragma GCC diagnostic pop
#endif

#include "shuffle.h"

#endif  // VC_SSE_INTRINSICS_H_