#ifndef VC_AVX_DETAIL_H_
#define VC_AVX_DETAIL_H_

#include "../sse/detail.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{

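// Flag-dispatched loads
// The load() overloads below pick the load instruction at compile time via the Flags
// type: EnableIfAligned selects the aligned _mm256_load_*, EnableIfUnaligned the
// unaligned _mm256_loadu_*, and EnableIfStreaming a non-temporal stream load.
// Illustrative call only (the exact Flags/LoadTag spelling at call sites may differ):
//   __m256 v = Detail::load(ptr, Vc::Aligned, LoadTag<__m256, float>());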
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
                                 typename Flags::EnableIfAligned = nullptr)
{
    return _mm256_load_ps(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
                                 typename Flags::EnableIfUnaligned = nullptr)
{
    return _mm256_loadu_ps(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
                                 typename Flags::EnableIfStreaming = nullptr)
{
    return AvxIntrinsics::stream_load<__m256>(x);
}

template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
                                  typename Flags::EnableIfAligned = nullptr)
{
    return _mm256_load_pd(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
                                  typename Flags::EnableIfUnaligned = nullptr)
{
    return _mm256_loadu_pd(x);
}
template <typename Flags>
Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
                                  typename Flags::EnableIfStreaming = nullptr)
{
    return AvxIntrinsics::stream_load<__m256d>(x);
}

template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
Vc_INTRINSIC Vc_PURE __m256i
load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfAligned = nullptr)
{
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(x));
}
template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
Vc_INTRINSIC Vc_PURE __m256i
load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfUnaligned = nullptr)
{
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(x));
}
template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
Vc_INTRINSIC Vc_PURE __m256i
load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfStreaming = nullptr)
{
    return AvxIntrinsics::stream_load<__m256i>(x);
}

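// load32: 32-byte loads dispatched on the when_aligned / when_unaligned / when_streaming
// tag argument instead of a member typedef of Flags.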
Vc_INTRINSIC __m256 load32(const float *mem, when_aligned)
{
    return _mm256_load_ps(mem);
}
Vc_INTRINSIC __m256 load32(const float *mem, when_unaligned)
{
    return _mm256_loadu_ps(mem);
}
Vc_INTRINSIC __m256 load32(const float *mem, when_streaming)
{
    return AvxIntrinsics::stream_load<__m256>(mem);
}
Vc_INTRINSIC __m256d load32(const double *mem, when_aligned)
{
    return _mm256_load_pd(mem);
}
Vc_INTRINSIC __m256d load32(const double *mem, when_unaligned)
{
    return _mm256_loadu_pd(mem);
}
Vc_INTRINSIC __m256d load32(const double *mem, when_streaming)
{
    return AvxIntrinsics::stream_load<__m256d>(mem);
}
template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_aligned)
{
    static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}
template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_unaligned)
{
    static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}
template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_streaming)
{
    static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
    return AvxIntrinsics::stream_load<__m256i>(mem);
}

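// The block below appears to restate the constrained load() overloads with explicit
// enable_if default arguments, presumably to help MSVC's overload resolution.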
#ifdef Vc_MSVC

Vc_INTRINSIC __m256i load(const uint *mem, when_aligned, LoadTag<__m256i, int>)
{
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}

Vc_INTRINSIC __m256d load(const double *mem, when_unaligned, LoadTag<__m256d, double>)
{
    return _mm256_loadu_pd(mem);
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256 load(const float *mem, when_aligned,
                         enable_if<(std::is_same<DstT, float>::value &&
                                    std::is_same<V, __m256>::value)> = nullarg)
{
    return _mm256_load_ps(mem);
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256 load(const float *mem, when_unaligned,
                         enable_if<(std::is_same<DstT, float>::value &&
                                    std::is_same<V, __m256>::value)> = nullarg)
{
    return _mm256_loadu_ps(mem);
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256 load(const float *mem, when_streaming,
                         enable_if<(std::is_same<DstT, float>::value &&
                                    std::is_same<V, __m256>::value)> = nullarg)
{
    return AvxIntrinsics::stream_load<__m256>(mem);
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256d load(const double *mem, when_aligned,
                          enable_if<(std::is_same<DstT, double>::value &&
                                     std::is_same<V, __m256d>::value)> = nullarg)
{
    return _mm256_load_pd(mem);
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256d load(const double *mem, when_unaligned,
                          enable_if<(std::is_same<DstT, double>::value &&
                                     std::is_same<V, __m256d>::value)> = nullarg)
{
    return _mm256_loadu_pd(mem);
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256d load(const double *mem, when_streaming,
                          enable_if<(std::is_same<DstT, double>::value &&
                                     std::is_same<V, __m256d>::value)> = nullarg)
{
    return AvxIntrinsics::stream_load<__m256d>(mem);
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const uint *mem, when_aligned,
                          enable_if<(std::is_same<DstT, uint>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const uint *mem, when_unaligned,
                          enable_if<(std::is_same<DstT, uint>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const uint *mem, when_streaming,
                          enable_if<(std::is_same<DstT, uint>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    return AvxIntrinsics::stream_load<__m256i>(mem);
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const int *mem, when_unaligned,
                          enable_if<(std::is_same<DstT, int>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const int *mem, when_aligned,
                          enable_if<(std::is_same<DstT, int>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const int *mem, when_streaming,
                          enable_if<(std::is_same<DstT, int>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    return AvxIntrinsics::stream_load<__m256i>(mem);
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const short *mem, when_unaligned,
                          enable_if<(std::is_same<DstT, short>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const short *mem, when_aligned,
                          enable_if<(std::is_same<DstT, short>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const short *mem, when_streaming,
                          enable_if<(std::is_same<DstT, short>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    return AvxIntrinsics::stream_load<__m256i>(mem);
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const ushort *mem, when_unaligned,
                          enable_if<(std::is_same<DstT, ushort>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const ushort *mem, when_aligned,
                          enable_if<(std::is_same<DstT, ushort>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
}

template <typename V, typename DstT>
Vc_INTRINSIC __m256i load(const ushort *mem, when_streaming,
                          enable_if<(std::is_same<DstT, ushort>::value &&
                                     std::is_same<V, __m256i>::value)> = nullarg)
{
    return AvxIntrinsics::stream_load<__m256i>(mem);
}

#endif

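// Converting loads: the in-memory type is narrower than (or different from) the vector's
// entry type, so the data is loaded at its natural width and widened afterwards
// (e.g. uchar -> short via cvtepu8_epi16, short -> int via cvtepi16_epi32,
// int -> double via convert<int, double>).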
template <typename Flags>
Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, short>)
{
    return load32(mem, f);
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, short>)
{
    return AVX::cvtepu8_epi16(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const schar *mem, Flags f, LoadTag<__m256i, short>)
{
    return AVX::cvtepi8_epi16(load16(mem, f));
}

template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, ushort>)
{
    return AVX::cvtepu8_epi16(load16(mem, f));
}

template <typename Flags>
Vc_INTRINSIC __m256i load(const uint *mem, Flags f, LoadTag<__m256i, int>)
{
    return load32(mem, f);
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, int>)
{
    return AVX::cvtepu16_epi32(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const short *mem, Flags f, LoadTag<__m256i, int>)
{
    return AVX::cvtepi16_epi32(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, int>)
{
    return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const schar *mem, Flags, LoadTag<__m256i, int>)
{
    return AVX::cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}

template <typename Flags>
Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, uint>)
{
    return AVX::cvtepu16_epi32(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, uint>)
{
    return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
}

template <typename Flags>
Vc_INTRINSIC __m256d load(const float *mem, Flags f, LoadTag<__m256d, double>)
{
    return AVX::convert<float, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const uint *mem, Flags f, LoadTag<__m256d, double>)
{
    return AVX::convert<uint, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const int *mem, Flags f, LoadTag<__m256d, double>)
{
    return AVX::convert<int, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const ushort *mem, Flags f, LoadTag<__m256d, double>)
{
    return AVX::convert<int, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const short *mem, Flags f, LoadTag<__m256d, double>)
{
    return AVX::convert<int, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const uchar *mem, Flags f, LoadTag<__m256d, double>)
{
    return AVX::convert<int, double>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256d load(const schar *mem, Flags f, LoadTag<__m256d, double>)
{
    return AVX::convert<int, double>(load16(mem, f));
}

template <typename Flags>
Vc_INTRINSIC __m256 load(const double *mem, Flags f, LoadTag<__m256, float>)
{
    return AVX::concat(_mm256_cvtpd_ps(load32(&mem[0], f)),
                       _mm256_cvtpd_ps(load32(&mem[4], f)));
}
template <typename Flags>
Vc_INTRINSIC __m256 load(const uint *mem, Flags f, LoadTag<__m256, float>)
{
    const auto v = load32(mem, f);
    return _mm256_blendv_ps(
        _mm256_cvtepi32_ps(v),
        _mm256_add_ps(_mm256_cvtepi32_ps(AVX::sub_epi32(v, AVX::set2power31_epu32())),
                      AVX::set2power31_ps()),
        _mm256_castsi256_ps(AVX::cmplt_epi32(v, _mm256_setzero_si256())));
}
template <typename Flags>
Vc_INTRINSIC __m256 load(const int *mem, Flags f, LoadTag<__m256, float>)
{
    return AVX::convert<int, float>(load32(mem, f));
}
template <typename T, typename Flags,
          typename = enable_if<!std::is_same<T, float>::value>>
Vc_INTRINSIC __m256 load(const T *mem, Flags f, LoadTag<__m256, float>)
{
    return _mm256_cvtepi32_ps(load<__m256i, int>(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256 load(const ushort *mem, Flags f, LoadTag<__m256, float>)
{
    return AVX::convert<ushort, float>(load16(mem, f));
}
template <typename Flags>
Vc_INTRINSIC __m256 load(const short *mem, Flags f, LoadTag<__m256, float>)
{
    return AVX::convert<short, float>(load16(mem, f));
}

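// shifted<amount>(k): byte-wise shift of a 32-byte register across the 128-bit lane
// boundary, implemented with permute128 + alignr/srli/slli. Positive amounts appear to
// move data towards element 0, negative amounts away from it.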
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount >= 16), T> shifted(T k)
{
    return AVX::avx_cast<T>(AVX::zeroExtend(
        _mm_srli_si128(AVX::hi128(AVX::avx_cast<__m256i>(k)), amount - 16)));
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > 0 && amount < 16), T>
shifted(T k)
{
    return AVX::avx_cast<T>(
        AVX::alignr<amount>(Mem::permute128<X1, Const0>(AVX::avx_cast<__m256i>(k)),
                            AVX::avx_cast<__m256i>(k)));
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount <= -16), T> shifted(T k)
{
    return AVX::avx_cast<T>(Mem::permute128<Const0, X0>(AVX::avx_cast<__m256i>(
        _mm_slli_si128(AVX::lo128(AVX::avx_cast<__m256i>(k)), -16 - amount))));
}
template <int amount, typename T>
Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > -16 && amount < 0), T>
shifted(T k)
{
    return AVX::avx_cast<T>(
        AVX::alignr<16 + amount>(AVX::avx_cast<__m256i>(k),
                                 Mem::permute128<Const0, X0>(AVX::avx_cast<__m256i>(k))));
}

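// mask_cast<From, To, R>(k): reinterpret a mask with From entries as a mask with To
// entries in register type R, repacking the per-element all-ones/all-zeros patterns.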
template<size_t From, size_t To, typename R> Vc_INTRINSIC Vc_CONST R mask_cast(__m256i k)
{
    static_assert(From == To, "Incorrect mask cast.");
    static_assert(std::is_same<R, __m256>::value, "Incorrect mask cast.");
    return AVX::avx_cast<__m256>(k);
}

template <> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 4, __m128>(__m256i k)
{
    return AVX::avx_cast<__m128>(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)));
}

template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 4, __m256>(__m128i k)
{
    const auto kk = _mm_castsi128_ps(k);
    return AVX::concat(_mm_unpacklo_ps(kk, kk), _mm_unpackhi_ps(kk, kk));
}

template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m256i k)
{
    return AVX::avx_cast<__m256>(AVX::concat(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)),
                                             _mm_setzero_si128()));
}

template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m256i k)
{
    return AVX::avx_cast<__m128>(
        _mm_packs_epi16(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)), _mm_setzero_si128()));
}

template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m128i k)
{
    return AVX::zeroExtend(AVX::avx_cast<__m128>(k));
}

template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 16, __m256>(__m256i k)
{
    return AVX::zeroExtend(mask_cast<4, 8, __m128>(k));
}

template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m256i k)
{
    const auto lo = AVX::lo128(AVX::avx_cast<__m256>(k));
    return AVX::concat(_mm_unpacklo_ps(lo, lo),
                       _mm_unpackhi_ps(lo, lo));
}

template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m256i k)
{
    return AVX::avx_cast<__m128>(AVX::lo128(k));
}

template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m128i k)
{
    const auto tmp = _mm_unpacklo_epi16(k, k);
    return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(tmp, tmp),
                                             _mm_unpackhi_epi32(tmp, tmp)));
}

template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 8, __m128>(__m256i k)
{
    return AVX::avx_cast<__m128>(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
}

template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 8, __m256>(__m128i k)
{
    return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(k, k),
                                             _mm_unpackhi_epi16(k, k)));
}

template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 16, __m256>(__m256i k)
{
    return AVX::zeroExtend(mask_cast<8, 8, __m128>(k));
}

#ifdef Vc_IMPL_AVX2
template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 8, __m256>(__m256i k)
{
    const auto flipped = Mem::permute4x64<X0, X2, X1, X3>(k);
    return _mm256_castsi256_ps(AVX::unpacklo_epi16(flipped, flipped));
}
#endif

template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 4, __m256>(__m256i k)
{
    const auto tmp = _mm_unpacklo_epi16(AVX::lo128(k), AVX::lo128(k));
    return _mm256_castsi256_ps(
        AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)));
}

template<> Vc_INTRINSIC Vc_CONST __m256 allone<__m256 >() { return AVX::setallone_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m256i allone<__m256i>() { return AVX::setallone_si256(); }
template<> Vc_INTRINSIC Vc_CONST __m256d allone<__m256d>() { return AVX::setallone_pd(); }

template<> Vc_INTRINSIC Vc_CONST __m256 zero<__m256 >() { return _mm256_setzero_ps(); }
template<> Vc_INTRINSIC Vc_CONST __m256i zero<__m256i>() { return _mm256_setzero_si256(); }
template<> Vc_INTRINSIC Vc_CONST __m256d zero<__m256d>() { return _mm256_setzero_pd(); }

Vc_INTRINSIC Vc_CONST __m256 one( float) { return AVX::setone_ps (); }
Vc_INTRINSIC Vc_CONST __m256d one(double) { return AVX::setone_pd (); }
Vc_INTRINSIC Vc_CONST __m256i one( int) { return AVX::setone_epi32(); }
Vc_INTRINSIC Vc_CONST __m256i one( uint) { return AVX::setone_epu32(); }
Vc_INTRINSIC Vc_CONST __m256i one( short) { return AVX::setone_epi16(); }
Vc_INTRINSIC Vc_CONST __m256i one(ushort) { return AVX::setone_epu16(); }
Vc_INTRINSIC Vc_CONST __m256i one( schar) { return AVX::setone_epi8 (); }
Vc_INTRINSIC Vc_CONST __m256i one( uchar) { return AVX::setone_epu8 (); }

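// negate: floating-point negation flips the sign bit via XOR with a sign mask; the
// integer variants use sign_epi32/sign_epi16 with an all-ones vector (multiply by -1).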
Vc_ALWAYS_INLINE Vc_CONST __m256 negate(__m256 v, std::integral_constant<std::size_t, 4>)
{
    return _mm256_xor_ps(v, AVX::setsignmask_ps());
}
Vc_ALWAYS_INLINE Vc_CONST __m256d negate(__m256d v, std::integral_constant<std::size_t, 8>)
{
    return _mm256_xor_pd(v, AVX::setsignmask_pd());
}
Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 4>)
{
    return AVX::sign_epi32(v, Detail::allone<__m256i>());
}
Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 2>)
{
    return AVX::sign_epi16(v, Detail::allone<__m256i>());
}

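// Bitwise operations. The __m256i variants use the AVX2 *_si256 intrinsics when
// available and otherwise fall back to the AVX1 float-domain instructions via casts;
// the resulting bit pattern is the same either way.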
Vc_INTRINSIC __m256 xor_(__m256 a, __m256 b) { return _mm256_xor_ps(a, b); }
Vc_INTRINSIC __m256d xor_(__m256d a, __m256d b) { return _mm256_xor_pd(a, b); }
Vc_INTRINSIC __m256i xor_(__m256i a, __m256i b)
{
#ifdef Vc_IMPL_AVX2
    return _mm256_xor_si256(a, b);
#else
    return _mm256_castps_si256(
        _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}

Vc_INTRINSIC __m256 or_(__m256 a, __m256 b) { return _mm256_or_ps(a, b); }
Vc_INTRINSIC __m256d or_(__m256d a, __m256d b) { return _mm256_or_pd(a, b); }
Vc_INTRINSIC __m256i or_(__m256i a, __m256i b)
{
#ifdef Vc_IMPL_AVX2
    return _mm256_or_si256(a, b);
#else
    return _mm256_castps_si256(
        _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}

Vc_INTRINSIC __m256 and_(__m256 a, __m256 b) { return _mm256_and_ps(a, b); }
Vc_INTRINSIC __m256d and_(__m256d a, __m256d b) { return _mm256_and_pd(a, b); }
Vc_INTRINSIC __m256i and_(__m256i a, __m256i b) {
#ifdef Vc_IMPL_AVX2
    return _mm256_and_si256(a, b);
#else
    return _mm256_castps_si256(
        _mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}

Vc_INTRINSIC __m256 andnot_(__m256 a, __m256 b) { return _mm256_andnot_ps(a, b); }
Vc_INTRINSIC __m256d andnot_(__m256d a, __m256d b) { return _mm256_andnot_pd(a, b); }
Vc_INTRINSIC __m256i andnot_(__m256i a, __m256i b)
{
#ifdef Vc_IMPL_AVX2
    return _mm256_andnot_si256(a, b);
#else
    return _mm256_castps_si256(
        _mm256_andnot_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
#endif
}

Vc_INTRINSIC __m256 not_(__m256 a) { return andnot_(a, allone<__m256 >()); }
Vc_INTRINSIC __m256d not_(__m256d a) { return andnot_(a, allone<__m256d>()); }
Vc_INTRINSIC __m256i not_(__m256i a) { return andnot_(a, allone<__m256i>()); }

Vc_INTRINSIC __m256 blend(__m256 a, __m256 b, __m256 c) { return _mm256_blendv_ps(a, b, c); }
Vc_INTRINSIC __m256d blend(__m256d a, __m256d b, __m256d c) { return _mm256_blendv_pd(a, b, c); }
Vc_INTRINSIC __m256i blend(__m256i a, __m256i b, __m256i c) { return AVX::blendv_epi8(a, b, c); }

Vc_INTRINSIC __m256 abs(__m256 a, float) { return and_(a, AVX::setabsmask_ps()); }
Vc_INTRINSIC __m256d abs(__m256d a, double) { return and_(a, AVX::setabsmask_pd()); }
Vc_INTRINSIC __m256i abs(__m256i a, int) { return AVX::abs_epi32(a); }
Vc_INTRINSIC __m256i abs(__m256i a, uint) { return a; }
Vc_INTRINSIC __m256i abs(__m256i a, short) { return AVX::abs_epi16(a); }
Vc_INTRINSIC __m256i abs(__m256i a, ushort) { return a; }
Vc_INTRINSIC __m256i abs(__m256i a, schar) { return AVX::abs_epi8 (a); }
Vc_INTRINSIC __m256i abs(__m256i a, uchar) { return a; }

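// Element-wise arithmetic, dispatched on the trailing tag type (float, int, ushort, ...).
// Integer division has no AVX instruction; the overloads below emulate it by dividing in
// double (int/uint) or float (short) and truncating the quotient.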
Vc_INTRINSIC __m256 add(__m256 a, __m256 b, float) { return _mm256_add_ps(a, b); }
Vc_INTRINSIC __m256d add(__m256d a, __m256d b, double) { return _mm256_add_pd(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, int) { return AVX::add_epi32(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, uint) { return AVX::add_epi32(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, short) { return AVX::add_epi16(a, b); }
Vc_INTRINSIC __m256i add(__m256i a, __m256i b, ushort) { return AVX::add_epi16(a, b); }

Vc_INTRINSIC __m256 sub(__m256 a, __m256 b, float) { return _mm256_sub_ps(a, b); }
Vc_INTRINSIC __m256d sub(__m256d a, __m256d b, double) { return _mm256_sub_pd(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, int) { return AVX::sub_epi32(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, uint) { return AVX::sub_epi32(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, short) { return AVX::sub_epi16(a, b); }
Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, ushort) { return AVX::sub_epi16(a, b); }

Vc_INTRINSIC __m256 mul(__m256 a, __m256 b, float) { return _mm256_mul_ps(a, b); }
Vc_INTRINSIC __m256d mul(__m256d a, __m256d b, double) { return _mm256_mul_pd(a, b); }
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, int) { return AVX::mullo_epi32(a, b); }
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, uint) { return AVX::mullo_epi32(a, b); }
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, short) { return AVX::mullo_epi16(a, b); }
Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, ushort) { return AVX::mullo_epi16(a, b); }

Vc_INTRINSIC __m256 div(__m256 a, __m256 b, float) { return _mm256_div_ps(a, b); }
Vc_INTRINSIC __m256d div(__m256d a, __m256d b, double) { return _mm256_div_pd(a, b); }
Vc_INTRINSIC __m256i div(__m256i a, __m256i b, int) {
    using namespace AVX;
    const __m256d lo1 = _mm256_cvtepi32_pd(lo128(a));
    const __m256d lo2 = _mm256_cvtepi32_pd(lo128(b));
    const __m256d hi1 = _mm256_cvtepi32_pd(hi128(a));
    const __m256d hi2 = _mm256_cvtepi32_pd(hi128(b));
    return concat(_mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)),
                  _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2)));
}
Vc_INTRINSIC __m256i div(__m256i a, __m256i b, uint) {
    using namespace AVX;
    const __m256i aa = add_epi32(a, set1_epi32(-2147483648));
    const __m256i bb = add_epi32(b, set1_epi32(-2147483648));
    const __m256d loa = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(aa)), set1_pd(2147483648.));
    const __m256d hia = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(aa)), set1_pd(2147483648.));
    const __m256d lob = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(bb)), set1_pd(2147483648.));
    const __m256d hib = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(bb)), set1_pd(2147483648.));
    return avx_cast<__m256i>(_mm256_blendv_ps(
        avx_cast<__m256>(concat(_mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)),
                                _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)))),
        avx_cast<__m256>(a),
        avx_cast<__m256>(cmpeq_epi32(b, setone_epi32()))));
}
Vc_INTRINSIC __m256i div(__m256i a, __m256i b, short) {
    using namespace AVX;
    const __m256 lo =
        _mm256_div_ps(convert<short, float>(lo128(a)), convert<short, float>(lo128(b)));
    const __m256 hi =
        _mm256_div_ps(convert<short, float>(hi128(a)), convert<short, float>(hi128(b)));
    return concat(convert<float, short>(lo), convert<float, short>(hi));
}

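// Horizontal reductions: combine the two 128-bit halves element-wise, then defer to the
// 128-bit SSE Detail overloads for the remaining reduction to a scalar.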
template <typename T> Vc_INTRINSIC T add(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
    return {add(add(AVX::lo128(a), AVX::hi128(a), T()), T())};
}

template <typename T> Vc_INTRINSIC T mul(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
    return {mul(mul(AVX::lo128(a), AVX::hi128(a), T()), T())};
}

template <typename T> Vc_INTRINSIC T min(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
    return {min(min(AVX::lo128(a), AVX::hi128(a), T()), T())};
}

template <typename T> Vc_INTRINSIC T max(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
{
    return {max(max(AVX::lo128(a), AVX::hi128(a), T()), T())};
}

Vc_INTRINSIC __m256 cmpeq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpeq_ps(a, b); }
Vc_INTRINSIC __m256d cmpeq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpeq_pd(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, int) { return AvxIntrinsics::cmpeq_epi32(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, uint) { return AvxIntrinsics::cmpeq_epi32(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, short) { return AvxIntrinsics::cmpeq_epi16(a, b); }
Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, ushort) { return AvxIntrinsics::cmpeq_epi16(a, b); }

Vc_INTRINSIC __m256 cmpneq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpneq_ps(a, b); }
Vc_INTRINSIC __m256d cmpneq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpneq_pd(a, b); }
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, int) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); }
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uint) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); }
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, short) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); }
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, ushort) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); }
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, schar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); }
Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uchar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); }

Vc_INTRINSIC __m256 cmpgt(__m256 a, __m256 b, float) { return AVX::cmpgt_ps(a, b); }
Vc_INTRINSIC __m256d cmpgt(__m256d a, __m256d b, double) { return AVX::cmpgt_pd(a, b); }
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(a, b); }
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(a, b); }
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(a, b); }
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(a, b); }
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (a, b); }
Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (a, b); }

Vc_INTRINSIC __m256 cmpge(__m256 a, __m256 b, float) { return AVX::cmpge_ps(a, b); }
Vc_INTRINSIC __m256d cmpge(__m256d a, __m256d b, double) { return AVX::cmpge_pd(a, b); }
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(b, a)); }
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(b, a)); }
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(b, a)); }
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(b, a)); }
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (b, a)); }
Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (b, a)); }

Vc_INTRINSIC __m256 cmple(__m256 a, __m256 b, float) { return AVX::cmple_ps(a, b); }
Vc_INTRINSIC __m256d cmple(__m256d a, __m256d b, double) { return AVX::cmple_pd(a, b); }
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(a, b)); }
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(a, b)); }
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(a, b)); }
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(a, b)); }
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (a, b)); }
Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (a, b)); }

Vc_INTRINSIC __m256 cmplt(__m256 a, __m256 b, float) { return AVX::cmplt_ps(a, b); }
Vc_INTRINSIC __m256d cmplt(__m256d a, __m256d b, double) { return AVX::cmplt_pd(a, b); }
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(b, a); }
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(b, a); }
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(b, a); }
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(b, a); }
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (b, a); }
Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (b, a); }

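// fma: fused multiply-add. FMA4/FMA instructions are used when available; otherwise the
// fallback computes in higher precision (floats widened to double, doubles split
// Dekker-style into high/low halves) to avoid the intermediate rounding of a separate
// multiply and add.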
Vc_INTRINSIC __m256 fma(__m256 a, __m256 b, __m256 c, float) {
#ifdef Vc_IMPL_FMA4
    return _mm256_macc_ps(a, b, c);
#elif defined Vc_IMPL_FMA
    return _mm256_fmadd_ps(a, b, c);
#else
    using namespace AVX;
    __m256d v1_0 = _mm256_cvtps_pd(lo128(a));
    __m256d v1_1 = _mm256_cvtps_pd(hi128(a));
    __m256d v2_0 = _mm256_cvtps_pd(lo128(b));
    __m256d v2_1 = _mm256_cvtps_pd(hi128(b));
    __m256d v3_0 = _mm256_cvtps_pd(lo128(c));
    __m256d v3_1 = _mm256_cvtps_pd(hi128(c));
    return concat(_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
                  _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
#endif
}
Vc_INTRINSIC __m256d fma(__m256d a, __m256d b, __m256d c, double)
{
#ifdef Vc_IMPL_FMA4
    return _mm256_macc_pd(a, b, c);
#elif defined Vc_IMPL_FMA
    return _mm256_fmadd_pd(a, b, c);
#else
    using namespace AVX;
    __m256d h1 = and_(a, _mm256_broadcast_sd(reinterpret_cast<const double *>(
                             &c_general::highMaskDouble)));
    __m256d h2 = and_(b, _mm256_broadcast_sd(reinterpret_cast<const double *>(
                             &c_general::highMaskDouble)));
    const __m256d l1 = _mm256_sub_pd(a, h1);
    const __m256d l2 = _mm256_sub_pd(b, h2);
    const __m256d ll = mul(l1, l2, double());
    const __m256d lh = add(mul(l1, h2, double()), mul(h1, l2, double()), double());
    const __m256d hh = mul(h1, h2, double());

    const __m256d lh_lt_v3 = cmplt(abs(lh, double()), abs(c, double()), double());
    const __m256d x = _mm256_blendv_pd(c, lh, lh_lt_v3);
    const __m256d y = _mm256_blendv_pd(lh, c, lh_lt_v3);
    return add(add(ll, x, double()), add(y, hh, double()), double());
#endif
}
template <typename T> Vc_INTRINSIC __m256i fma(__m256i a, __m256i b, __m256i c, T)
{
    return add(mul(a, b, T()), c, T());
}

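// Bit shifts. The <shift> templates take a compile-time count, the plain overloads a
// run-time count. Signed element types use arithmetic right shifts (sra*), unsigned
// types logical right shifts (srl*).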
template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, int) { return AVX::srai_epi32<shift>(a); }
template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, uint) { return AVX::srli_epi32<shift>(a); }
template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, short) { return AVX::srai_epi16<shift>(a); }
template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, ushort) { return AVX::srli_epi16<shift>(a); }

Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, int) { return AVX::sra_epi32(a, _mm_cvtsi32_si128(shift)); }
Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, uint) { return AVX::srl_epi32(a, _mm_cvtsi32_si128(shift)); }
Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, short) { return AVX::sra_epi16(a, _mm_cvtsi32_si128(shift)); }
Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, ushort) { return AVX::srl_epi16(a, _mm_cvtsi32_si128(shift)); }

template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, int) { return AVX::slli_epi32<shift>(a); }
template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, uint) { return AVX::slli_epi32<shift>(a); }
template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, short) { return AVX::slli_epi16<shift>(a); }
template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, ushort) { return AVX::slli_epi16<shift>(a); }

Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, int) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); }
Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, uint) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); }
Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, short) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); }
Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, ushort) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); }

Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m256 x) { return x; }
Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m256d x) { return x; }
Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m256i x) { return x; }
Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m128 x) { return AVX::zeroExtend(x); }
Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m128d x) { return AVX::zeroExtend(x); }
Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m128i x) { return AVX::zeroExtend(x); }

Vc_INTRINSIC __m256 avx_broadcast( float x) { return _mm256_set1_ps(x); }
Vc_INTRINSIC __m256d avx_broadcast(double x) { return _mm256_set1_pd(x); }
Vc_INTRINSIC __m256i avx_broadcast( int x) { return _mm256_set1_epi32(x); }
Vc_INTRINSIC __m256i avx_broadcast( uint x) { return _mm256_set1_epi32(x); }
Vc_INTRINSIC __m256i avx_broadcast( short x) { return _mm256_set1_epi16(x); }
Vc_INTRINSIC __m256i avx_broadcast(ushort x) { return _mm256_set1_epi16(x); }
Vc_INTRINSIC __m256i avx_broadcast( char x) { return _mm256_set1_epi8(x); }
Vc_INTRINSIC __m256i avx_broadcast( schar x) { return _mm256_set1_epi8(x); }
Vc_INTRINSIC __m256i avx_broadcast( uchar x) { return _mm256_set1_epi8(x); }

template <Vc::Implementation Impl, typename T,
          typename = enable_if<(Impl >= AVXImpl && Impl <= AVX2Impl)>>
Vc_CONST_L AVX2::Vector<T> Vc_VDECL sorted(AVX2::Vector<T> x) Vc_CONST_R;
template <typename T> Vc_INTRINSIC Vc_CONST AVX2::Vector<T> sorted(AVX2::Vector<T> x)
{
    return sorted<CurrentImplementation::current()>(x);
}

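// Run-time shifted(v, amount): dispatches an element-count shift to the matching
// compile-time byte shift (amount * sizeof(T)); amounts outside the handled range
// return a zero vector.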
template <typename T, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32), V> shifted(V v, int amount)
{
    using namespace AVX;
    constexpr int S = sizeof(T);
    switch (amount) {
    case 0: return v;
    case 1: return shifted<sanitize<V>( 1 * S)>(v);
    case 2: return shifted<sanitize<V>( 2 * S)>(v);
    case 3: return shifted<sanitize<V>( 3 * S)>(v);
    case -1: return shifted<sanitize<V>(-1 * S)>(v);
    case -2: return shifted<sanitize<V>(-2 * S)>(v);
    case -3: return shifted<sanitize<V>(-3 * S)>(v);
    }
    if (sizeof(T) <= 4) {
        switch (amount) {
        case 4: return shifted<sanitize<V>( 4 * S)>(v);
        case 5: return shifted<sanitize<V>( 5 * S)>(v);
        case 6: return shifted<sanitize<V>( 6 * S)>(v);
        case 7: return shifted<sanitize<V>( 7 * S)>(v);
        case -4: return shifted<sanitize<V>(-4 * S)>(v);
        case -5: return shifted<sanitize<V>(-5 * S)>(v);
        case -6: return shifted<sanitize<V>(-6 * S)>(v);
        case -7: return shifted<sanitize<V>(-7 * S)>(v);
        }
        if (sizeof(T) <= 2) {
            switch (amount) {
            case 8: return shifted<sanitize<V>( 8 * S)>(v);
            case 9: return shifted<sanitize<V>( 9 * S)>(v);
            case 10: return shifted<sanitize<V>( 10 * S)>(v);
            case 11: return shifted<sanitize<V>( 11 * S)>(v);
            case 12: return shifted<sanitize<V>( 12 * S)>(v);
            case 13: return shifted<sanitize<V>( 13 * S)>(v);
            case 14: return shifted<sanitize<V>( 14 * S)>(v);
            case 15: return shifted<sanitize<V>( 15 * S)>(v);
            case -8: return shifted<sanitize<V>(- 8 * S)>(v);
            case -9: return shifted<sanitize<V>(- 9 * S)>(v);
            case -10: return shifted<sanitize<V>(-10 * S)>(v);
            case -11: return shifted<sanitize<V>(-11 * S)>(v);
            case -12: return shifted<sanitize<V>(-12 * S)>(v);
            case -13: return shifted<sanitize<V>(-13 * S)>(v);
            case -14: return shifted<sanitize<V>(-14 * S)>(v);
            case -15: return shifted<sanitize<V>(-15 * S)>(v);
            }
            if (sizeof(T) == 1) {
                switch (amount) {
                case 16: return shifted<sanitize<V>( 16)>(v);
                case 17: return shifted<sanitize<V>( 17)>(v);
                case 18: return shifted<sanitize<V>( 18)>(v);
                case 19: return shifted<sanitize<V>( 19)>(v);
                case 20: return shifted<sanitize<V>( 20)>(v);
                case 21: return shifted<sanitize<V>( 21)>(v);
                case 22: return shifted<sanitize<V>( 22)>(v);
                case 23: return shifted<sanitize<V>( 23)>(v);
                case 24: return shifted<sanitize<V>( 24)>(v);
                case 25: return shifted<sanitize<V>( 25)>(v);
                case 26: return shifted<sanitize<V>( 26)>(v);
                case 27: return shifted<sanitize<V>( 27)>(v);
                case 28: return shifted<sanitize<V>( 28)>(v);
                case 29: return shifted<sanitize<V>( 29)>(v);
                case 30: return shifted<sanitize<V>( 30)>(v);
                case 31: return shifted<sanitize<V>( 31)>(v);
                case -16: return shifted<sanitize<V>(-16)>(v);
                case -17: return shifted<sanitize<V>(-17)>(v);
                case -18: return shifted<sanitize<V>(-18)>(v);
                case -19: return shifted<sanitize<V>(-19)>(v);
                case -20: return shifted<sanitize<V>(-20)>(v);
                case -21: return shifted<sanitize<V>(-21)>(v);
                case -22: return shifted<sanitize<V>(-22)>(v);
                case -23: return shifted<sanitize<V>(-23)>(v);
                case -24: return shifted<sanitize<V>(-24)>(v);
                case -25: return shifted<sanitize<V>(-25)>(v);
                case -26: return shifted<sanitize<V>(-26)>(v);
                case -27: return shifted<sanitize<V>(-27)>(v);
                case -28: return shifted<sanitize<V>(-28)>(v);
                case -29: return shifted<sanitize<V>(-29)>(v);
                case -30: return shifted<sanitize<V>(-30)>(v);
                case -31: return shifted<sanitize<V>(-31)>(v);
                }
            }
        }
    }
    return avx_cast<V>(_mm256_setzero_ps());
}

template <typename T, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> shifted(V v, int amount)
{
    using namespace AVX;
    switch (amount) {
    case 0: return v;
    case 1: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(1 * sizeof(T))));
    case 2: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(2 * sizeof(T))));
    case 3: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(3 * sizeof(T))));
    case -1: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(1 * sizeof(T))));
    case -2: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(2 * sizeof(T))));
    case -3: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(3 * sizeof(T))));
    }
    if (sizeof(T) <= 2) {
        switch (amount) {
        case 4: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(4 * sizeof(T))));
        case 5: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(5 * sizeof(T))));
        case 6: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(6 * sizeof(T))));
        case 7: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(7 * sizeof(T))));
        case -4: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(4 * sizeof(T))));
        case -5: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(5 * sizeof(T))));
        case -6: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(6 * sizeof(T))));
        case -7: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(7 * sizeof(T))));
        }
    }
    return avx_cast<V>(_mm_setzero_ps());
}

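// rotated(v, amount): cyclic rotation by whole elements, built from 128-bit lane
// permutes (permute128 / permute4x64) combined with SSE alignr on the two lane halves.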
template <typename T, size_t N, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 4), V> rotated(V v,
                                                                               int amount)
{
    using namespace AVX;
    const __m128i vLo = avx_cast<__m128i>(lo128(v));
    const __m128i vHi = avx_cast<__m128i>(hi128(v));
    switch (static_cast<unsigned int>(amount) % N) {
    case 0:
        return v;
    case 1:
        return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<sizeof(T)>(vLo, vHi)));
    case 2:
        return Mem::permute128<X1, X0>(v);
    case 3:
        return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<sizeof(T)>(vHi, vLo)));
    }
    return avx_cast<V>(_mm256_setzero_ps());
}

template <typename T, size_t N, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 8), V> rotated(V v,
                                                                               int amount)
{
    using namespace AVX;
    const __m128i vLo = avx_cast<__m128i>(lo128(v));
    const __m128i vHi = avx_cast<__m128i>(hi128(v));
    switch (static_cast<unsigned int>(amount) % N) {
    case 0:
        return v;
    case 1:
        return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
    case 2:
        return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
    case 3:
        return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
    case 4:
        return Mem::permute128<X1, X0>(v);
    case 5:
        return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
    case 6:
        return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
    case 7:
        return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
    }
    return avx_cast<V>(_mm256_setzero_ps());
}

#ifdef Vc_IMPL_AVX2
template <typename T, size_t N, typename V>
static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 16), V> rotated(
    V v, int amount)
{
    using namespace AVX;
    const __m128i vLo = avx_cast<__m128i>(lo128(v));
    const __m128i vHi = avx_cast<__m128i>(hi128(v));
    switch (static_cast<unsigned int>(amount) % N) {
    case 0:
        return v;
    case 1:
        return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
    case 2:
        return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
    case 3:
        return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
    case 4:
        return Mem::permute4x64<X1, X2, X3, X0>(v);
    case 5:
        return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi)));
    case 6:
        return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi)));
    case 7:
        return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo),
                                  SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi)));
    case 8:
        return Mem::permute128<X1, X0>(v);
    case 9:
        return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
    case 10:
        return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
    case 11:
        return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
    case 12:
        return Mem::permute4x64<X3, X0, X1, X2>(v);
    case 13:
        return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo)));
    case 14:
        return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo)));
    case 15:
        return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi),
                                  SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo)));
    }
    return avx_cast<V>(_mm256_setzero_ps());
}
#endif

Vc_INTRINSIC Vc_CONST int testc(__m128 a, __m128 b) { return _mm_testc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
Vc_INTRINSIC Vc_CONST int testc(__m256 a, __m256 b) { return _mm256_testc_ps(a, b); }
Vc_INTRINSIC Vc_CONST int testc(__m256d a, __m256d b) { return _mm256_testc_pd(a, b); }
Vc_INTRINSIC Vc_CONST int testc(__m256i a, __m256i b) { return _mm256_testc_si256(a, b); }

Vc_INTRINSIC Vc_CONST int testz(__m128 a, __m128 b) { return _mm_testz_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
Vc_INTRINSIC Vc_CONST int testz(__m256 a, __m256 b) { return _mm256_testz_ps(a, b); }
Vc_INTRINSIC Vc_CONST int testz(__m256d a, __m256d b) { return _mm256_testz_pd(a, b); }
Vc_INTRINSIC Vc_CONST int testz(__m256i a, __m256i b) { return _mm256_testz_si256(a, b); }

Vc_INTRINSIC Vc_CONST int testnzc(__m128 a, __m128 b) { return _mm_testnzc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
Vc_INTRINSIC Vc_CONST int testnzc(__m256 a, __m256 b) { return _mm256_testnzc_ps(a, b); }
Vc_INTRINSIC Vc_CONST int testnzc(__m256d a, __m256d b) { return _mm256_testnzc_pd(a, b); }
Vc_INTRINSIC Vc_CONST int testnzc(__m256i a, __m256i b) { return _mm256_testnzc_si256(a, b); }

Vc_INTRINSIC Vc_CONST int movemask(__m256i a) { return AVX::movemask_epi8(a); }
Vc_INTRINSIC Vc_CONST int movemask(__m128i a) { return _mm_movemask_epi8(a); }
Vc_INTRINSIC Vc_CONST int movemask(__m256d a) { return _mm256_movemask_pd(a); }
Vc_INTRINSIC Vc_CONST int movemask(__m128d a) { return _mm_movemask_pd(a); }
Vc_INTRINSIC Vc_CONST int movemask(__m256 a) { return _mm256_movemask_ps(a); }
Vc_INTRINSIC Vc_CONST int movemask(__m128 a) { return _mm_movemask_ps(a); }

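// mask_store<N>: write an N-entry mask to an array of bool, one byte (0 or 1) per entry.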
template <size_t N, typename Flags>
Vc_INTRINSIC void mask_store(__m256i k, bool *mem, Flags)
{
    static_assert(
        N == 4 || N == 8 || N == 16,
        "mask_store(__m256i, bool *) is only implemented for 4, 8, and 16 entries");
    switch (N) {
    case 4:
        *aliasing_cast<int32_t>(mem) = (_mm_movemask_epi8(AVX::lo128(k)) |
                                        (_mm_movemask_epi8(AVX::hi128(k)) << 16)) &
                                       0x01010101;
        break;
    case 8: {
        const auto k2 = _mm_srli_epi16(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)), 15);
        const auto k3 = _mm_packs_epi16(k2, _mm_setzero_si128());
#ifdef __x86_64__
        *aliasing_cast<int64_t>(mem) = _mm_cvtsi128_si64(k3);
#else
        *aliasing_cast<int32_t>(mem) = _mm_cvtsi128_si32(k3);
        *aliasing_cast<int32_t>(mem + 4) = _mm_extract_epi32(k3, 1);
#endif
    } break;
    case 16: {
        const auto bools = Detail::and_(_mm_set1_epi8(1),
                                        _mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
        if (Flags::IsAligned) {
            _mm_store_si128(reinterpret_cast<__m128i *>(mem), bools);
        } else {
            _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), bools);
        }
    } break;
    default:
        Vc_UNREACHABLE();
    }
}

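// mask_load<R, N>: the inverse of mask_store; read N bools and expand each nonzero byte
// into an all-ones mask element of the requested register type.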
template <typename R, size_t N, typename Flags>
Vc_INTRINSIC R mask_load(const bool *mem, Flags,
                         enable_if<std::is_same<R, __m128>::value> = nullarg)
{
    static_assert(N == 4 || N == 8,
                  "mask_load<__m128>(const bool *) is only implemented for 4, 8 entries");
    switch (N) {
    case 4: {
        __m128i k = _mm_cvtsi32_si128(*aliasing_cast<int32_t>(mem));
        k = _mm_unpacklo_epi8(k, k);
        k = _mm_unpacklo_epi16(k, k);
        k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
        return AVX::avx_cast<__m128>(k);
    }
    case 8: {
#ifdef __x86_64__
        __m128i k = _mm_cvtsi64_si128(*aliasing_cast<int64_t>(mem));
#else
        __m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast<double>(mem)));
#endif
        return AVX::avx_cast<__m128>(
            _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
    }
    default:
        Vc_UNREACHABLE();
    }
}

template <typename R, size_t N, typename Flags>
Vc_INTRINSIC R mask_load(const bool *mem, Flags,
                         enable_if<std::is_same<R, __m256>::value> = nullarg)
{
    static_assert(
        N == 4 || N == 8 || N == 16,
        "mask_load<__m256>(const bool *) is only implemented for 4, 8, and 16 entries");
    switch (N) {
    case 4: {
        __m128i k = AVX::avx_cast<__m128i>(_mm_and_ps(
            _mm_set1_ps(*aliasing_cast<float>(mem)),
            AVX::avx_cast<__m128>(_mm_setr_epi32(0x1, 0x100, 0x10000, 0x1000000))));
        k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
        return AVX::avx_cast<__m256>(
            AVX::concat(_mm_unpacklo_epi32(k, k), _mm_unpackhi_epi32(k, k)));
    }
    case 8: {
#ifdef __x86_64__
        __m128i k = _mm_cvtsi64_si128(*aliasing_cast<int64_t>(mem));
#else
        __m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast<double>(mem)));
#endif
        k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
        return AVX::avx_cast<__m256>(
            AVX::concat(_mm_unpacklo_epi16(k, k), _mm_unpackhi_epi16(k, k)));
    }
    case 16: {
        const auto k128 = _mm_cmpgt_epi8(
            Flags::IsAligned ? _mm_load_si128(reinterpret_cast<const __m128i *>(mem))
                             : _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem)),
            _mm_setzero_si128());
        return AVX::avx_cast<__m256>(
            AVX::concat(_mm_unpacklo_epi8(k128, k128), _mm_unpackhi_epi8(k128, k128)));
    }
    default:
        Vc_UNREACHABLE();
        return R();
    }
}

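// mask_to_int<Size>: compress a mask into a Size-bit integer, one bit per element. The
// 16-entry variant relies on BMI2's _pext_u32 to drop the duplicate bits produced by
// movemask_epi8 on 16-bit elements.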
template <size_t Size>
Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m256i x) Vc_INTRINSIC_R Vc_CONST_R;
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m256i k)
{
    return movemask(AVX::avx_cast<__m256d>(k));
}
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m256i k)
{
    return movemask(AVX::avx_cast<__m256>(k));
}
#ifdef Vc_IMPL_BMI2
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k)
{
    return _pext_u32(movemask(k), 0x55555555u);
}
#endif
template <> Vc_INTRINSIC Vc_CONST int mask_to_int<32>(__m256i k)
{
    return movemask(k);
}

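// InterleaveImpl<V, 16, 32>: interleaving stores for 16-element, 32-byte vectors
// (short/ushort). Overloads cover 2 to 8 member vectors; the SuccessiveEntries
// specializations handle contiguous array-of-struct layouts with full-vector stores,
// while the generic index overloads write through the index object i.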
1329 template<typename V> struct InterleaveImpl<V, 16, 32> {
1330 template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
1331 const typename V::AsArg v0,
1332 const typename V::AsArg v1)
1333 {
1334 const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data());
1335 const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data());
1336 using namespace AVX;
1337 *aliasing_cast<uint32_t>(&data[i[ 0]]) = _mm_cvtsi128_si32(lo128(tmp0));
1338 *aliasing_cast<uint32_t>(&data[i[ 1]]) = _mm_extract_epi32(lo128(tmp0), 1);
1339 *aliasing_cast<uint32_t>(&data[i[ 2]]) = _mm_extract_epi32(lo128(tmp0), 2);
1340 *aliasing_cast<uint32_t>(&data[i[ 3]]) = _mm_extract_epi32(lo128(tmp0), 3);
1341 *aliasing_cast<uint32_t>(&data[i[ 4]]) = _mm_cvtsi128_si32(lo128(tmp1));
1342 *aliasing_cast<uint32_t>(&data[i[ 5]]) = _mm_extract_epi32(lo128(tmp1), 1);
1343 *aliasing_cast<uint32_t>(&data[i[ 6]]) = _mm_extract_epi32(lo128(tmp1), 2);
1344 *aliasing_cast<uint32_t>(&data[i[ 7]]) = _mm_extract_epi32(lo128(tmp1), 3);
1345 *aliasing_cast<uint32_t>(&data[i[ 8]]) = _mm_cvtsi128_si32(hi128(tmp0));
1346 *aliasing_cast<uint32_t>(&data[i[ 9]]) = _mm_extract_epi32(hi128(tmp0), 1);
1347 *aliasing_cast<uint32_t>(&data[i[10]]) = _mm_extract_epi32(hi128(tmp0), 2);
1348 *aliasing_cast<uint32_t>(&data[i[11]]) = _mm_extract_epi32(hi128(tmp0), 3);
1349 *aliasing_cast<uint32_t>(&data[i[12]]) = _mm_cvtsi128_si32(hi128(tmp1));
1350 *aliasing_cast<uint32_t>(&data[i[13]]) = _mm_extract_epi32(hi128(tmp1), 1);
1351 *aliasing_cast<uint32_t>(&data[i[14]]) = _mm_extract_epi32(hi128(tmp1), 2);
1352 *aliasing_cast<uint32_t>(&data[i[15]]) = _mm_extract_epi32(hi128(tmp1), 3);
1353 }
1354 static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
1355 const typename V::AsArg v0, const typename V::AsArg v1)
1356 {
1357 const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data());
1358 const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data());
1359 V(Mem::shuffle128<X0, Y0>(tmp0, tmp1)).store(&data[i[0]], Vc::Unaligned);
1360 V(Mem::shuffle128<X1, Y1>(tmp0, tmp1)).store(&data[i[8]], Vc::Unaligned);
1361 }
1362 template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
1363 const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
1364 {
1365 interleave(data, i, v0, v1);
1366 v2.scatter(data + 2, i);
1367 }
1368 template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
1369 const typename V::AsArg v0, const typename V::AsArg v1,
1370 const typename V::AsArg v2, const typename V::AsArg v3)
1371 {
1372 const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data());
1373 const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data());
1374 const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data());
1375 const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data());
1376
1377 const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
1378 const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
1379 const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
1380 const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
1381
1382 using namespace AVX;
1383 auto &&store = [&](__m256i x, int offset) {
1384 _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 0]]), lo128(x));
1385 _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 8]]), hi128(x));
1386 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 1]]), avx_cast<__m128>(x));
1387 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 9]]), avx_cast<__m128>(hi128(x)));
1388 };
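// tmp4 = a0 b0 c0 d0 a1 b1 c1 d1 | a8 b8 c8 d8 a9 b9 c9 d9 (a..d = v0..v3), so
// store(tmp4, 0) writes entries 0, 1, 8 and 9; tmp5, tmp6 and tmp7 cover the rest.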
1389 store(tmp4, 0);
1390 store(tmp5, 2);
1391 store(tmp6, 4);
1392 store(tmp7, 6);
1393 }
1394 static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,
1395 const typename V::AsArg v0, const typename V::AsArg v1,
1396 const typename V::AsArg v2, const typename V::AsArg v3)
1397 {
1398 const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data());
1399 const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data());
1400 const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data());
1401 const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data());
1402
1403 const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
1404 const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
1405 const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
1406 const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
1407
1408 V(Mem::shuffle128<X0, Y0>(tmp4, tmp5)).store(&data[i[0]], ::Vc::Unaligned);
1409 V(Mem::shuffle128<X0, Y0>(tmp6, tmp7)).store(&data[i[4]], ::Vc::Unaligned);
1410 V(Mem::shuffle128<X1, Y1>(tmp4, tmp5)).store(&data[i[8]], ::Vc::Unaligned);
1411 V(Mem::shuffle128<X1, Y1>(tmp6, tmp7)).store(&data[i[12]], ::Vc::Unaligned);
1412 }
1413 template <typename I>
1414 static inline void interleave(typename V::EntryType *const data, const I &i,
1415 const typename V::AsArg v0, const typename V::AsArg v1,
1416 const typename V::AsArg v2, const typename V::AsArg v3,
1417 const typename V::AsArg v4)
1418 {
1419 interleave(data, i, v0, v1, v2, v3);
1420 v4.scatter(data + 4, i);
1421 }
1422 template <typename I>
1423 static inline void interleave(typename V::EntryType *const data, const I &i,
1424 const typename V::AsArg v0, const typename V::AsArg v1,
1425 const typename V::AsArg v2, const typename V::AsArg v3,
1426 const typename V::AsArg v4, const typename V::AsArg v5)
1427 {
1428 interleave(data, i, v0, v1, v2, v3);
1429 interleave(data + 4, i, v4, v5);
1430 }
1431 template <typename I>
1432 static inline void interleave(typename V::EntryType *const data, const I &i,
1433 const typename V::AsArg v0, const typename V::AsArg v1,
1434 const typename V::AsArg v2, const typename V::AsArg v3,
1435 const typename V::AsArg v4, const typename V::AsArg v5,
1436 const typename V::AsArg v6)
1437 {
1438 interleave(data, i, v0, v1, v2, v3);
1439 interleave(data + 4, i, v4, v5, v6);
1440 }
1441 template <typename I>
1442 static inline void interleave(typename V::EntryType *const data, const I &i,
1443 const typename V::AsArg v0, const typename V::AsArg v1,
1444 const typename V::AsArg v2, const typename V::AsArg v3,
1445 const typename V::AsArg v4, const typename V::AsArg v5,
1446 const typename V::AsArg v6, const typename V::AsArg v7)
1447 {
1448 interleave(data, i, v0, v1, v2, v3);
1449 interleave(data + 4, i, v4, v5, v6, v7);
1450 }
1451
1452 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1453 const I &i, V &v0, V &v1)
1454 {
1455 const __m256i tmp4 =
1456 _mm256_setr_epi32(
1457 *aliasing_cast<int>(&data[i[0]]), *aliasing_cast<int>(&data[i[1]]),
1458 *aliasing_cast<int>(&data[i[2]]), *aliasing_cast<int>(&data[i[3]]),
1459 *aliasing_cast<int>(&data[i[8]]), *aliasing_cast<int>(&data[i[9]]),
1460 *aliasing_cast<int>(&data[i[10]]), *aliasing_cast<int>(&data[i[11]]));
1461 const __m256i tmp5 =
1462 _mm256_setr_epi32(
1463 *aliasing_cast<int>(&data[i[4]]), *aliasing_cast<int>(&data[i[5]]),
1464 *aliasing_cast<int>(&data[i[6]]), *aliasing_cast<int>(&data[i[7]]),
1465 *aliasing_cast<int>(&data[i[12]]), *aliasing_cast<int>(&data[i[13]]),
1466 *aliasing_cast<int>(&data[i[14]]), *aliasing_cast<int>(&data[i[15]]));
1467
1468 const __m256i tmp2 = AVX::unpacklo_epi16(tmp4, tmp5);
1469 const __m256i tmp3 = AVX::unpackhi_epi16(tmp4, tmp5);
1470
1471 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
1472 const __m256i tmp1 = AVX::unpackhi_epi16(tmp2, tmp3);
1473
1474 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1475 v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1476 }
1477 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1478 const I &i, V &v0, V &v1, V &v2)
1479 {
1480 using namespace AVX;
1481 const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(
1482 *aliasing_cast<double>(&data[i[0]]), *aliasing_cast<double>(&data[i[1]]),
1483 *aliasing_cast<double>(&data[i[8]]), *aliasing_cast<double>(&data[i[9]])));
1484 const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(
1485 *aliasing_cast<double>(&data[i[2]]), *aliasing_cast<double>(&data[i[3]]),
1486 *aliasing_cast<double>(&data[i[10]]), *aliasing_cast<double>(&data[i[11]])));
1487 const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(
1488 *aliasing_cast<double>(&data[i[4]]), *aliasing_cast<double>(&data[i[5]]),
1489 *aliasing_cast<double>(&data[i[12]]), *aliasing_cast<double>(&data[i[13]])));
1490 const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(
1491 *aliasing_cast<double>(&data[i[6]]), *aliasing_cast<double>(&data[i[7]]),
1492 *aliasing_cast<double>(&data[i[14]]), *aliasing_cast<double>(&data[i[15]])));
1493 const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
1494 const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
1495 const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
1496 const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
1497
1498 const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6);
1499 const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6);
1500 const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7);
1501 const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7);
1502
1503 v0.data() = AVX::unpacklo_epi16(tmp8, tmp10);
1504 v1.data() = AVX::unpackhi_epi16(tmp8, tmp10);
1505 v2.data() = AVX::unpacklo_epi16(tmp9, tmp11);
1506 }
1507 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1508 const I &i, V &v0, V &v1, V &v2, V &v3)
1509 {
1510 using namespace AVX;
1511 const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(
1512 *aliasing_cast<double>(&data[i[0]]), *aliasing_cast<double>(&data[i[1]]),
1513 *aliasing_cast<double>(&data[i[8]]), *aliasing_cast<double>(&data[i[9]])));
1514 const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(
1515 *aliasing_cast<double>(&data[i[2]]), *aliasing_cast<double>(&data[i[3]]),
1516 *aliasing_cast<double>(&data[i[10]]), *aliasing_cast<double>(&data[i[11]])));
1517 const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(
1518 *aliasing_cast<double>(&data[i[4]]), *aliasing_cast<double>(&data[i[5]]),
1519 *aliasing_cast<double>(&data[i[12]]), *aliasing_cast<double>(&data[i[13]])));
1520 const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(
1521 *aliasing_cast<double>(&data[i[6]]), *aliasing_cast<double>(&data[i[7]]),
1522 *aliasing_cast<double>(&data[i[14]]), *aliasing_cast<double>(&data[i[15]])));
1523 const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
1524 const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
1525 const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
1526 const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
1527
1528 const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6);
1529 const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6);
1530 const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7);
1531 const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7);
1532
1533 v0.data() = AVX::unpacklo_epi16(tmp8, tmp10);
1534 v1.data() = AVX::unpackhi_epi16(tmp8, tmp10);
1535 v2.data() = AVX::unpacklo_epi16(tmp9, tmp11);
1536 v3.data() = AVX::unpackhi_epi16(tmp9, tmp11);
1537 }
1538 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1539 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
1540 {
1541 using namespace AVX;
1542 const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
1543 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
1544 const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
1545 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
1546 const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
1547 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
1548 const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
1549 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
1550 const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
1551 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
1552 const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
1553 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
1554 const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
1555 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
1556 const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
1557 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
1558
1559 const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
1560 const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
1561 const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
1562 const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
1563 const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
1564 const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
1565 const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
1566 const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
1567
1568 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
1569 const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
1570 const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
1571 const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
1572 const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
1573 const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
1574
1575 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1576 v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1577 v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
1578 v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
1579 v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
1580 }
1581 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1582 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
1583 {
1584 using namespace AVX;
1585 const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
1586 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
1587 const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
1588 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
1589 const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
1590 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
1591 const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
1592 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
1593 const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
1594 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
1595 const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
1596 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
1597 const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
1598 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
1599 const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
1600 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
1601
1602 const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
1603 const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
1604 const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
1605 const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
1606 const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
1607 const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
1608 const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
1609 const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
1610
1611 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
1612 const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
1613 const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
1614 const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
1615 const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
1616 const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
1617
1618 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1619 v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1620 v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
1621 v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
1622 v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
1623 v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
1624 }
1625 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1626 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
1627 {
1628 using namespace AVX;
1629 const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
1630 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
1631 const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
1632 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
1633 const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
1634 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
1635 const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
1636 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
1637 const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
1638 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
1639 const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
1640 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
1641 const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
1642 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
1643 const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
1644 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
1645
1646 const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
1647 const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
1648 const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
1649 const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
1650 const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
1651 const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
1652 const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
1653 const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
1654
1655 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
1656 const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
1657 const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
1658 const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
1659 const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
1660 const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
1661 const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11);
1662 const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13);
1663
1664 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1665 v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1666 v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
1667 v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
1668 v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
1669 v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
1670 v6.data() = AVX::unpacklo_epi16(tmp14, tmp15);
1671 }
1672 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1673 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
1674 {
1675 using namespace AVX;
1676 const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
1677 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
1678 const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
1679 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
1680 const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
1681 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
1682 const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
1683 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
1684 const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
1685 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
1686 const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
1687 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
1688 const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
1689 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
1690 const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
1691 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
1692
1693 const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
1694 const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
1695 const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
1696 const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
1697 const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
1698 const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
1699 const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
1700 const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
1701
1702 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
1703 const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
1704 const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
1705 const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
1706 const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
1707 const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
1708 const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11);
1709 const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13);
1710
1711 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1712 v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1713 v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
1714 v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
1715 v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
1716 v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
1717 v6.data() = AVX::unpacklo_epi16(tmp14, tmp15);
1718 v7.data() = AVX::unpackhi_epi16(tmp14, tmp15);
1719 }
1720 };
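// Specialization for vectors with 8 entries of 4 bytes each in a 32-byte register
// (e.g. float_v, or int_v with AVX2).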
1721 template<typename V> struct InterleaveImpl<V, 8, 32> {
1722 static_assert(sizeof(typename V::value_type) == 4, "");
1723 template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
1724 const typename V::AsArg v0, const typename V::AsArg v1)
1725 {
1726 using namespace AVX;
1727
1728 const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
1729
1730 const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
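// tmp0 = a0 b0 a1 b1 | a4 b4 a5 b5, tmp1 = a2 b2 a3 b3 | a6 b6 a7 b7 (a = v0, b = v1)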
1731 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), lo128(tmp0));
1732 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), lo128(tmp0));
1733 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), lo128(tmp1));
1734 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), lo128(tmp1));
1735 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), hi128(tmp0));
1736 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), hi128(tmp0));
1737 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), hi128(tmp1));
1738 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), hi128(tmp1));
1739 }
1740 static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
1741 const typename V::AsArg v0, const typename V::AsArg v1)
1742 {
1743 using namespace AVX;
1744
1745 const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
1746
1747 const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
1748 _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), lo128(tmp0));
1749 _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), lo128(tmp1));
1750 _mm_storeu_ps(aliasing_cast<float>(&data[i[4]]), hi128(tmp0));
1751 _mm_storeu_ps(aliasing_cast<float>(&data[i[6]]), hi128(tmp1));
1752 }
1753
1754 template <typename I>
1755 static inline void interleave(typename V::EntryType *const data, const I &i,
1756 const typename V::AsArg v0, const typename V::AsArg v1,
1757 const typename V::AsArg v2)
1758 {
1759 using namespace AVX;
1760 #ifdef Vc_USE_MASKMOV_SCATTER
1761
1762 const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1763
1764 const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1765
1766 const m256 tmp2 = _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
1767
1768 const m256 tmp3 = _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
1769 const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2);
1770 const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2);
1771 const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3);
1772 const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3);
1773 const m128i mask = _mm_set_epi32(0, -1, -1, -1);
1774 _mm_maskstore_ps(aliasing_cast<float>(&data[i[0]]), mask, lo128(tmp4));
1775 _mm_maskstore_ps(aliasing_cast<float>(&data[i[1]]), mask, lo128(tmp5));
1776 _mm_maskstore_ps(aliasing_cast<float>(&data[i[2]]), mask, lo128(tmp6));
1777 _mm_maskstore_ps(aliasing_cast<float>(&data[i[3]]), mask, lo128(tmp7));
1778 _mm_maskstore_ps(aliasing_cast<float>(&data[i[4]]), mask, hi128(tmp4));
1779 _mm_maskstore_ps(aliasing_cast<float>(&data[i[5]]), mask, hi128(tmp5));
1780 _mm_maskstore_ps(aliasing_cast<float>(&data[i[6]]), mask, hi128(tmp6));
1781 _mm_maskstore_ps(aliasing_cast<float>(&data[i[7]]), mask, hi128(tmp7));
1782 #else
1783 interleave(data, i, v0, v1);
1784 v2.scatter(data + 2, i);
1785 #endif
1786 }
1787
1788 static inline void interleave(typename V::EntryType *const data,
1789 const Common::SuccessiveEntries<3> &i,
1790 const typename V::AsArg v0_,
1791 const typename V::AsArg v1_,
1792 const typename V::AsArg v2_)
1793 {
1794 __m256 v0 = AVX::avx_cast<__m256>(v0_.data());
1795 __m256 v1 = AVX::avx_cast<__m256>(v1_.data());
1796 __m256 v2 = AVX::avx_cast<__m256>(v2_.data());
1797
1798 v0 = _mm256_shuffle_ps(v0, v0, 0x6c);
1799 v1 = _mm256_shuffle_ps(v1, v1, 0xb1);
1800 v2 = _mm256_shuffle_ps(v2, v2, 0xc6);
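// after the shuffles:
// v0 = a0 a3 a2 a1 | a4 a7 a6 a5
// v1 = b1 b0 b3 b2 | b5 b4 b7 b6
// v2 = c2 c1 c0 c3 | c6 c5 c4 c7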
1801
1802
1803 __m256 w0 = Mem::blend<X0, X1, Y2, X3, Y4, X5, X6, Y7>(
1804 Mem::blend<X0, Y1, X2, X3, X4, X5, Y6, X7>(v0, v1), v2);
1805
1806 __m256 w1 = Mem::blend<X0, Y1, X2, X3, X4, Y5, X6, X7>(
1807 Mem::blend<Y0, X1, X2, Y3, Y4, X5, X6, Y7>(v0, v1), v2);
1808
1809 __m256 w2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(
1810 Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(v0, v1), v2);
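// w0 = a0 b0 c0 a1 | c6 a7 b7 c7
// w1 = b1 c1 a2 b2 | b5 c5 a6 b6
// w2 = c2 a3 b3 c3 | a4 b4 c4 a5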
1811
1812
1813 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]]),
1814 _mm256_permute2f128_ps(w0, w1, 0x20));
1815
1816 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 8), w2);
1817
1818 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 16),
1819 _mm256_permute2f128_ps(w1, w0, 0x31));
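// memory layout written by the three stores above:
//   data[i[0]] +  0 ... +  7: a0 b0 c0 a1 b1 c1 a2 b2
//   data[i[0]] +  8 ... + 15: c2 a3 b3 c3 a4 b4 c4 a5
//   data[i[0]] + 16 ... + 23: b5 c5 a6 b6 c6 a7 b7 c7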
1820
1821 }
1822
1823 template <typename I>
1824 static inline void interleave(typename V::EntryType *const data, const I &i,
1825 const typename V::AsArg v0, const typename V::AsArg v1,
1826 const typename V::AsArg v2, const typename V::AsArg v3)
1827 {
1828 using namespace AVX;
1829 const __m256 tmp0 =
1830 _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1831 const __m256 tmp1 =
1832 _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1833 const __m256 tmp2 =
1834 _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
1835 const __m256 tmp3 =
1836 _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
1837 const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2);
1838 const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2);
1839 const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3);
1840 const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3);
1841 _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), lo128(_04));
1842 _mm_storeu_ps(aliasing_cast<float>(&data[i[1]]), lo128(_15));
1843 _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), lo128(_26));
1844 _mm_storeu_ps(aliasing_cast<float>(&data[i[3]]), lo128(_37));
1845 _mm_storeu_ps(aliasing_cast<float>(&data[i[4]]), hi128(_04));
1846 _mm_storeu_ps(aliasing_cast<float>(&data[i[5]]), hi128(_15));
1847 _mm_storeu_ps(aliasing_cast<float>(&data[i[6]]), hi128(_26));
1848 _mm_storeu_ps(aliasing_cast<float>(&data[i[7]]), hi128(_37));
1849 }
1850
1851
1852 static inline void interleave(typename V::EntryType *const data,
1853 const Common::SuccessiveEntries<4> &i,
1854 const typename V::AsArg v0, const typename V::AsArg v1,
1855 const typename V::AsArg v2, const typename V::AsArg v3)
1856 {
1857 using namespace AVX;
1858 const __m256 tmp0 =
1859 _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1860 const __m256 tmp1 =
1861 _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1862 const __m256 tmp2 =
1863 _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
1864 const __m256 tmp3 =
1865 _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
1866 const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2);
1867 const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2);
1868 const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3);
1869 const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3);
1870 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]]),
1871 _mm256_permute2f128_ps(_04, _15, 0x20));
1872 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 8),
1873 _mm256_permute2f128_ps(_26, _37, 0x20));
1874 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 16),
1875 _mm256_permute2f128_ps(_04, _15, 0x31));
1876 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 24),
1877 _mm256_permute2f128_ps(_26, _37, 0x31));
1878 }
1879 template <typename I>
1880 static inline void interleave(typename V::EntryType *const data, const I &i,
1881 const typename V::AsArg v0, const typename V::AsArg v1,
1882 const typename V::AsArg v2, const typename V::AsArg v3,
1883 const typename V::AsArg v4)
1884 {
1885 interleave(data, i, v0, v1, v2, v3);
1886 v4.scatter(data + 4, i);
1887 }
1888 template <typename I>
1889 static inline void interleave(typename V::EntryType *const data, const I &i,
1890 const typename V::AsArg v0, const typename V::AsArg v1,
1891 const typename V::AsArg v2, const typename V::AsArg v3,
1892 const typename V::AsArg v4, const typename V::AsArg v5)
1893 {
1894 interleave(data, i, v0, v1, v2, v3);
1895 interleave(data + 4, i, v4, v5);
1896 }
1897 template <typename I>
1898 static inline void interleave(typename V::EntryType *const data, const I &i,
1899 const typename V::AsArg v0, const typename V::AsArg v1,
1900 const typename V::AsArg v2, const typename V::AsArg v3,
1901 const typename V::AsArg v4, const typename V::AsArg v5,
1902 const typename V::AsArg v6)
1903 {
1904 interleave(data, i, v0, v1, v2, v3);
1905 interleave(data + 4, i, v4, v5, v6);
1906 }
1907 template <typename I>
1908 static inline void interleave(typename V::EntryType *const data, const I &i,
1909 const typename V::AsArg v0, const typename V::AsArg v1,
1910 const typename V::AsArg v2, const typename V::AsArg v3,
1911 const typename V::AsArg v4, const typename V::AsArg v5,
1912 const typename V::AsArg v6, const typename V::AsArg v7)
1913 {
1914 interleave(data, i, v0, v1, v2, v3);
1915 interleave(data + 4, i, v4, v5, v6, v7);
1916 }
1917
1918
1919 template <typename I>
1920 static inline void deinterleave(typename V::EntryType const *const data, const I &i,
1921 V &v0, V &v1)
1922 {
1923 using namespace AVX;
1924 const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[0]]));
1925 const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[2]]));
1926 const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[4]]));
1927 const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[6]]));
1928 const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&data[i[1]]));
1929 const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&data[i[3]]));
1930 const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&data[i[5]]));
1931 const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&data[i[7]]));
1932
1933 const m256 tmp2 = concat(il01, il45);
1934 const m256 tmp3 = concat(il23, il67);
1935
1936 const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
1937 const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3);
1938
1939 v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
1940 v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
1941 }
1942
1943 static inline void deinterleave(typename V::EntryType const *const data,
1944 const Common::SuccessiveEntries<2> &i, V &v0, V &v1)
1945 {
1946 using namespace AVX;
1947 const m256 il0123 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]]));
1948 const m256 il4567 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[4]]));
1949
1950 const m256 tmp2 = Mem::shuffle128<X0, Y0>(il0123, il4567);
1951 const m256 tmp3 = Mem::shuffle128<X1, Y1>(il0123, il4567);
1952
1953 const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
1954 const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3);
1955
1956 v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
1957 v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
1958 }
1959
1960 template <typename I>
1961 static inline void deinterleave(typename V::EntryType const *const data, const I &i,
1962 V &v0, V &v1, V &v2)
1963 {
1964 using namespace AVX;
1965 const m128 il0 = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
1966 const m128 il1 = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
1967 const m128 il2 = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
1968 const m128 il3 = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
1969 const m128 il4 = _mm_loadu_ps(aliasing_cast<float>(&data[i[4]]));
1970 const m128 il5 = _mm_loadu_ps(aliasing_cast<float>(&data[i[5]]));
1971 const m128 il6 = _mm_loadu_ps(aliasing_cast<float>(&data[i[6]]));
1972 const m128 il7 = _mm_loadu_ps(aliasing_cast<float>(&data[i[7]]));
1973
1974 const m256 il04 = concat(il0, il4);
1975 const m256 il15 = concat(il1, il5);
1976 const m256 il26 = concat(il2, il6);
1977 const m256 il37 = concat(il3, il7);
1978 const m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
1979 const m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
1980 const m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
1981 const m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
1982 v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
1983 v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
1984 v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
1985 }
1986
1987 static inline void deinterleave(typename V::EntryType const *const data,
1988 const Common::SuccessiveEntries<3> &i, V &v0, V &v1,
1989 V &v2)
1990 {
1991
1992 __m256 in0 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 0));
1993
1994 __m256 in1 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 8));
1995
1996 __m256 in2 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 16));
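// input: 8 interleaved (x, y, z) triples for entries a..h, where x/y/z are the
// members destined for v0/v1/v2:
// in0 = a.x a.y a.z b.x | b.y b.z c.x c.y
// in1 = c.z d.x d.y d.z | e.x e.y e.z f.x
// in2 = f.y f.z g.x g.y | g.z h.x h.y h.z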
1997
1998
1999
2000
2001
2002 const __m256 aaabffgg = _mm256_permute2f128_ps(in0, in2, 0x20);
2003 const __m256 cdddeeef = in1;
2004 const __m256 bbccghhh = _mm256_permute2f128_ps(in0, in2, 0x31);
2005
2006
2007
2008
2009 const __m256 x0 = _mm256_blend_ps(
2010 _mm256_blend_ps(aaabffgg, cdddeeef, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80),
2011 bbccghhh, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0);
2012 const __m256 x1 = _mm256_blend_ps(
2013 _mm256_blend_ps(aaabffgg, cdddeeef, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0),
2014 bbccghhh, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0);
2015 const __m256 x2 = _mm256_blend_ps(
2016 _mm256_blend_ps(aaabffgg, cdddeeef, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0),
2017 bbccghhh, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80);
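// x0 = a.x d.x c.x b.x | e.x h.x g.x f.x   (all x members)
// x1 = b.y a.y d.y c.y | f.y e.y h.y g.y   (all y members)
// x2 = c.z b.z a.z d.z | g.z f.z e.z h.z   (all z members)
// the shuffles below put the entries back into the order a b c d | e f g h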
2018
2019
2020
2021 v0 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x0, x0, 0x6c));
2022 v1 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x1, x1, 0xb1));
2023 v2 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x2, x2, 0xc6));
2024 }
2025
2026 template <typename I>
2027 static inline void deinterleave(typename V::EntryType const *const data, const I &i,
2028 V &v0, V &v1, V &v2, V &v3)
2029 {
2030 using namespace AVX;
2031 const m128 il0 = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
2032 const m128 il1 = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
2033 const m128 il2 = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
2034 const m128 il3 = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
2035 const m128 il4 = _mm_loadu_ps(aliasing_cast<float>(&data[i[4]]));
2036 const m128 il5 = _mm_loadu_ps(aliasing_cast<float>(&data[i[5]]));
2037 const m128 il6 = _mm_loadu_ps(aliasing_cast<float>(&data[i[6]]));
2038 const m128 il7 = _mm_loadu_ps(aliasing_cast<float>(&data[i[7]]));
2039
2040 const m256 il04 = concat(il0, il4);
2041 const m256 il15 = concat(il1, il5);
2042 const m256 il26 = concat(il2, il6);
2043 const m256 il37 = concat(il3, il7);
2044 const m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
2045 const m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
2046 const m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
2047 const m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
2048 v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
2049 v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
2050 v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
2051 v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(cd0246, cd1357));
2052 }
2053
2054 static inline void deinterleave(typename V::EntryType const *const data,
2055 const Common::SuccessiveEntries<4> &i, V &v0, V &v1,
2056 V &v2, V &v3)
2057 {
2058 using namespace AVX;
2059 const __m256 il01 = _mm256_loadu_ps(
2060 aliasing_cast<float>(&data[i[0]]));
2061 const __m256 il23 = _mm256_loadu_ps(
2062 aliasing_cast<float>(&data[i[2]]));
2063 const __m256 il45 = _mm256_loadu_ps(
2064 aliasing_cast<float>(&data[i[4]]));
2065 const __m256 il67 = _mm256_loadu_ps(
2066 aliasing_cast<float>(&data[i[6]]));
2067
2068 const __m256 il04 = _mm256_permute2f128_ps(il01, il45, 0x20);
2069 const __m256 il15 = _mm256_permute2f128_ps(il01, il45, 0x31);
2070 const __m256 il26 = _mm256_permute2f128_ps(il23, il67, 0x20);
2071 const __m256 il37 = _mm256_permute2f128_ps(il23, il67, 0x31);
2072 const __m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
2073 const __m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
2074 const __m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
2075 const __m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
2076 v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
2077 v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
2078 v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
2079 v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(cd0246, cd1357));
2080 }
2081 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2082 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
2083 {
2084 v4.gather(data + 4, i);
2085 deinterleave(data, i, v0, v1, v2, v3);
2086 }
2087 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2088 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
2089 {
2090 deinterleave(data, i, v0, v1, v2, v3);
2091 deinterleave(data + 4, i, v4, v5);
2092 }
2093 static inline void deinterleave(typename V::EntryType const *const data,
2094 const Common::SuccessiveEntries<6> &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
2095 {
2096 using namespace AVX;
2097 const m256 a = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]]));
2098 const m256 b = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 1 * V::Size]));
2099 const m256 c = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 2 * V::Size]));
2100 const m256 d = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 3 * V::Size]));
2101 const m256 e = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 4 * V::Size]));
2102 const m256 f = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 5 * V::Size]));
2103 const __m256 tmp2 = Mem::shuffle128<X0, Y0>(a, d);
2104 const __m256 tmp3 = Mem::shuffle128<X1, Y1>(b, e);
2105 const __m256 tmp4 = Mem::shuffle128<X1, Y1>(a, d);
2106 const __m256 tmp5 = Mem::shuffle128<X0, Y0>(c, f);
2107 const __m256 tmp8 = Mem::shuffle128<X0, Y0>(b, e);
2108 const __m256 tmp9 = Mem::shuffle128<X1, Y1>(c, f);
2109 const __m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
2110 const __m256 tmp1 = _mm256_unpackhi_ps(tmp4, tmp5);
2111 const __m256 tmp6 = _mm256_unpackhi_ps(tmp2, tmp3);
2112 const __m256 tmp7 = _mm256_unpacklo_ps(tmp8, tmp9);
2113 const __m256 tmp10 = _mm256_unpacklo_ps(tmp4, tmp5);
2114 const __m256 tmp11 = _mm256_unpackhi_ps(tmp8, tmp9);
2115 v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
2116 v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
2117 v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp6, tmp7));
2118 v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp6, tmp7));
2119 v4.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp10, tmp11));
2120 v5.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp10, tmp11));
2121 }
2122 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2123 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
2124 {
2125 deinterleave(data, i, v0, v1, v2, v3);
2126 deinterleave(data + 4, i, v4, v5, v6);
2127 }
2128 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2129 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
2130 {
2131 deinterleave(data, i, v0, v1, v2, v3);
2132 deinterleave(data + 4, i, v4, v5, v6, v7);
2133 }
2134 };
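// Specialization for vectors with 4 entries of 8 bytes each in a 32-byte register
// (e.g. double_v).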
2135 template<typename V> struct InterleaveImpl<V, 4, 32> {
2136 template <typename I>
2137 static inline void interleave(typename V::EntryType *const data, const I &i,
2138 const typename V::AsArg v0, const typename V::AsArg v1)
2139 {
2140 using namespace AVX;
2141 const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
2142 const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
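// tmp0 = a0 b0 | a2 b2, tmp1 = a1 b1 | a3 b3 (a = v0, b = v1)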
2143 _mm_storeu_pd(&data[i[0]], lo128(tmp0));
2144 _mm_storeu_pd(&data[i[1]], lo128(tmp1));
2145 _mm_storeu_pd(&data[i[2]], hi128(tmp0));
2146 _mm_storeu_pd(&data[i[3]], hi128(tmp1));
2147 }
2148 template <typename I>
2149 static inline void interleave(typename V::EntryType *const data, const I &i,
2150 const typename V::AsArg v0, const typename V::AsArg v1,
2151 const typename V::AsArg v2)
2152 {
2153 using namespace AVX;
2154 #ifdef Vc_USE_MASKMOV_SCATTER
2155 const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
2156 const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
2157 const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v2.data());
2158 const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v2.data());
2159
2160 #if defined(Vc_MSVC) && (Vc_MSVC < 170000000 || !defined(_WIN64))
2161
2162 const m256i mask = concat(_mm_setallone_si128(), _mm_set_epi32(0, 0, -1, -1));
2163 #else
2164 const m256i mask = _mm256_set_epi64x(0, -1, -1, -1);
2165 #endif
2166 _mm256_maskstore_pd(&data[i[0]], mask, Mem::shuffle128<X0, Y0>(tmp0, tmp2));
2167 _mm256_maskstore_pd(&data[i[1]], mask, Mem::shuffle128<X0, Y0>(tmp1, tmp3));
2168 _mm256_maskstore_pd(&data[i[2]], mask, Mem::shuffle128<X1, Y1>(tmp0, tmp2));
2169 _mm256_maskstore_pd(&data[i[3]], mask, Mem::shuffle128<X1, Y1>(tmp1, tmp3));
2170 #else
2171 interleave(data, i, v0, v1);
2172 v2.scatter(data + 2, i);
2173 #endif
2174 }
2175 template <typename I>
2176 static inline void interleave(typename V::EntryType *const data, const I &i,
2177 const typename V::AsArg v0, const typename V::AsArg v1,
2178 const typename V::AsArg v2, const typename V::AsArg v3)
2179 {
2180 using namespace AVX;
2181
2182 const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
2183
2184 const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
2185
2186 const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v3.data());
2187
2188 const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v3.data());
2189
2190
2191
2192
2193
2194
2195
2196 _mm_storeu_pd(&data[i[0] ], lo128(tmp0));
2197 _mm_storeu_pd(&data[i[0]+2], lo128(tmp2));
2198 _mm_storeu_pd(&data[i[1] ], lo128(tmp1));
2199 _mm_storeu_pd(&data[i[1]+2], lo128(tmp3));
2200 _mm_storeu_pd(&data[i[2] ], hi128(tmp0));
2201 _mm_storeu_pd(&data[i[2]+2], hi128(tmp2));
2202 _mm_storeu_pd(&data[i[3] ], hi128(tmp1));
2203 _mm_storeu_pd(&data[i[3]+2], hi128(tmp3));
2204 }
2205 template <typename I>
2206 static inline void interleave(typename V::EntryType *const data, const I &i,
2207 const typename V::AsArg v0, const typename V::AsArg v1,
2208 const typename V::AsArg v2, const typename V::AsArg v3,
2209 const typename V::AsArg v4)
2210 {
2211 interleave(data, i, v0, v1, v2, v3);
2212 v4.scatter(data + 4, i);
2213 }
2214 template <typename I>
2215 static inline void interleave(typename V::EntryType *const data, const I &i,
2216 const typename V::AsArg v0, const typename V::AsArg v1,
2217 const typename V::AsArg v2, const typename V::AsArg v3,
2218 const typename V::AsArg v4, const typename V::AsArg v5)
2219 {
2220 interleave(data, i, v0, v1, v2, v3);
2221 interleave(data + 4, i, v4, v5);
2222 }
2223 template <typename I>
2224 static inline void interleave(typename V::EntryType *const data, const I &i,
2225 const typename V::AsArg v0, const typename V::AsArg v1,
2226 const typename V::AsArg v2, const typename V::AsArg v3,
2227 const typename V::AsArg v4, const typename V::AsArg v5,
2228 const typename V::AsArg v6)
2229 {
2230 interleave(data, i, v0, v1, v2, v3);
2231 interleave(data + 4, i, v4, v5, v6);
2232 }
2233 template <typename I>
2234 static inline void interleave(typename V::EntryType *const data, const I &i,
2235 const typename V::AsArg v0, const typename V::AsArg v1,
2236 const typename V::AsArg v2, const typename V::AsArg v3,
2237 const typename V::AsArg v4, const typename V::AsArg v5,
2238 const typename V::AsArg v6, const typename V::AsArg v7)
2239 {
2240 interleave(data, i, v0, v1, v2, v3);
2241 interleave(data + 4, i, v4, v5, v6, v7);
2242 }
2243
2244 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2245 const I &i, V &v0, V &v1)
2246 {
2247 using namespace Vc::AVX;
2248 const m256d ab02 = concat(_mm_loadu_pd(&data[i[0]]), _mm_loadu_pd(&data[i[2]]));
2249 const m256d ab13 = concat(_mm_loadu_pd(&data[i[1]]), _mm_loadu_pd(&data[i[3]]));
2250
2251 v0.data() = _mm256_unpacklo_pd(ab02, ab13);
2252 v1.data() = _mm256_unpackhi_pd(ab02, ab13);
2253 }
2254 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2255 const I &i, V &v0, V &v1, V &v2)
2256 {
2257 v2.gather(data + 2, i);
2258 deinterleave(data, i, v0, v1);
2259 }
2260 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2261 const I &i, V &v0, V &v1, V &v2, V &v3)
2262 {
2263 deinterleave(data, i, v0, v1);
2264 deinterleave(data + 2, i, v2, v3);
2265 }
2266 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2267 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
2268 {
2269 v4.gather(data + 4, i);
2270 deinterleave(data, i, v0, v1);
2271 deinterleave(data + 2, i, v2, v3);
2272 }
2273 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2274 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
2275 {
2276 deinterleave(data, i, v0, v1);
2277 deinterleave(data + 2, i, v2, v3);
2278 deinterleave(data + 4, i, v4, v5);
2279 }
2280 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2281 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
2282 {
2283 v6.gather(data + 6, i);
2284 deinterleave(data, i, v0, v1);
2285 deinterleave(data + 2, i, v2, v3);
2286 deinterleave(data + 4, i, v4, v5);
2287 }
2288 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2289 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
2290 {
2291 deinterleave(data, i, v0, v1);
2292 deinterleave(data + 2, i, v2, v3);
2293 deinterleave(data + 4, i, v4, v5);
2294 deinterleave(data + 6, i, v6, v7);
2295 }
2296 };
2297
2298 }  // namespace Detail
2299 }  // namespace Vc_VERSIONED_NAMESPACE
2300
2301 #endif  // VC_AVX_DETAIL_H_
2302
2303