0028 #ifndef VC_AVX_DETAIL_H_
0029 #define VC_AVX_DETAIL_H_
0030
0031 #include "../sse/detail.h"
0032 #include "macros.h"
0033
0034 namespace Vc_VERSIONED_NAMESPACE
0035 {
0036 namespace Detail
0037 {
0038
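// Generic 32-byte loads, dispatched on the memory-access Flags: the
// EnableIfAligned / EnableIfUnaligned / EnableIfStreaming SFINAE parameters
// select the aligned, unaligned, or streaming (non-temporal) load variant.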
0039 template <typename Flags>
0040 Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
0041 typename Flags::EnableIfAligned = nullptr)
0042 {
0043 return _mm256_load_ps(x);
0044 }
0045 template <typename Flags>
0046 Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
0047 typename Flags::EnableIfUnaligned = nullptr)
0048 {
0049 return _mm256_loadu_ps(x);
0050 }
0051 template <typename Flags>
0052 Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
0053 typename Flags::EnableIfStreaming = nullptr)
0054 {
0055 return AvxIntrinsics::stream_load<__m256>(x);
0056 }
0057
0058 template <typename Flags>
0059 Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
0060 typename Flags::EnableIfAligned = nullptr)
0061 {
0062 return _mm256_load_pd(x);
0063 }
0064 template <typename Flags>
0065 Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
0066 typename Flags::EnableIfUnaligned = nullptr)
0067 {
0068 return _mm256_loadu_pd(x);
0069 }
0070 template <typename Flags>
0071 Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
0072 typename Flags::EnableIfStreaming = nullptr)
0073 {
0074 return AvxIntrinsics::stream_load<__m256d>(x);
0075 }
0076
0077 template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
0078 Vc_INTRINSIC Vc_PURE __m256i
0079 load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfAligned = nullptr)
0080 {
0081 return _mm256_load_si256(reinterpret_cast<const __m256i *>(x));
0082 }
0083 template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
0084 Vc_INTRINSIC Vc_PURE __m256i
0085 load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfUnaligned = nullptr)
0086 {
0087 return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(x));
0088 }
0089 template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
0090 Vc_INTRINSIC Vc_PURE __m256i
0091 load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfStreaming = nullptr)
0092 {
0093 return AvxIntrinsics::stream_load<__m256i>(x);
0094 }
0095
0096
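// load32: load a full 32-byte register; the alignment/streaming policy is
// passed as a tag type (when_aligned / when_unaligned / when_streaming).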
0097 Vc_INTRINSIC __m256 load32(const float *mem, when_aligned)
0098 {
0099 return _mm256_load_ps(mem);
0100 }
0101 Vc_INTRINSIC __m256 load32(const float *mem, when_unaligned)
0102 {
0103 return _mm256_loadu_ps(mem);
0104 }
0105 Vc_INTRINSIC __m256 load32(const float *mem, when_streaming)
0106 {
0107 return AvxIntrinsics::stream_load<__m256>(mem);
0108 }
0109 Vc_INTRINSIC __m256d load32(const double *mem, when_aligned)
0110 {
0111 return _mm256_load_pd(mem);
0112 }
0113 Vc_INTRINSIC __m256d load32(const double *mem, when_unaligned)
0114 {
0115 return _mm256_loadu_pd(mem);
0116 }
0117 Vc_INTRINSIC __m256d load32(const double *mem, when_streaming)
0118 {
0119 return AvxIntrinsics::stream_load<__m256d>(mem);
0120 }
0121 template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_aligned)
0122 {
0123 static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
0124 return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
0125 }
0126 template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_unaligned)
0127 {
0128 static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
0129 return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
0130 }
0131 template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_streaming)
0132 {
0133 static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
0134 return AvxIntrinsics::stream_load<__m256i>(mem);
0135 }
0136
0137
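// MSVC-only overloads of load; functionally these mirror the generic
// templates above, spelled out in a form MSVC accepts.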
0138 #ifdef Vc_MSVC
0139
0140 Vc_INTRINSIC __m256i load(const uint *mem, when_aligned, LoadTag<__m256i, int>)
0141 {
0142 return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
0143 }
0144
0145 Vc_INTRINSIC __m256d load(const double *mem, when_unaligned, LoadTag<__m256d, double>)
0146 {
0147 return _mm256_loadu_pd(mem);
0148 }
0149
0150 template <typename V, typename DstT>
0151 Vc_INTRINSIC __m256 load(const float *mem, when_aligned,
0152 enable_if<(std::is_same<DstT, float>::value &&
0153 std::is_same<V, __m256>::value)> = nullarg)
0154 {
0155 return _mm256_load_ps(mem);
0156 }
0157
0158 template <typename V, typename DstT>
0159 Vc_INTRINSIC __m256 load(const float *mem, when_unaligned,
0160 enable_if<(std::is_same<DstT, float>::value &&
0161 std::is_same<V, __m256>::value)> = nullarg)
0162 {
0163 return _mm256_loadu_ps(mem);
0164 }
0165
0166 template <typename V, typename DstT>
0167 Vc_INTRINSIC __m256 load(const float *mem, when_streaming,
0168 enable_if<(std::is_same<DstT, float>::value &&
0169 std::is_same<V, __m256>::value)> = nullarg)
0170 {
0171 return AvxIntrinsics::stream_load<__m256>(mem);
0172 }
0173
0174 template <typename V, typename DstT>
0175 Vc_INTRINSIC __m256d load(const double *mem, when_aligned,
0176 enable_if<(std::is_same<DstT, double>::value &&
0177 std::is_same<V, __m256d>::value)> = nullarg)
0178 {
0179 return _mm256_load_pd(mem);
0180 }
0181
0182 template <typename V, typename DstT>
0183 Vc_INTRINSIC __m256d load(const double *mem, when_unaligned,
0184 enable_if<(std::is_same<DstT, double>::value &&
0185 std::is_same<V, __m256d>::value)> = nullarg)
0186 {
0187 return _mm256_loadu_pd(mem);
0188 }
0189
0190 template <typename V, typename DstT>
0191 Vc_INTRINSIC __m256d load(const double *mem, when_streaming,
0192 enable_if<(std::is_same<DstT, double>::value &&
0193 std::is_same<V, __m256d>::value)> = nullarg)
0194 {
0195 return AvxIntrinsics::stream_load<__m256d>(mem);
0196 }
0197
0198 template <typename V, typename DstT>
0199 Vc_INTRINSIC __m256i load(const uint *mem, when_aligned,
0200 enable_if<(std::is_same<DstT, uint>::value &&
0201 std::is_same<V, __m256i>::value)> = nullarg)
0202 {
0203 return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
0204 }
0205
0206 template <typename V, typename DstT>
0207 Vc_INTRINSIC __m256i load(const uint *mem, when_unaligned,
0208 enable_if<(std::is_same<DstT, uint>::value &&
0209 std::is_same<V, __m256i>::value)> = nullarg)
0210 {
0211 return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
0212 }
0213
0214 template <typename V, typename DstT>
0215 Vc_INTRINSIC __m256i load(const uint *mem, when_streaming,
0216 enable_if<(std::is_same<DstT, uint>::value &&
0217 std::is_same<V, __m256i>::value)> = nullarg)
0218 {
0219 return AvxIntrinsics::stream_load<__m256i>(mem);
0220 }
0221
0222 template <typename V, typename DstT>
0223 Vc_INTRINSIC __m256i load(const int *mem, when_unaligned,
0224 enable_if<(std::is_same<DstT, int>::value &&
0225 std::is_same<V, __m256i>::value)> = nullarg)
0226 {
0227 return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
0228 }
0229
0230 template <typename V, typename DstT>
0231 Vc_INTRINSIC __m256i load(const int *mem, when_aligned,
0232 enable_if<(std::is_same<DstT, int>::value &&
0233 std::is_same<V, __m256i>::value)> = nullarg)
0234 {
0235 return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
0236 }
0237
0238 template <typename V, typename DstT>
0239 Vc_INTRINSIC __m256i load(const int *mem, when_streaming,
0240 enable_if<(std::is_same<DstT, int>::value &&
0241 std::is_same<V, __m256i>::value)> = nullarg)
0242 {
0243 return AvxIntrinsics::stream_load<__m256i>(mem);
0244 }
0245
0246 template <typename V, typename DstT>
0247 Vc_INTRINSIC __m256i load(const short *mem, when_unaligned,
0248 enable_if<(std::is_same<DstT, short>::value &&
0249 std::is_same<V, __m256i>::value)> = nullarg)
0250 {
0251 return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
0252 }
0253
0254 template <typename V, typename DstT>
0255 Vc_INTRINSIC __m256i load(const short *mem, when_aligned,
0256 enable_if<(std::is_same<DstT, short>::value &&
0257 std::is_same<V, __m256i>::value)> = nullarg)
0258 {
0259 return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
0260 }
0261
0262 template <typename V, typename DstT>
0263 Vc_INTRINSIC __m256i load(const short *mem, when_streaming,
0264 enable_if<(std::is_same<DstT, short>::value &&
0265 std::is_same<V, __m256i>::value)> = nullarg)
0266 {
0267 return AvxIntrinsics::stream_load<__m256i>(mem);
0268 }
0269
0270 template <typename V, typename DstT>
0271 Vc_INTRINSIC __m256i load(const ushort *mem, when_unaligned,
0272 enable_if<(std::is_same<DstT, ushort>::value &&
0273 std::is_same<V, __m256i>::value)> = nullarg)
0274 {
0275 return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
0276 }
0277
0278 template <typename V, typename DstT>
0279 Vc_INTRINSIC __m256i load(const ushort *mem, when_aligned,
0280 enable_if<(std::is_same<DstT, ushort>::value &&
0281 std::is_same<V, __m256i>::value)> = nullarg)
0282 {
0283 return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
0284 }
0285
0286 template <typename V, typename DstT>
0287 Vc_INTRINSIC __m256i load(const ushort *mem, when_streaming,
0288 enable_if<(std::is_same<DstT, ushort>::value &&
0289 std::is_same<V, __m256i>::value)> = nullarg)
0290 {
0291 return AvxIntrinsics::stream_load<__m256i>(mem);
0292 }
0293
0294 #endif
0295
0296
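// Converting loads: read a narrower (or differently signed) element type from
// memory and widen/convert it to the vector's entry type.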
0297 template <typename Flags>
0298 Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, short>)
0299 {
0300 return load32(mem, f);
0301 }
0302 template <typename Flags>
0303 Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, short>)
0304 {
0305 return AVX::cvtepu8_epi16(load16(mem, f));
0306 }
0307 template <typename Flags>
0308 Vc_INTRINSIC __m256i load(const schar *mem, Flags f, LoadTag<__m256i, short>)
0309 {
0310 return AVX::cvtepi8_epi16(load16(mem, f));
0311 }
0312
0313
0314 template <typename Flags>
0315 Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, ushort>)
0316 {
0317 return AVX::cvtepu8_epi16(load16(mem, f));
0318 }
0319
0320
0321 template <typename Flags>
0322 Vc_INTRINSIC __m256i load(const uint *mem, Flags f, LoadTag<__m256i, int>)
0323 {
0324 return load32(mem, f);
0325 }
0326 template <typename Flags>
0327 Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, int>)
0328 {
0329 return AVX::cvtepu16_epi32(load16(mem, f));
0330 }
0331 template <typename Flags>
0332 Vc_INTRINSIC __m256i load(const short *mem, Flags f, LoadTag<__m256i, int>)
0333 {
0334 return AVX::cvtepi16_epi32(load16(mem, f));
0335 }
0336 template <typename Flags>
0337 Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, int>)
0338 {
0339 return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
0340 }
0341 template <typename Flags>
0342 Vc_INTRINSIC __m256i load(const schar *mem, Flags, LoadTag<__m256i, int>)
0343 {
0344 return AVX::cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
0345 }
0346
0347
0348 template <typename Flags>
0349 Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, uint>)
0350 {
0351 return AVX::cvtepu16_epi32(load16(mem, f));
0352 }
0353 template <typename Flags>
0354 Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, uint>)
0355 {
0356 return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
0357 }
0358
0359
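// Loads converting to the 4-entry double vector: a 16-byte load of the source
// type is converted to four doubles.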
0360 template <typename Flags>
0361 Vc_INTRINSIC __m256d load(const float *mem, Flags f, LoadTag<__m256d, double>)
0362 {
0363 return AVX::convert<float, double>(load16(mem, f));
0364 }
0365 template <typename Flags>
0366 Vc_INTRINSIC __m256d load(const uint *mem, Flags f, LoadTag<__m256d, double>)
0367 {
0368 return AVX::convert<uint, double>(load16(mem, f));
0369 }
0370 template <typename Flags>
0371 Vc_INTRINSIC __m256d load(const int *mem, Flags f, LoadTag<__m256d, double>)
0372 {
0373 return AVX::convert<int, double>(load16(mem, f));
0374 }
0375 template <typename Flags>
0376 Vc_INTRINSIC __m256d load(const ushort *mem, Flags f, LoadTag<__m256d, double>)
0377 {
0378 return AVX::convert<ushort, double>(load16(mem, f));
0379 }
0380 template <typename Flags>
0381 Vc_INTRINSIC __m256d load(const short *mem, Flags f, LoadTag<__m256d, double>)
0382 {
0383 return AVX::convert<short, double>(load16(mem, f));
0384 }
0385 template <typename Flags>
0386 Vc_INTRINSIC __m256d load(const uchar *mem, Flags f, LoadTag<__m256d, double>)
0387 {
0388 return AVX::convert<uchar, double>(load16(mem, f));
0389 }
0390 template <typename Flags>
0391 Vc_INTRINSIC __m256d load(const schar *mem, Flags f, LoadTag<__m256d, double>)
0392 {
0393 return AVX::convert<schar, double>(load16(mem, f));
0394 }
0395
0396
0397 template <typename Flags>
0398 Vc_INTRINSIC __m256 load(const double *mem, Flags f, LoadTag<__m256, float>)
0399 {
0400 return AVX::concat(_mm256_cvtpd_ps(load32(&mem[0], f)),
0401 _mm256_cvtpd_ps(load32(&mem[4], f)));
0402 }
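// uint -> float: _mm256_cvtepi32_ps is a signed conversion, so inputs with the
// MSB set are converted as (x - 2^31) + 2^31 and selected via a sign-bit blend.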
0403 template <typename Flags>
0404 Vc_INTRINSIC __m256 load(const uint *mem, Flags f, LoadTag<__m256, float>)
0405 {
0406 const auto v = load32(mem, f);
0407 return _mm256_blendv_ps(
0408 _mm256_cvtepi32_ps(v),
0409 _mm256_add_ps(_mm256_cvtepi32_ps(AVX::sub_epi32(v, AVX::set2power31_epu32())),
0410 AVX::set2power31_ps()),
0411 _mm256_castsi256_ps(AVX::cmplt_epi32(v, _mm256_setzero_si256())));
0412 }
0413 template <typename Flags>
0414 Vc_INTRINSIC __m256 load(const int *mem, Flags f, LoadTag<__m256, float>)
0415 {
0416 return AVX::convert<int, float>(load32(mem, f));
0417 }
0418 template <typename T, typename Flags,
0419 typename = enable_if<!std::is_same<T, float>::value>>
0420 Vc_INTRINSIC __m256 load(const T *mem, Flags f, LoadTag<__m256, float>)
0421 {
0422 return _mm256_cvtepi32_ps(load<__m256i, int>(mem, f));
0423 }
0424 template <typename Flags>
0425 Vc_INTRINSIC __m256 load(const ushort *mem, Flags f, LoadTag<__m256, float>)
0426 {
0427 return AVX::convert<ushort, float>(load16(mem, f));
0428 }
0429 template <typename Flags>
0430 Vc_INTRINSIC __m256 load(const short *mem, Flags f, LoadTag<__m256, float>)
0431 {
0432 return AVX::convert<short, float>(load16(mem, f));
0433 }
0451
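// shifted<amount>(k): shift a 32-byte register by `amount` bytes towards lower
// element positions (negative amounts shift towards higher positions). The
// 128-bit lane boundary is crossed via permute128 + alignr.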
0452 template <int amount, typename T>
0453 Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount >= 16), T> shifted(T k)
0454 {
0455 return AVX::avx_cast<T>(AVX::zeroExtend(
0456 _mm_srli_si128(AVX::hi128(AVX::avx_cast<__m256i>(k)), amount - 16)));
0457 }
0458 template <int amount, typename T>
0459 Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > 0 && amount < 16), T>
0460 shifted(T k)
0461 {
0462 return AVX::avx_cast<T>(
0463 AVX::alignr<amount>(Mem::permute128<X1, Const0>(AVX::avx_cast<__m256i>(k)),
0464 AVX::avx_cast<__m256i>(k)));
0465 }
0466 template <int amount, typename T>
0467 Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount <= -16), T> shifted(T k)
0468 {
0469 return AVX::avx_cast<T>(Mem::permute128<Const0, X0>(AVX::avx_cast<__m256i>(
0470 _mm_slli_si128(AVX::lo128(AVX::avx_cast<__m256i>(k)), -16 - amount))));
0471 }
0472 template <int amount, typename T>
0473 Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > -16 && amount < 0), T>
0474 shifted(T k)
0475 {
0476 return AVX::avx_cast<T>(
0477 AVX::alignr<16 + amount>(AVX::avx_cast<__m256i>(k),
0478 Mem::permute128<Const0, X0>(AVX::avx_cast<__m256i>(k))));
0479 }
0480
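// mask_cast<From, To, R>: reinterpret a mask of From entries as a mask of To
// entries, returned in register type R. The entry counts correspond to the
// AVX mask widths (4: double_v, 8: float_v/(u)int_v, 16: (u)short_v). The
// generic version only allows the trivial case; the specializations below
// repack the mask bits as needed.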
0481 template<size_t From, size_t To, typename R> Vc_INTRINSIC Vc_CONST R mask_cast(__m256i k)
0482 {
0483 static_assert(From == To, "Incorrect mask cast.");
0484 static_assert(std::is_same<R, __m256>::value, "Incorrect mask cast.");
0485 return AVX::avx_cast<__m256>(k);
0486 }
0487
0488
0489 template <> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 4, __m128>(__m256i k)
0490 {
0491 return AVX::avx_cast<__m128>(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)));
0492 }
0493
0494 template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 4, __m256>(__m128i k)
0495 {
0496 const auto kk = _mm_castsi128_ps(k);
0497 return AVX::concat(_mm_unpacklo_ps(kk, kk), _mm_unpackhi_ps(kk, kk));
0498 }
0499
0500
0501 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m256i k)
0502 {
0503
0504 return AVX::avx_cast<__m256>(AVX::concat(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)),
0505 _mm_setzero_si128()));
0506 }
0507
0508 template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m256i k)
0509 {
0510
0511 return AVX::avx_cast<__m128>(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)), _mm_setzero_si128()));
0512 }
0513
0514 template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m128i k)
0515 {
0516 return AVX::zeroExtend(AVX::avx_cast<__m128>(k));
0517 }
0518
0519
0520 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 16, __m256>(__m256i k)
0521 {
0522
0523 return AVX::zeroExtend(mask_cast<4, 8, __m128>(k));
0524 }
0525
0526
0527 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m256i k)
0528 {
0529
0530 const auto lo = AVX::lo128(AVX::avx_cast<__m256>(k));
0531 return AVX::concat(_mm_unpacklo_ps(lo, lo),
0532 _mm_unpackhi_ps(lo, lo));
0533 }
0534
0535 template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m256i k)
0536 {
0537 return AVX::avx_cast<__m128>(AVX::lo128(k));
0538 }
0539
0540 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m128i k)
0541 {
0542
0543 const auto tmp = _mm_unpacklo_epi16(k, k);
0544 return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(tmp, tmp),
0545 _mm_unpackhi_epi32(tmp, tmp)));
0546 }
0547
0548
0549 template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 8, __m128>(__m256i k)
0550 {
0551
0552 return AVX::avx_cast<__m128>(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
0553 }
0554
0555 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 8, __m256>(__m128i k)
0556 {
0557 return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(k, k),
0558 _mm_unpackhi_epi16(k, k)));
0559 }
0560
0561
0562 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 16, __m256>(__m256i k)
0563 {
0564
0565 return AVX::zeroExtend(mask_cast<8, 8, __m128>(k));
0566 }
0567
0568
0569 #ifdef Vc_IMPL_AVX2
0570 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 8, __m256>(__m256i k)
0571 {
0572
0573 const auto flipped = Mem::permute4x64<X0, X2, X1, X3>(k);
0574 return _mm256_castsi256_ps(AVX::unpacklo_epi16(flipped, flipped));
0575 }
0576 #endif
0577
0578
0579 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 4, __m256>(__m256i k)
0580 {
0581
0582 const auto tmp = _mm_unpacklo_epi16(AVX::lo128(k), AVX::lo128(k));
0583 return _mm256_castsi256_ps(AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)));
0584 }
0585
0586
0587 template<> Vc_INTRINSIC Vc_CONST __m256 allone<__m256 >() { return AVX::setallone_ps(); }
0588 template<> Vc_INTRINSIC Vc_CONST __m256i allone<__m256i>() { return AVX::setallone_si256(); }
0589 template<> Vc_INTRINSIC Vc_CONST __m256d allone<__m256d>() { return AVX::setallone_pd(); }
0590
0591
0592 template<> Vc_INTRINSIC Vc_CONST __m256 zero<__m256 >() { return _mm256_setzero_ps(); }
0593 template<> Vc_INTRINSIC Vc_CONST __m256i zero<__m256i>() { return _mm256_setzero_si256(); }
0594 template<> Vc_INTRINSIC Vc_CONST __m256d zero<__m256d>() { return _mm256_setzero_pd(); }
0595
0596
0597 Vc_INTRINSIC Vc_CONST __m256 one( float) { return AVX::setone_ps (); }
0598 Vc_INTRINSIC Vc_CONST __m256d one(double) { return AVX::setone_pd (); }
0599 Vc_INTRINSIC Vc_CONST __m256i one( int) { return AVX::setone_epi32(); }
0600 Vc_INTRINSIC Vc_CONST __m256i one( uint) { return AVX::setone_epu32(); }
0601 Vc_INTRINSIC Vc_CONST __m256i one( short) { return AVX::setone_epi16(); }
0602 Vc_INTRINSIC Vc_CONST __m256i one(ushort) { return AVX::setone_epu16(); }
0603 Vc_INTRINSIC Vc_CONST __m256i one( schar) { return AVX::setone_epi8 (); }
0604 Vc_INTRINSIC Vc_CONST __m256i one( uchar) { return AVX::setone_epu8 (); }
0605
0606
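// negate: flip the sign bit for floating-point vectors; integer vectors use
// sign_epi* with an all-ones (i.e. -1) second operand.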
0607 Vc_ALWAYS_INLINE Vc_CONST __m256 negate(__m256 v, std::integral_constant<std::size_t, 4>)
0608 {
0609 return _mm256_xor_ps(v, AVX::setsignmask_ps());
0610 }
0611 Vc_ALWAYS_INLINE Vc_CONST __m256d negate(__m256d v, std::integral_constant<std::size_t, 8>)
0612 {
0613 return _mm256_xor_pd(v, AVX::setsignmask_pd());
0614 }
0615 Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 4>)
0616 {
0617 return AVX::sign_epi32(v, Detail::allone<__m256i>());
0618 }
0619 Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 2>)
0620 {
0621 return AVX::sign_epi16(v, Detail::allone<__m256i>());
0622 }
0623
0624
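// Bitwise helpers. Without AVX2 there are no 256-bit integer logic
// instructions, so the integer overloads go through the float domain.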
0625 Vc_INTRINSIC __m256 xor_(__m256 a, __m256 b) { return _mm256_xor_ps(a, b); }
0626 Vc_INTRINSIC __m256d xor_(__m256d a, __m256d b) { return _mm256_xor_pd(a, b); }
0627 Vc_INTRINSIC __m256i xor_(__m256i a, __m256i b)
0628 {
0629 #ifdef Vc_IMPL_AVX2
0630 return _mm256_xor_si256(a, b);
0631 #else
0632 return _mm256_castps_si256(
0633 _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
0634 #endif
0635 }
0636
0637
0638 Vc_INTRINSIC __m256 or_(__m256 a, __m256 b) { return _mm256_or_ps(a, b); }
0639 Vc_INTRINSIC __m256d or_(__m256d a, __m256d b) { return _mm256_or_pd(a, b); }
0640 Vc_INTRINSIC __m256i or_(__m256i a, __m256i b)
0641 {
0642 #ifdef Vc_IMPL_AVX2
0643 return _mm256_or_si256(a, b);
0644 #else
0645 return _mm256_castps_si256(
0646 _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
0647 #endif
0648 }
0649
0650
0651 Vc_INTRINSIC __m256 and_(__m256 a, __m256 b) { return _mm256_and_ps(a, b); }
0652 Vc_INTRINSIC __m256d and_(__m256d a, __m256d b) { return _mm256_and_pd(a, b); }
0653 Vc_INTRINSIC __m256i and_(__m256i a, __m256i b) {
0654 #ifdef Vc_IMPL_AVX2
0655 return _mm256_and_si256(a, b);
0656 #else
0657 return _mm256_castps_si256(
0658 _mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
0659 #endif
0660 }
0661
0662
0663 Vc_INTRINSIC __m256 andnot_(__m256 a, __m256 b) { return _mm256_andnot_ps(a, b); }
0664 Vc_INTRINSIC __m256d andnot_(__m256d a, __m256d b) { return _mm256_andnot_pd(a, b); }
0665 Vc_INTRINSIC __m256i andnot_(__m256i a, __m256i b)
0666 {
0667 #ifdef Vc_IMPL_AVX2
0668 return _mm256_andnot_si256(a, b);
0669 #else
0670 return _mm256_castps_si256(
0671 _mm256_andnot_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
0672 #endif
0673 }
0674
0675
0676 Vc_INTRINSIC __m256 not_(__m256 a) { return andnot_(a, allone<__m256 >()); }
0677 Vc_INTRINSIC __m256d not_(__m256d a) { return andnot_(a, allone<__m256d>()); }
0678 Vc_INTRINSIC __m256i not_(__m256i a) { return andnot_(a, allone<__m256i>()); }
0679
0680
0681 Vc_INTRINSIC __m256 blend(__m256 a, __m256 b, __m256 c) { return _mm256_blendv_ps(a, b, c); }
0682 Vc_INTRINSIC __m256d blend(__m256d a, __m256d b, __m256d c) { return _mm256_blendv_pd(a, b, c); }
0683 Vc_INTRINSIC __m256i blend(__m256i a, __m256i b, __m256i c) { return AVX::blendv_epi8(a, b, c); }
0684
0685
0686 Vc_INTRINSIC __m256 abs(__m256 a, float) { return and_(a, AVX::setabsmask_ps()); }
0687 Vc_INTRINSIC __m256d abs(__m256d a, double) { return and_(a, AVX::setabsmask_pd()); }
0688 Vc_INTRINSIC __m256i abs(__m256i a, int) { return AVX::abs_epi32(a); }
0689 Vc_INTRINSIC __m256i abs(__m256i a, uint) { return a; }
0690 Vc_INTRINSIC __m256i abs(__m256i a, short) { return AVX::abs_epi16(a); }
0691 Vc_INTRINSIC __m256i abs(__m256i a, ushort) { return a; }
0692 Vc_INTRINSIC __m256i abs(__m256i a, schar) { return AVX::abs_epi8 (a); }
0693 Vc_INTRINSIC __m256i abs(__m256i a, uchar) { return a; }
0694
0695
0696 Vc_INTRINSIC __m256 add(__m256 a, __m256 b, float) { return _mm256_add_ps(a, b); }
0697 Vc_INTRINSIC __m256d add(__m256d a, __m256d b, double) { return _mm256_add_pd(a, b); }
0698 Vc_INTRINSIC __m256i add(__m256i a, __m256i b, int) { return AVX::add_epi32(a, b); }
0699 Vc_INTRINSIC __m256i add(__m256i a, __m256i b, uint) { return AVX::add_epi32(a, b); }
0700 Vc_INTRINSIC __m256i add(__m256i a, __m256i b, short) { return AVX::add_epi16(a, b); }
0701 Vc_INTRINSIC __m256i add(__m256i a, __m256i b, ushort) { return AVX::add_epi16(a, b); }
0702
0703
0704 Vc_INTRINSIC __m256 sub(__m256 a, __m256 b, float) { return _mm256_sub_ps(a, b); }
0705 Vc_INTRINSIC __m256d sub(__m256d a, __m256d b, double) { return _mm256_sub_pd(a, b); }
0706 Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, int) { return AVX::sub_epi32(a, b); }
0707 Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, uint) { return AVX::sub_epi32(a, b); }
0708 Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, short) { return AVX::sub_epi16(a, b); }
0709 Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, ushort) { return AVX::sub_epi16(a, b); }
0710
0711
0712 Vc_INTRINSIC __m256 mul(__m256 a, __m256 b, float) { return _mm256_mul_ps(a, b); }
0713 Vc_INTRINSIC __m256d mul(__m256d a, __m256d b, double) { return _mm256_mul_pd(a, b); }
0714 Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, int) { return AVX::mullo_epi32(a, b); }
0715 Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, uint) { return AVX::mullo_epi32(a, b); }
0716 Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, short) { return AVX::mullo_epi16(a, b); }
0717 Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, ushort) { return AVX::mullo_epi16(a, b); }
0718
0719
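// Integer division is emulated element-wise via floating-point division
// (double for 32-bit elements, float for 16-bit elements).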
0720 Vc_INTRINSIC __m256 div(__m256 a, __m256 b, float) { return _mm256_div_ps(a, b); }
0721 Vc_INTRINSIC __m256d div(__m256d a, __m256d b, double) { return _mm256_div_pd(a, b); }
0722 Vc_INTRINSIC __m256i div(__m256i a, __m256i b, int) {
0723 using namespace AVX;
0724 const __m256d lo1 = _mm256_cvtepi32_pd(lo128(a));
0725 const __m256d lo2 = _mm256_cvtepi32_pd(lo128(b));
0726 const __m256d hi1 = _mm256_cvtepi32_pd(hi128(a));
0727 const __m256d hi2 = _mm256_cvtepi32_pd(hi128(b));
0728 return concat(_mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)),
0729 _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2)));
0730 }
0731 Vc_INTRINSIC __m256i div(__m256i a, __m256i b, uint) {
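    // cvtepi32_pd is a signed conversion, so both operands are biased by -2^31
    // before converting and 2^31 is added back in the double domain. The final
    // blend keeps a unchanged where b == 1, because a quotient >= 2^31 would not
    // survive the truncating conversion back from double.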
0732
0733
0734
0735
0736 using namespace AVX;
0737 const __m256i aa = add_epi32(a, set1_epi32(-2147483648));
0738 const __m256i bb = add_epi32(b, set1_epi32(-2147483648));
0739 const __m256d loa = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(aa)), set1_pd(2147483648.));
0740 const __m256d hia = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(aa)), set1_pd(2147483648.));
0741 const __m256d lob = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(bb)), set1_pd(2147483648.));
0742 const __m256d hib = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(bb)), set1_pd(2147483648.));
0743
0744
0745 return avx_cast<__m256i>(_mm256_blendv_ps(
0746 avx_cast<__m256>(concat(_mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)),
0747 _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)))),
0748 avx_cast<__m256>(a),
0749 avx_cast<__m256>(cmpeq_epi32(b, setone_epi32()))));
0750 }
0751 Vc_INTRINSIC __m256i div(__m256i a, __m256i b, short) {
0752 using namespace AVX;
0753 const __m256 lo =
0754 _mm256_div_ps(convert<short, float>(lo128(a)), convert<short, float>(lo128(b)));
0755 const __m256 hi =
0756 _mm256_div_ps(convert<short, float>(hi128(a)), convert<short, float>(hi128(b)));
0757 return concat(convert<float, short>(lo), convert<float, short>(hi));
0758 }
0759
0760
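// Horizontal reductions: combine the two 128-bit halves and defer to the SSE
// implementations of the same name.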
0761 template <typename T> Vc_INTRINSIC T add(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
0762 {
0763 return {add(add(AVX::lo128(a), AVX::hi128(a), T()), T())};
0764 }
0765
0766
0767 template <typename T> Vc_INTRINSIC T mul(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
0768 {
0769 return {mul(mul(AVX::lo128(a), AVX::hi128(a), T()), T())};
0770 }
0771
0772
0773 template <typename T> Vc_INTRINSIC T min(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
0774 {
0775 return {min(min(AVX::lo128(a), AVX::hi128(a), T()), T())};
0776 }
0777
0778
0779 template <typename T> Vc_INTRINSIC T max(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
0780 {
0781 return {max(max(AVX::lo128(a), AVX::hi128(a), T()), T())};
0782 }
0783
0784 Vc_INTRINSIC __m256 cmpeq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpeq_ps(a, b); }
0785 Vc_INTRINSIC __m256d cmpeq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpeq_pd(a, b); }
0786 Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, int) { return AvxIntrinsics::cmpeq_epi32(a, b); }
0787 Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, uint) { return AvxIntrinsics::cmpeq_epi32(a, b); }
0788 Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, short) { return AvxIntrinsics::cmpeq_epi16(a, b); }
0789 Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, ushort) { return AvxIntrinsics::cmpeq_epi16(a, b); }
0790
0791
0792 Vc_INTRINSIC __m256 cmpneq(__m256 a, __m256 b, float) { return AvxIntrinsics::cmpneq_ps(a, b); }
0793 Vc_INTRINSIC __m256d cmpneq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpneq_pd(a, b); }
0794 Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, int) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); }
0795 Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uint) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); }
0796 Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, short) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); }
0797 Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, ushort) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); }
0798 Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, schar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); }
0799 Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, uchar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); }
0800
0801
0802 Vc_INTRINSIC __m256 cmpgt(__m256 a, __m256 b, float) { return AVX::cmpgt_ps(a, b); }
0803 Vc_INTRINSIC __m256d cmpgt(__m256d a, __m256d b, double) { return AVX::cmpgt_pd(a, b); }
0804 Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(a, b); }
0805 Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(a, b); }
0806 Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(a, b); }
0807 Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(a, b); }
0808 Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (a, b); }
0809 Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (a, b); }
0810
0811
0812 Vc_INTRINSIC __m256 cmpge(__m256 a, __m256 b, float) { return AVX::cmpge_ps(a, b); }
0813 Vc_INTRINSIC __m256d cmpge(__m256d a, __m256d b, double) { return AVX::cmpge_pd(a, b); }
0814 Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(b, a)); }
0815 Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(b, a)); }
0816 Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(b, a)); }
0817 Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(b, a)); }
0818 Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (b, a)); }
0819 Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (b, a)); }
0820
0821
0822 Vc_INTRINSIC __m256 cmple(__m256 a, __m256 b, float) { return AVX::cmple_ps(a, b); }
0823 Vc_INTRINSIC __m256d cmple(__m256d a, __m256d b, double) { return AVX::cmple_pd(a, b); }
0824 Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, int) { return not_(AVX::cmpgt_epi32(a, b)); }
0825 Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uint) { return not_(AVX::cmpgt_epu32(a, b)); }
0826 Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, short) { return not_(AVX::cmpgt_epi16(a, b)); }
0827 Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(a, b)); }
0828 Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, schar) { return not_(AVX::cmpgt_epi8 (a, b)); }
0829 Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, uchar) { return not_(AVX::cmpgt_epu8 (a, b)); }
0830
0831
0832 Vc_INTRINSIC __m256 cmplt(__m256 a, __m256 b, float) { return AVX::cmplt_ps(a, b); }
0833 Vc_INTRINSIC __m256d cmplt(__m256d a, __m256d b, double) { return AVX::cmplt_pd(a, b); }
0834 Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, int) { return AVX::cmpgt_epi32(b, a); }
0835 Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uint) { return AVX::cmpgt_epu32(b, a); }
0836 Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, short) { return AVX::cmpgt_epi16(b, a); }
0837 Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(b, a); }
0838 Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, schar) { return AVX::cmpgt_epi8 (b, a); }
0839 Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, uchar) { return AVX::cmpgt_epu8 (b, a); }
0840
0841
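// Fused multiply-add. Without FMA/FMA4 hardware support, the float version
// computes the product in double precision; the double version splits the
// operands into high/low parts (via highMaskDouble) so the low-order product
// bits are not lost before the final summation.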
0842 Vc_INTRINSIC __m256 fma(__m256 a, __m256 b, __m256 c, float) {
0843 #ifdef Vc_IMPL_FMA4
0844 return _mm256_macc_ps(a, b, c);
0845 #elif defined Vc_IMPL_FMA
0846 return _mm256_fmadd_ps(a, b, c);
0847 #else
0848 using namespace AVX;
0849 __m256d v1_0 = _mm256_cvtps_pd(lo128(a));
0850 __m256d v1_1 = _mm256_cvtps_pd(hi128(a));
0851 __m256d v2_0 = _mm256_cvtps_pd(lo128(b));
0852 __m256d v2_1 = _mm256_cvtps_pd(hi128(b));
0853 __m256d v3_0 = _mm256_cvtps_pd(lo128(c));
0854 __m256d v3_1 = _mm256_cvtps_pd(hi128(c));
0855 return concat(_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
0856 _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
0857 #endif
0858 }
0859 Vc_INTRINSIC __m256d fma(__m256d a, __m256d b, __m256d c, double)
0860 {
0861 #ifdef Vc_IMPL_FMA4
0862 return _mm256_macc_pd(a, b, c);
0863 #elif defined Vc_IMPL_FMA
0864 return _mm256_fmadd_pd(a, b, c);
0865 #else
0866 using namespace AVX;
0867 __m256d h1 = and_(a, _mm256_broadcast_sd(reinterpret_cast<const double *>(
0868 &c_general::highMaskDouble)));
0869 __m256d h2 = and_(b, _mm256_broadcast_sd(reinterpret_cast<const double *>(
0870 &c_general::highMaskDouble)));
0871 const __m256d l1 = _mm256_sub_pd(a, h1);
0872 const __m256d l2 = _mm256_sub_pd(b, h2);
0873 const __m256d ll = mul(l1, l2, double());
0874 const __m256d lh = add(mul(l1, h2, double()), mul(h1, l2, double()), double());
0875 const __m256d hh = mul(h1, h2, double());
0876
0877 const __m256d lh_lt_v3 = cmplt(abs(lh, double()), abs(c, double()), double());
0878 const __m256d x = _mm256_blendv_pd(c, lh, lh_lt_v3);
0879 const __m256d y = _mm256_blendv_pd(lh, c, lh_lt_v3);
0880 return add(add(ll, x, double()), add(y, hh, double()), double());
0881 #endif
0882 }
0883 template <typename T> Vc_INTRINSIC __m256i fma(__m256i a, __m256i b, __m256i c, T)
0884 {
0885 return add(mul(a, b, T()), c, T());
0886 }
0887
0888
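// Shifts: arithmetic right shifts for signed element types, logical right
// shifts for unsigned; shiftLeft is the same for both.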
0889 template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, int) { return AVX::srai_epi32<shift>(a); }
0890 template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, uint) { return AVX::srli_epi32<shift>(a); }
0891 template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, short) { return AVX::srai_epi16<shift>(a); }
0892 template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, ushort) { return AVX::srli_epi16<shift>(a); }
0893
0894
0895
0896 Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, int) { return AVX::sra_epi32(a, _mm_cvtsi32_si128(shift)); }
0897 Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, uint) { return AVX::srl_epi32(a, _mm_cvtsi32_si128(shift)); }
0898 Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, short) { return AVX::sra_epi16(a, _mm_cvtsi32_si128(shift)); }
0899 Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, ushort) { return AVX::srl_epi16(a, _mm_cvtsi32_si128(shift)); }
0900
0901
0902
0903
0904 template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, int) { return AVX::slli_epi32<shift>(a); }
0905 template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, uint) { return AVX::slli_epi32<shift>(a); }
0906 template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, short) { return AVX::slli_epi16<shift>(a); }
0907 template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, ushort) { return AVX::slli_epi16<shift>(a); }
0908
0909
0910
0911 Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, int) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); }
0912 Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, uint) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); }
0913 Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, short) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); }
0914 Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, ushort) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); }
0915
0916
0917
0918
0919 Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m256 x) { return x; }
0920 Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m256d x) { return x; }
0921 Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m256i x) { return x; }
0922 Vc_INTRINSIC __m256 zeroExtendIfNeeded(__m128 x) { return AVX::zeroExtend(x); }
0923 Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m128d x) { return AVX::zeroExtend(x); }
0924 Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m128i x) { return AVX::zeroExtend(x); }
0925
0926
0927 Vc_INTRINSIC __m256 avx_broadcast( float x) { return _mm256_set1_ps(x); }
0928 Vc_INTRINSIC __m256d avx_broadcast(double x) { return _mm256_set1_pd(x); }
0929 Vc_INTRINSIC __m256i avx_broadcast( int x) { return _mm256_set1_epi32(x); }
0930 Vc_INTRINSIC __m256i avx_broadcast( uint x) { return _mm256_set1_epi32(x); }
0931 Vc_INTRINSIC __m256i avx_broadcast( short x) { return _mm256_set1_epi16(x); }
0932 Vc_INTRINSIC __m256i avx_broadcast(ushort x) { return _mm256_set1_epi16(x); }
0933 Vc_INTRINSIC __m256i avx_broadcast( char x) { return _mm256_set1_epi8(x); }
0934 Vc_INTRINSIC __m256i avx_broadcast( schar x) { return _mm256_set1_epi8(x); }
0935 Vc_INTRINSIC __m256i avx_broadcast( uchar x) { return _mm256_set1_epi8(x); }
0936
0937
0938 template <Vc::Implementation Impl, typename T,
0939 typename = enable_if<(Impl >= AVXImpl && Impl <= AVX2Impl)>>
0940 Vc_CONST_L AVX2::Vector<T> Vc_VDECL sorted(AVX2::Vector<T> x) Vc_CONST_R;
0941 template <typename T> Vc_INTRINSIC Vc_CONST AVX2::Vector<T> sorted(AVX2::Vector<T> x)
0942 {
0943 return sorted<CurrentImplementation::current()>(x);
0944 }
0945
0946
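// Run-time element shift: dispatch the run-time amount to the compile-time
// shifted<amount> above; amounts not covered by the switch yield a zero vector.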
0947 template <typename T, typename V>
0948 static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32), V> shifted(V v, int amount)
0949 {
0950 using namespace AVX;
0951 constexpr int S = sizeof(T);
0952 switch (amount) {
0953 case 0: return v;
0954 case 1: return shifted<sanitize<V>( 1 * S)>(v);
0955 case 2: return shifted<sanitize<V>( 2 * S)>(v);
0956 case 3: return shifted<sanitize<V>( 3 * S)>(v);
0957 case -1: return shifted<sanitize<V>(-1 * S)>(v);
0958 case -2: return shifted<sanitize<V>(-2 * S)>(v);
0959 case -3: return shifted<sanitize<V>(-3 * S)>(v);
0960 }
0961 if (sizeof(T) <= 4) {
0962 switch (amount) {
0963 case 4: return shifted<sanitize<V>( 4 * S)>(v);
0964 case 5: return shifted<sanitize<V>( 5 * S)>(v);
0965 case 6: return shifted<sanitize<V>( 6 * S)>(v);
0966 case 7: return shifted<sanitize<V>( 7 * S)>(v);
0967 case -4: return shifted<sanitize<V>(-4 * S)>(v);
0968 case -5: return shifted<sanitize<V>(-5 * S)>(v);
0969 case -6: return shifted<sanitize<V>(-6 * S)>(v);
0970 case -7: return shifted<sanitize<V>(-7 * S)>(v);
0971 }
0972 if (sizeof(T) <= 2) {
0973 switch (amount) {
0974 case 8: return shifted<sanitize<V>( 8 * S)>(v);
0975 case 9: return shifted<sanitize<V>( 9 * S)>(v);
0976 case 10: return shifted<sanitize<V>( 10 * S)>(v);
0977 case 11: return shifted<sanitize<V>( 11 * S)>(v);
0978 case 12: return shifted<sanitize<V>( 12 * S)>(v);
0979 case 13: return shifted<sanitize<V>( 13 * S)>(v);
0980 case 14: return shifted<sanitize<V>( 14 * S)>(v);
0981 case 15: return shifted<sanitize<V>( 15 * S)>(v);
0982 case -8: return shifted<sanitize<V>(- 8 * S)>(v);
0983 case -9: return shifted<sanitize<V>(- 9 * S)>(v);
0984 case -10: return shifted<sanitize<V>(-10 * S)>(v);
0985 case -11: return shifted<sanitize<V>(-11 * S)>(v);
0986 case -12: return shifted<sanitize<V>(-12 * S)>(v);
0987 case -13: return shifted<sanitize<V>(-13 * S)>(v);
0988 case -14: return shifted<sanitize<V>(-14 * S)>(v);
0989 case -15: return shifted<sanitize<V>(-15 * S)>(v);
0990 }
0991 if (sizeof(T) == 1) {
0992 switch (amount) {
0993 case 16: return shifted<sanitize<V>( 16)>(v);
0994 case 17: return shifted<sanitize<V>( 17)>(v);
0995 case 18: return shifted<sanitize<V>( 18)>(v);
0996 case 19: return shifted<sanitize<V>( 19)>(v);
0997 case 20: return shifted<sanitize<V>( 20)>(v);
0998 case 21: return shifted<sanitize<V>( 21)>(v);
0999 case 22: return shifted<sanitize<V>( 22)>(v);
1000 case 23: return shifted<sanitize<V>( 23)>(v);
1001 case 24: return shifted<sanitize<V>( 24)>(v);
1002 case 25: return shifted<sanitize<V>( 25)>(v);
1003 case 26: return shifted<sanitize<V>( 26)>(v);
1004 case 27: return shifted<sanitize<V>( 27)>(v);
1005 case 28: return shifted<sanitize<V>( 28)>(v);
1006 case 29: return shifted<sanitize<V>( 29)>(v);
1007 case 30: return shifted<sanitize<V>( 30)>(v);
1008 case 31: return shifted<sanitize<V>( 31)>(v);
1009 case -16: return shifted<sanitize<V>(-16)>(v);
1010 case -17: return shifted<sanitize<V>(-17)>(v);
1011 case -18: return shifted<sanitize<V>(-18)>(v);
1012 case -19: return shifted<sanitize<V>(-19)>(v);
1013 case -20: return shifted<sanitize<V>(-20)>(v);
1014 case -21: return shifted<sanitize<V>(-21)>(v);
1015 case -22: return shifted<sanitize<V>(-22)>(v);
1016 case -23: return shifted<sanitize<V>(-23)>(v);
1017 case -24: return shifted<sanitize<V>(-24)>(v);
1018 case -25: return shifted<sanitize<V>(-25)>(v);
1019 case -26: return shifted<sanitize<V>(-26)>(v);
1020 case -27: return shifted<sanitize<V>(-27)>(v);
1021 case -28: return shifted<sanitize<V>(-28)>(v);
1022 case -29: return shifted<sanitize<V>(-29)>(v);
1023 case -30: return shifted<sanitize<V>(-30)>(v);
1024 case -31: return shifted<sanitize<V>(-31)>(v);
1025 }
1026 }
1027 }
1028 }
1029 return avx_cast<V>(_mm256_setzero_ps());
1030 }
1031
1032 template <typename T, typename V>
1033 static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> shifted(V v, int amount)
1034 {
1035 using namespace AVX;
1036 switch (amount) {
1037 case 0: return v;
1038 case 1: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(1 * sizeof(T))));
1039 case 2: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(2 * sizeof(T))));
1040 case 3: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(3 * sizeof(T))));
1041 case -1: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(1 * sizeof(T))));
1042 case -2: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(2 * sizeof(T))));
1043 case -3: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(3 * sizeof(T))));
1044 }
1045 if (sizeof(T) <= 2) {
1046 switch (amount) {
1047 case 4: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(4 * sizeof(T))));
1048 case 5: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(5 * sizeof(T))));
1049 case 6: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(6 * sizeof(T))));
1050 case 7: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(7 * sizeof(T))));
1051 case -4: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(4 * sizeof(T))));
1052 case -5: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(5 * sizeof(T))));
1053 case -6: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(6 * sizeof(T))));
1054 case -7: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(7 * sizeof(T))));
1055 }
1056 }
1057 return avx_cast<V>(_mm_setzero_ps());
1058 }
1059
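// rotated<T, N>: rotate the N elements of a 32-byte vector by a run-time
// amount, using alignr on the two (possibly swapped) 128-bit halves.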
1060 template <typename T, size_t N, typename V>
1061 static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 4), V> rotated(V v,
1062 int amount)
1063 {
1064 using namespace AVX;
1065 const __m128i vLo = avx_cast<__m128i>(lo128(v));
1066 const __m128i vHi = avx_cast<__m128i>(hi128(v));
1067 switch (static_cast<unsigned int>(amount) % N) {
1068 case 0:
1069 return v;
1070 case 1:
1071 return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vHi, vLo),
1072 SSE::alignr_epi8<sizeof(T)>(vLo, vHi)));
1073 case 2:
1074 return Mem::permute128<X1, X0>(v);
1075 case 3:
1076 return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vLo, vHi),
1077 SSE::alignr_epi8<sizeof(T)>(vHi, vLo)));
1078 }
1079 return avx_cast<V>(_mm256_setzero_ps());
1080 }
1081
1082 template <typename T, size_t N, typename V>
1083 static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 8), V> rotated(V v,
1084 int amount)
1085 {
1086 using namespace AVX;
1087 const __m128i vLo = avx_cast<__m128i>(lo128(v));
1088 const __m128i vHi = avx_cast<__m128i>(hi128(v));
1089 switch (static_cast<unsigned int>(amount) % N) {
1090 case 0:
1091 return v;
1092 case 1:
1093 return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
1094 SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
1095 case 2:
1096 return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
1097 SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
1098 case 3:
1099 return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
1100 SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
1101 case 4:
1102 return Mem::permute128<X1, X0>(v);
1103 case 5:
1104 return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
1105 SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
1106 case 6:
1107 return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
1108 SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
1109 case 7:
1110 return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
1111 SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
1112 }
1113 return avx_cast<V>(_mm256_setzero_ps());
1114 }
1115
1116 #ifdef Vc_IMPL_AVX2
1117 template <typename T, size_t N, typename V>
1118 static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 16), V> rotated(
1119 V v, int amount)
1120 {
1121 using namespace AVX;
1122 const __m128i vLo = avx_cast<__m128i>(lo128(v));
1123 const __m128i vHi = avx_cast<__m128i>(hi128(v));
1124 switch (static_cast<unsigned int>(amount) % N) {
1125 case 0:
1126 return v;
1127 case 1:
1128 return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
1129 SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
1130 case 2:
1131 return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
1132 SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
1133 case 3:
1134 return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
1135 SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
1136 case 4:
1137 return Mem::permute4x64<X1, X2, X3, X0>(v);
1138 case 5:
1139 return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo),
1140 SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi)));
1141 case 6:
1142 return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo),
1143 SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi)));
1144 case 7:
1145 return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo),
1146 SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi)));
1147 case 8:
1148 return Mem::permute128<X1, X0>(v);
1149 case 9:
1150 return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
1151 SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
1152 case 10:
1153 return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
1154 SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
1155 case 11:
1156 return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
1157 SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
1158 case 12:
1159 return Mem::permute4x64<X3, X0, X1, X2>(v);
1160 case 13:
1161 return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi),
1162 SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo)));
1163 case 14:
1164 return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi),
1165 SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo)));
1166 case 15:
1167 return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi),
1168 SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo)));
1169 }
1170 return avx_cast<V>(_mm256_setzero_ps());
1171 }
1172 #endif
1173
1174
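// Thin wrappers around the (v)ptest / vtestps / vtestpd test instructions and
// the movemask intrinsics.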
1175 Vc_INTRINSIC Vc_CONST int testc(__m128 a, __m128 b) { return _mm_testc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
1176 Vc_INTRINSIC Vc_CONST int testc(__m256 a, __m256 b) { return _mm256_testc_ps(a, b); }
1177 Vc_INTRINSIC Vc_CONST int testc(__m256d a, __m256d b) { return _mm256_testc_pd(a, b); }
1178 Vc_INTRINSIC Vc_CONST int testc(__m256i a, __m256i b) { return _mm256_testc_si256(a, b); }
1179
1180
1181 Vc_INTRINSIC Vc_CONST int testz(__m128 a, __m128 b) { return _mm_testz_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
1182 Vc_INTRINSIC Vc_CONST int testz(__m256 a, __m256 b) { return _mm256_testz_ps(a, b); }
1183 Vc_INTRINSIC Vc_CONST int testz(__m256d a, __m256d b) { return _mm256_testz_pd(a, b); }
1184 Vc_INTRINSIC Vc_CONST int testz(__m256i a, __m256i b) { return _mm256_testz_si256(a, b); }
1185
1186
1187 Vc_INTRINSIC Vc_CONST int testnzc(__m128 a, __m128 b) { return _mm_testnzc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
1188 Vc_INTRINSIC Vc_CONST int testnzc(__m256 a, __m256 b) { return _mm256_testnzc_ps(a, b); }
1189 Vc_INTRINSIC Vc_CONST int testnzc(__m256d a, __m256d b) { return _mm256_testnzc_pd(a, b); }
1190 Vc_INTRINSIC Vc_CONST int testnzc(__m256i a, __m256i b) { return _mm256_testnzc_si256(a, b); }
1191
1192
1193 Vc_INTRINSIC Vc_CONST int movemask(__m256i a) { return AVX::movemask_epi8(a); }
1194 Vc_INTRINSIC Vc_CONST int movemask(__m128i a) { return _mm_movemask_epi8(a); }
1195 Vc_INTRINSIC Vc_CONST int movemask(__m256d a) { return _mm256_movemask_pd(a); }
1196 Vc_INTRINSIC Vc_CONST int movemask(__m128d a) { return _mm_movemask_pd(a); }
1197 Vc_INTRINSIC Vc_CONST int movemask(__m256 a) { return _mm256_movemask_ps(a); }
1198 Vc_INTRINSIC Vc_CONST int movemask(__m128 a) { return _mm_movemask_ps(a); }
1199
1200
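// mask_store: convert an N-entry mask register into N contiguous bool values
// (one byte, 0 or 1, per entry).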
1201 template <size_t N, typename Flags>
1202 Vc_INTRINSIC void mask_store(__m256i k, bool *mem, Flags)
1203 {
1204 static_assert(
1205 N == 4 || N == 8 || N == 16,
1206 "mask_store(__m256i, bool *) is only implemented for 4, 8, and 16 entries");
1207 switch (N) {
1208 case 4:
1209 *aliasing_cast<int32_t>(mem) = (_mm_movemask_epi8(AVX::lo128(k)) |
1210 (_mm_movemask_epi8(AVX::hi128(k)) << 16)) &
1211 0x01010101;
1212 break;
1213 case 8: {
1214 const auto k2 = _mm_srli_epi16(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)), 15);
1215 const auto k3 = _mm_packs_epi16(k2, _mm_setzero_si128());
1216 #ifdef __x86_64__
1217 *aliasing_cast<int64_t>(mem) = _mm_cvtsi128_si64(k3);
1218 #else
1219 *aliasing_cast<int32_t>(mem) = _mm_cvtsi128_si32(k3);
1220 *aliasing_cast<int32_t>(mem + 4) = _mm_extract_epi32(k3, 1);
1221 #endif
1222 } break;
1223 case 16: {
1224 const auto bools = Detail::and_(_mm_set1_epi8(1),
1225 _mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
1226 if (Flags::IsAligned) {
1227 _mm_store_si128(reinterpret_cast<__m128i *>(mem), bools);
1228 } else {
1229 _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), bools);
1230 }
1231 } break;
1232 default:
1233 Vc_UNREACHABLE();
1234 }
1235 }
1236
1237
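// mask_load: the inverse of mask_store, expanding N bool values from memory
// into a mask register of the requested width.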
1238 template <typename R, size_t N, typename Flags>
1239 Vc_INTRINSIC R mask_load(const bool *mem, Flags,
1240 enable_if<std::is_same<R, __m128>::value> = nullarg)
1241 {
1242 static_assert(N == 4 || N == 8,
1243 "mask_load<__m128>(const bool *) is only implemented for 4, 8 entries");
1244 switch (N) {
1245 case 4: {
1246 __m128i k = _mm_cvtsi32_si128(*aliasing_cast<int32_t>(mem));
1247 k = _mm_unpacklo_epi8(k, k);
1248 k = _mm_unpacklo_epi16(k, k);
1249 k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
1250 return AVX::avx_cast<__m128>(k);
1251 }
1252 case 8: {
1253 #ifdef __x86_64__
1254 __m128i k = _mm_cvtsi64_si128(*aliasing_cast<int64_t>(mem));
1255 #else
1256 __m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast<double>(mem)));
1257 #endif
1258 return AVX::avx_cast<__m128>(
1259 _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
1260 }
1261 default:
1262 Vc_UNREACHABLE();
1263 }
1264 }
1265
1266 template <typename R, size_t N, typename Flags>
1267 Vc_INTRINSIC R mask_load(const bool *mem, Flags,
1268 enable_if<std::is_same<R, __m256>::value> = nullarg)
1269 {
1270 static_assert(
1271 N == 4 || N == 8 || N == 16,
1272 "mask_load<__m256>(const bool *) is only implemented for 4, 8, and 16 entries");
1273 switch (N) {
1274 case 4: {
1275 __m128i k = AVX::avx_cast<__m128i>(_mm_and_ps(
1276 _mm_set1_ps(*aliasing_cast<float>(mem)),
1277 AVX::avx_cast<__m128>(_mm_setr_epi32(0x1, 0x100, 0x10000, 0x1000000))));
1278 k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
1279 return AVX::avx_cast<__m256>(
1280 AVX::concat(_mm_unpacklo_epi32(k, k), _mm_unpackhi_epi32(k, k)));
1281 }
1282 case 8: {
1283 #ifdef __x86_64__
1284 __m128i k = _mm_cvtsi64_si128(*aliasing_cast<int64_t>(mem));
1285 #else
1286 __m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast<double>(mem)));
1287 #endif
1288 k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
1289 return AVX::avx_cast<__m256>(
1290 AVX::concat(_mm_unpacklo_epi16(k, k), _mm_unpackhi_epi16(k, k)));
1291 }
1292 case 16: {
1293 const auto k128 = _mm_cmpgt_epi8(
1294 Flags::IsAligned ? _mm_load_si128(reinterpret_cast<const __m128i *>(mem))
1295 : _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem)),
1296 _mm_setzero_si128());
1297 return AVX::avx_cast<__m256>(
1298 AVX::concat(_mm_unpacklo_epi8(k128, k128), _mm_unpackhi_epi8(k128, k128)));
1299 }
1300 default:
1301 Vc_UNREACHABLE();
1302 return R();
1303 }
1304 }
1305
1306
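// mask_to_int<Size>: compress a Size-entry mask register into an integer with
// one bit per entry.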
1307 template <size_t Size>
1308 Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m256i x) Vc_INTRINSIC_R Vc_CONST_R;
1309 template <> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m256i k)
1310 {
1311 return movemask(AVX::avx_cast<__m256d>(k));
1312 }
1313 template <> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m256i k)
1314 {
1315 return movemask(AVX::avx_cast<__m256>(k));
1316 }
1317 #ifdef Vc_IMPL_BMI2
1318 template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k)
1319 {
1320 return _pext_u32(movemask(k), 0x55555555u);
1321 }
1322 #endif
1323 template <> Vc_INTRINSIC Vc_CONST int mask_to_int<32>(__m256i k)
1324 {
1325 return movemask(k);
1326 }
1327
1328
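// InterleaveImpl<V, 16, 32>: interleaved stores for 16-entry, 32-byte vectors
// ((u)short_v), used for structure-of-arrays style interleaved memory access.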
1329 template<typename V> struct InterleaveImpl<V, 16, 32> {
1330 template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
1331 const typename V::AsArg v0,
1332 const typename V::AsArg v1)
1333 {
1334 const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data());
1335 const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data());
1336 using namespace AVX;
1337 *aliasing_cast<uint32_t>(&data[i[ 0]]) = _mm_cvtsi128_si32(lo128(tmp0));
1338 *aliasing_cast<uint32_t>(&data[i[ 1]]) = _mm_extract_epi32(lo128(tmp0), 1);
1339 *aliasing_cast<uint32_t>(&data[i[ 2]]) = _mm_extract_epi32(lo128(tmp0), 2);
1340 *aliasing_cast<uint32_t>(&data[i[ 3]]) = _mm_extract_epi32(lo128(tmp0), 3);
1341 *aliasing_cast<uint32_t>(&data[i[ 4]]) = _mm_cvtsi128_si32(lo128(tmp1));
1342 *aliasing_cast<uint32_t>(&data[i[ 5]]) = _mm_extract_epi32(lo128(tmp1), 1);
1343 *aliasing_cast<uint32_t>(&data[i[ 6]]) = _mm_extract_epi32(lo128(tmp1), 2);
1344 *aliasing_cast<uint32_t>(&data[i[ 7]]) = _mm_extract_epi32(lo128(tmp1), 3);
1345 *aliasing_cast<uint32_t>(&data[i[ 8]]) = _mm_cvtsi128_si32(hi128(tmp0));
1346 *aliasing_cast<uint32_t>(&data[i[ 9]]) = _mm_extract_epi32(hi128(tmp0), 1);
1347 *aliasing_cast<uint32_t>(&data[i[10]]) = _mm_extract_epi32(hi128(tmp0), 2);
1348 *aliasing_cast<uint32_t>(&data[i[11]]) = _mm_extract_epi32(hi128(tmp0), 3);
1349 *aliasing_cast<uint32_t>(&data[i[12]]) = _mm_cvtsi128_si32(hi128(tmp1));
1350 *aliasing_cast<uint32_t>(&data[i[13]]) = _mm_extract_epi32(hi128(tmp1), 1);
1351 *aliasing_cast<uint32_t>(&data[i[14]]) = _mm_extract_epi32(hi128(tmp1), 2);
1352 *aliasing_cast<uint32_t>(&data[i[15]]) = _mm_extract_epi32(hi128(tmp1), 3);
1353 }
1354 static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
1355 const typename V::AsArg v0, const typename V::AsArg v1)
1356 {
1357 const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data());
1358 const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data());
1359 V(Mem::shuffle128<X0, Y0>(tmp0, tmp1)).store(&data[i[0]], Vc::Unaligned);
1360 V(Mem::shuffle128<X1, Y1>(tmp0, tmp1)).store(&data[i[8]], Vc::Unaligned);
1361 }
1362 template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
1363 const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
1364 {
1365 interleave(data, i, v0, v1);
1366 v2.scatter(data + 2, i);
1367 }
1368 template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
1369 const typename V::AsArg v0, const typename V::AsArg v1,
1370 const typename V::AsArg v2, const typename V::AsArg v3)
1371 {
1372 const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data());
1373 const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data());
1374 const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data());
1375 const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data());
1376
1377 const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
1378 const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
1379 const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
1380 const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
1381
1382 using namespace AVX;
1383 auto &&store = [&](__m256i x, int offset) {
1384 _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 0]]), lo128(x));
1385 _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 8]]), hi128(x));
1386 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 1]]), avx_cast<__m128>(x));
1387 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 9]]), avx_cast<__m128>(hi128(x)));
1388 };
1389 store(tmp4, 0);
1390 store(tmp5, 2);
1391 store(tmp6, 4);
1392 store(tmp7, 6);
1393 }
1394 static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,
1395 const typename V::AsArg v0, const typename V::AsArg v1,
1396 const typename V::AsArg v2, const typename V::AsArg v3)
1397 {
1398 const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data());
1399 const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data());
1400 const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data());
1401 const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data());
1402
1403 const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
1404 const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
1405 const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
1406 const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
1407
1408 V(Mem::shuffle128<X0, Y0>(tmp4, tmp5)).store(&data[i[0]], ::Vc::Unaligned);
1409 V(Mem::shuffle128<X0, Y0>(tmp6, tmp7)).store(&data[i[4]], ::Vc::Unaligned);
1410 V(Mem::shuffle128<X1, Y1>(tmp4, tmp5)).store(&data[i[8]], ::Vc::Unaligned);
1411 V(Mem::shuffle128<X1, Y1>(tmp6, tmp7)).store(&data[i[12]], ::Vc::Unaligned);
1412 }
1413 template <typename I>
1414 static inline void interleave(typename V::EntryType *const data, const I &i,
1415 const typename V::AsArg v0, const typename V::AsArg v1,
1416 const typename V::AsArg v2, const typename V::AsArg v3,
1417 const typename V::AsArg v4)
1418 {
1419 interleave(data, i, v0, v1, v2, v3);
1420 v4.scatter(data + 4, i);
1421 }
1422 template <typename I>
1423 static inline void interleave(typename V::EntryType *const data, const I &i,
1424 const typename V::AsArg v0, const typename V::AsArg v1,
1425 const typename V::AsArg v2, const typename V::AsArg v3,
1426 const typename V::AsArg v4, const typename V::AsArg v5)
1427 {
1428 interleave(data, i, v0, v1, v2, v3);
1429 interleave(data + 4, i, v4, v5);
1430 }
1431 template <typename I>
1432 static inline void interleave(typename V::EntryType *const data, const I &i,
1433 const typename V::AsArg v0, const typename V::AsArg v1,
1434 const typename V::AsArg v2, const typename V::AsArg v3,
1435 const typename V::AsArg v4, const typename V::AsArg v5,
1436 const typename V::AsArg v6)
1437 {
1438 interleave(data, i, v0, v1, v2, v3);
1439 interleave(data + 4, i, v4, v5, v6);
1440 }
1441 template <typename I>
1442 static inline void interleave(typename V::EntryType *const data, const I &i,
1443 const typename V::AsArg v0, const typename V::AsArg v1,
1444 const typename V::AsArg v2, const typename V::AsArg v3,
1445 const typename V::AsArg v4, const typename V::AsArg v5,
1446 const typename V::AsArg v6, const typename V::AsArg v7)
1447 {
1448 interleave(data, i, v0, v1, v2, v3);
1449 interleave(data + 4, i, v4, v5, v6, v7);
1450 }
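// The five- to eight-member overloads above reuse the four-member transpose for
// members 0..3 and handle the rest in a second pass starting at data + 4; the index
// vector is passed through unchanged because i[k] already addresses the first member
// of the k-th struct.  Scalar sketch of the five-member case (illustration only):
//
//     for (size_t k = 0; k < 16; ++k) {
//         data[i[k] + 0] = v0[k];
//         data[i[k] + 1] = v1[k];
//         data[i[k] + 2] = v2[k];
//         data[i[k] + 3] = v3[k];
//         data[i[k] + 4] = v4[k];              // done by v4.scatter(data + 4, i)
//     }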
1451
1452 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1453 const I &i, V &v0, V &v1)
1454 {
1455 const __m256i tmp4 =
1456 _mm256_setr_epi32(
1457 *aliasing_cast<int>(&data[i[0]]), *aliasing_cast<int>(&data[i[1]]),
1458 *aliasing_cast<int>(&data[i[2]]), *aliasing_cast<int>(&data[i[3]]),
1459 *aliasing_cast<int>(&data[i[8]]), *aliasing_cast<int>(&data[i[9]]),
1460 *aliasing_cast<int>(&data[i[10]]), *aliasing_cast<int>(&data[i[11]]));
1461 const __m256i tmp5 =
1462 _mm256_setr_epi32(
1463 *aliasing_cast<int>(&data[i[4]]), *aliasing_cast<int>(&data[i[5]]),
1464 *aliasing_cast<int>(&data[i[6]]), *aliasing_cast<int>(&data[i[7]]),
1465 *aliasing_cast<int>(&data[i[12]]), *aliasing_cast<int>(&data[i[13]]),
1466 *aliasing_cast<int>(&data[i[14]]), *aliasing_cast<int>(&data[i[15]]));
1467
1468 const __m256i tmp2 = AVX::unpacklo_epi16(tmp4, tmp5);
1469 const __m256i tmp3 = AVX::unpackhi_epi16(tmp4, tmp5);
1470
1471 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
1472 const __m256i tmp1 = AVX::unpackhi_epi16(tmp2, tmp3);
1473
1474 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1475 v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1476 }
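// The two-member deinterleave gathers the 32-bit pair that interleave() wrote at each
// data[i[k]] into two 256-bit registers and separates the two 16-bit streams with
// three rounds of unpacks.  Scalar reference of the result (illustration only):
//
//     for (size_t k = 0; k < 16; ++k) {
//         v0[k] = data[i[k] + 0];
//         v1[k] = data[i[k] + 1];
//     }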
1477 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1478 const I &i, V &v0, V &v1, V &v2)
1479 {
1480 using namespace AVX;
1481 const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(
1482 *aliasing_cast<double>(&data[i[0]]), *aliasing_cast<double>(&data[i[1]]),
1483 *aliasing_cast<double>(&data[i[8]]), *aliasing_cast<double>(&data[i[9]])));
1484 const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(
1485 *aliasing_cast<double>(&data[i[2]]), *aliasing_cast<double>(&data[i[3]]),
1486 *aliasing_cast<double>(&data[i[10]]), *aliasing_cast<double>(&data[i[11]])));
1487 const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(
1488 *aliasing_cast<double>(&data[i[4]]), *aliasing_cast<double>(&data[i[5]]),
1489 *aliasing_cast<double>(&data[i[12]]), *aliasing_cast<double>(&data[i[13]])));
1490 const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(
1491 *aliasing_cast<double>(&data[i[6]]), *aliasing_cast<double>(&data[i[7]]),
1492 *aliasing_cast<double>(&data[i[14]]), *aliasing_cast<double>(&data[i[15]])));
1493 const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
1494 const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
1495 const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
1496 const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
1497
1498 const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6);
1499 const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6);
1500 const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7);
1501 const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7);
1502
1503 v0.data() = AVX::unpacklo_epi16(tmp8, tmp10);
1504 v1.data() = AVX::unpackhi_epi16(tmp8, tmp10);
1505 v2.data() = AVX::unpacklo_epi16(tmp9, tmp11);
1506 }
1507 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1508 const I &i, V &v0, V &v1, V &v2, V &v3)
1509 {
1510 using namespace AVX;
1511 const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(
1512 *aliasing_cast<double>(&data[i[0]]), *aliasing_cast<double>(&data[i[1]]),
1513 *aliasing_cast<double>(&data[i[8]]), *aliasing_cast<double>(&data[i[9]])));
1514 const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(
1515 *aliasing_cast<double>(&data[i[2]]), *aliasing_cast<double>(&data[i[3]]),
1516 *aliasing_cast<double>(&data[i[10]]), *aliasing_cast<double>(&data[i[11]])));
1517 const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(
1518 *aliasing_cast<double>(&data[i[4]]), *aliasing_cast<double>(&data[i[5]]),
1519 *aliasing_cast<double>(&data[i[12]]), *aliasing_cast<double>(&data[i[13]])));
1520 const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(
1521 *aliasing_cast<double>(&data[i[6]]), *aliasing_cast<double>(&data[i[7]]),
1522 *aliasing_cast<double>(&data[i[14]]), *aliasing_cast<double>(&data[i[15]])));
1523 const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2);
1524 const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2);
1525 const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3);
1526 const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3);
1527
1528 const __m256i tmp8 = AVX::unpacklo_epi16(tmp4, tmp6);
1529 const __m256i tmp9 = AVX::unpackhi_epi16(tmp4, tmp6);
1530 const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7);
1531 const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7);
1532
1533 v0.data() = AVX::unpacklo_epi16(tmp8, tmp10);
1534 v1.data() = AVX::unpackhi_epi16(tmp8, tmp10);
1535 v2.data() = AVX::unpacklo_epi16(tmp9, tmp11);
1536 v3.data() = AVX::unpackhi_epi16(tmp9, tmp11);
1537 }
1538 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1539 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
1540 {
1541 using namespace AVX;
1542 const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
1543 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
1544 const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
1545 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
1546 const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
1547 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
1548 const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
1549 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
1550 const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
1551 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
1552 const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
1553 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
1554 const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
1555 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
1556 const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
1557 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
1558
1559 const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
1560 const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
1561 const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
1562 const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
1563 const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
1564 const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
1565 const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
1566 const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
1567
1568 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
1569 const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
1570 const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
1571 const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
1572 const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
1573 const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
1574
1575 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1576 v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1577 v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
1578 v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
1579 v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
1580 }
1581 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1582 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
1583 {
1584 using namespace AVX;
1585 const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
1586 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
1587 const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
1588 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
1589 const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
1590 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
1591 const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
1592 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
1593 const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
1594 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
1595 const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
1596 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
1597 const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
1598 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
1599 const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
1600 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
1601
1602 const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
1603 const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
1604 const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
1605 const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
1606 const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
1607 const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
1608 const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
1609 const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
1610
1611 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
1612 const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
1613 const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
1614 const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
1615 const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
1616 const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
1617
1618 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1619 v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1620 v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
1621 v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
1622 v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
1623 v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
1624 }
1625 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1626 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
1627 {
1628 using namespace AVX;
1629 const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
1630 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
1631 const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
1632 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
1633 const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
1634 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
1635 const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
1636 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
1637 const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
1638 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
1639 const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
1640 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
1641 const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
1642 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
1643 const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
1644 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
1645
1646 const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
1647 const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
1648 const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
1649 const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
1650 const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
1651 const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
1652 const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
1653 const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
1654
1655 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
1656 const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
1657 const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
1658 const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
1659 const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
1660 const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
1661 const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11);
1662 const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13);
1663
1664 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1665 v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1666 v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
1667 v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
1668 v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
1669 v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
1670 v6.data() = AVX::unpacklo_epi16(tmp14, tmp15);
1671 }
1672 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
1673 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
1674 {
1675 using namespace AVX;
1676 const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
1677 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
1678 const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
1679 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
1680 const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
1681 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
1682 const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
1683 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
1684 const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
1685 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
1686 const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
1687 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
1688 const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
1689 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
1690 const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
1691 _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
1692
1693 const __m256i tmp2 = AVX::unpacklo_epi16(a, e);
1694 const __m256i tmp4 = AVX::unpacklo_epi16(b, f);
1695 const __m256i tmp3 = AVX::unpacklo_epi16(c, g);
1696 const __m256i tmp5 = AVX::unpacklo_epi16(d, h);
1697 const __m256i tmp10 = AVX::unpackhi_epi16(a, e);
1698 const __m256i tmp11 = AVX::unpackhi_epi16(c, g);
1699 const __m256i tmp12 = AVX::unpackhi_epi16(b, f);
1700 const __m256i tmp13 = AVX::unpackhi_epi16(d, h);
1701
1702 const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);
1703 const __m256i tmp1 = AVX::unpacklo_epi16(tmp4, tmp5);
1704 const __m256i tmp6 = AVX::unpackhi_epi16(tmp2, tmp3);
1705 const __m256i tmp7 = AVX::unpackhi_epi16(tmp4, tmp5);
1706 const __m256i tmp8 = AVX::unpacklo_epi16(tmp10, tmp11);
1707 const __m256i tmp9 = AVX::unpacklo_epi16(tmp12, tmp13);
1708 const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11);
1709 const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13);
1710
1711 v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1712 v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1713 v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
1714 v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
1715 v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
1716 v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
1717 v6.data() = AVX::unpacklo_epi16(tmp14, tmp15);
1718 v7.data() = AVX::unpackhi_epi16(tmp14, tmp15);
1719 }
1720 };
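// Every overload of InterleaveImpl<V, 16, 32> is a hand-unrolled SIMD form of the
// same scalar transpose between structure-of-arrays registers and array-of-structures
// memory, for vectors with sixteen 16-bit entries in 32 bytes.  Scalar reference all
// of them agree with (illustration only; vs[m][k] stands for lane k of the m-th
// input/output vector, N for the number of interleaved members):
//
//     for (size_t k = 0; k < 16; ++k)
//         for (size_t m = 0; m < N; ++m)
//             data[i[k] + m] = vs[m][k];       // deinterleave reads instead of writing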
1721 template<typename V> struct InterleaveImpl<V, 8, 32> {
1722 static_assert(sizeof(typename V::value_type) == 4, "");
1723 template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,
1724 const typename V::AsArg v0, const typename V::AsArg v1)
1725 {
1726 using namespace AVX;
1727 // tmp0 = v0[0] v1[0] v0[1] v1[1] | v0[4] v1[4] v0[5] v1[5]
1728 const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
1729 // tmp1 = v0[2] v1[2] v0[3] v1[3] | v0[6] v1[6] v0[7] v1[7]
1730 const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
1731 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), lo128(tmp0));
1732 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), lo128(tmp0));
1733 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), lo128(tmp1));
1734 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), lo128(tmp1));
1735 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), hi128(tmp0));
1736 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), hi128(tmp0));
1737 _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), hi128(tmp1));
1738 _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), hi128(tmp1));
1739 }
1740 static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,
1741 const typename V::AsArg v0, const typename V::AsArg v1)
1742 {
1743 using namespace AVX;
1744 // tmp0 = v0[0] v1[0] v0[1] v1[1] | v0[4] v1[4] v0[5] v1[5]
1745 const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
1746 // tmp1 = v0[2] v1[2] v0[3] v1[3] | v0[6] v1[6] v0[7] v1[7]
1747 const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
1748 _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), lo128(tmp0));
1749 _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), lo128(tmp1));
1750 _mm_storeu_ps(aliasing_cast<float>(&data[i[4]]), hi128(tmp0));
1751 _mm_storeu_ps(aliasing_cast<float>(&data[i[6]]), hi128(tmp1));
1752 }
1753
1754 template <typename I>
1755 static inline void interleave(typename V::EntryType *const data, const I &i,
1756 const typename V::AsArg v0, const typename V::AsArg v1,
1757 const typename V::AsArg v2)
1758 {
1759 using namespace AVX;
1760 #ifdef Vc_USE_MASKMOV_SCATTER
1761 // tmp0 = v0[0] v2[0] v0[1] v2[1] | v0[4] v2[4] v0[5] v2[5]
1762 const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1763 // tmp1 = v0[2] v2[2] v0[3] v2[3] | v0[6] v2[6] v0[7] v2[7]
1764 const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1765 // tmp2 = v1[0] v1[0] v1[1] v1[1] | v1[4] v1[4] v1[5] v1[5]
1766 const m256 tmp2 = _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
1767 // tmp3 = v1[2] v1[2] v1[3] v1[3] | v1[6] v1[6] v1[7] v1[7]
1768 const m256 tmp3 = _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
1769 const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2); // v0[0] v1[0] v2[0] v1[0] | v0[4] v1[4] v2[4] v1[4]
1770 const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2); // v0[1] v1[1] v2[1] v1[1] | v0[5] v1[5] v2[5] v1[5]
1771 const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3); // v0[2] v1[2] v2[2] v1[2] | v0[6] v1[6] v2[6] v1[6]
1772 const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3); // v0[3] v1[3] v2[3] v1[3] | v0[7] v1[7] v2[7] v1[7]
1773 const m128i mask = _mm_set_epi32(0, -1, -1, -1);
1774 _mm_maskstore_ps(aliasing_cast<float>(&data[i[0]]), mask, lo128(tmp4));
1775 _mm_maskstore_ps(aliasing_cast<float>(&data[i[1]]), mask, lo128(tmp5));
1776 _mm_maskstore_ps(aliasing_cast<float>(&data[i[2]]), mask, lo128(tmp6));
1777 _mm_maskstore_ps(aliasing_cast<float>(&data[i[3]]), mask, lo128(tmp7));
1778 _mm_maskstore_ps(aliasing_cast<float>(&data[i[4]]), mask, hi128(tmp4));
1779 _mm_maskstore_ps(aliasing_cast<float>(&data[i[5]]), mask, hi128(tmp5));
1780 _mm_maskstore_ps(aliasing_cast<float>(&data[i[6]]), mask, hi128(tmp6));
1781 _mm_maskstore_ps(aliasing_cast<float>(&data[i[7]]), mask, hi128(tmp7));
1782 #else
1783 interleave(data, i, v0, v1);
1784 v2.scatter(data + 2, i);
1785 #endif
1786 }
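// With Vc_USE_MASKMOV_SCATTER each (v0[k], v1[k], v2[k]) triple is assembled in one
// 128-bit lane and written by a single _mm_maskstore_ps whose mask,
// _mm_set_epi32(0, -1, -1, -1), enables elements 0..2 only, so the float following
// each triple is left untouched.  Scalar sketch of one such masked store
// (illustration only):
//
//     data[i[k] + 0] = v0[k];
//     data[i[k] + 1] = v1[k];
//     data[i[k] + 2] = v2[k];
//     // data[i[k] + 3] is not written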
1787
1788 static inline void interleave(typename V::EntryType *const data,
1789 const Common::SuccessiveEntries<3> &i,
1790 const typename V::AsArg v0_,
1791 const typename V::AsArg v1_,
1792 const typename V::AsArg v2_)
1793 {
1794 __m256 v0 = AVX::avx_cast<__m256>(v0_.data());
1795 __m256 v1 = AVX::avx_cast<__m256>(v1_.data());
1796 __m256 v2 = AVX::avx_cast<__m256>(v2_.data());
1797
1798 v0 = _mm256_shuffle_ps(v0, v0, 0x6c);
1799 v1 = _mm256_shuffle_ps(v1, v1, 0xb1);
1800 v2 = _mm256_shuffle_ps(v2, v2, 0xc6);
1801
1802
1803 __m256 w0 = Mem::blend<X0, X1, Y2, X3, Y4, X5, X6, Y7>(
1804 Mem::blend<X0, Y1, X2, X3, X4, X5, Y6, X7>(v0, v1), v2);
1805
1806 __m256 w1 = Mem::blend<X0, Y1, X2, X3, X4, Y5, X6, X7>(
1807 Mem::blend<Y0, X1, X2, Y3, Y4, X5, X6, Y7>(v0, v1), v2);
1808
1809 __m256 w2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(
1810 Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(v0, v1), v2);
1811
1812
1813 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]]),
1814 _mm256_permute2f128_ps(w0, w1, 0x20));
1815
1816 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 8), w2);
1817
1818 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 16),
1819 _mm256_permute2f128_ps(w1, w0, 0x31));
1820
1821 }
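// For a contiguous x y z x y z ... destination the three registers are pre-rotated
// in-lane with _mm256_shuffle_ps, merged with Mem::blend into w0/w1/w2 and written
// with three unaligned 256-bit stores; the two _mm256_permute2f128_ps calls only put
// the 128-bit lanes into the right order for the first and last store.  Scalar
// reference of the 24 floats that end up in memory (illustration only):
//
//     for (size_t k = 0; k < 8; ++k) {
//         data[i[0] + 3 * k + 0] = v0_[k];
//         data[i[0] + 3 * k + 1] = v1_[k];
//         data[i[0] + 3 * k + 2] = v2_[k];
//     }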
1822
1823 template <typename I>
1824 static inline void interleave(typename V::EntryType *const data, const I &i,
1825 const typename V::AsArg v0, const typename V::AsArg v1,
1826 const typename V::AsArg v2, const typename V::AsArg v3)
1827 {
1828 using namespace AVX;
1829 const __m256 tmp0 =
1830 _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1831 const __m256 tmp1 =
1832 _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1833 const __m256 tmp2 =
1834 _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
1835 const __m256 tmp3 =
1836 _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
1837 const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2);
1838 const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2);
1839 const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3);
1840 const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3);
1841 _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), lo128(_04));
1842 _mm_storeu_ps(aliasing_cast<float>(&data[i[1]]), lo128(_15));
1843 _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), lo128(_26));
1844 _mm_storeu_ps(aliasing_cast<float>(&data[i[3]]), lo128(_37));
1845 _mm_storeu_ps(aliasing_cast<float>(&data[i[4]]), hi128(_04));
1846 _mm_storeu_ps(aliasing_cast<float>(&data[i[5]]), hi128(_15));
1847 _mm_storeu_ps(aliasing_cast<float>(&data[i[6]]), hi128(_26));
1848 _mm_storeu_ps(aliasing_cast<float>(&data[i[7]]), hi128(_37));
1849 }
1850
1851
1852 static inline void interleave(typename V::EntryType *const data,
1853 const Common::SuccessiveEntries<4> &i,
1854 const typename V::AsArg v0, const typename V::AsArg v1,
1855 const typename V::AsArg v2, const typename V::AsArg v3)
1856 {
1857 using namespace AVX;
1858 const __m256 tmp0 =
1859 _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1860 const __m256 tmp1 =
1861 _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1862 const __m256 tmp2 =
1863 _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
1864 const __m256 tmp3 =
1865 _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
1866 const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2);
1867 const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2);
1868 const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3);
1869 const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3);
1870 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]]),
1871 _mm256_permute2f128_ps(_04, _15, 0x20));
1872 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 8),
1873 _mm256_permute2f128_ps(_26, _37, 0x20));
1874 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 16),
1875 _mm256_permute2f128_ps(_04, _15, 0x31));
1876 _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 24),
1877 _mm256_permute2f128_ps(_26, _37, 0x31));
1878 }
1879 template <typename I>
1880 static inline void interleave(typename V::EntryType *const data, const I &i,
1881 const typename V::AsArg v0, const typename V::AsArg v1,
1882 const typename V::AsArg v2, const typename V::AsArg v3,
1883 const typename V::AsArg v4)
1884 {
1885 interleave(data, i, v0, v1, v2, v3);
1886 v4.scatter(data + 4, i);
1887 }
1888 template <typename I>
1889 static inline void interleave(typename V::EntryType *const data, const I &i,
1890 const typename V::AsArg v0, const typename V::AsArg v1,
1891 const typename V::AsArg v2, const typename V::AsArg v3,
1892 const typename V::AsArg v4, const typename V::AsArg v5)
1893 {
1894 interleave(data, i, v0, v1, v2, v3);
1895 interleave(data + 4, i, v4, v5);
1896 }
1897 template <typename I>
1898 static inline void interleave(typename V::EntryType *const data, const I &i,
1899 const typename V::AsArg v0, const typename V::AsArg v1,
1900 const typename V::AsArg v2, const typename V::AsArg v3,
1901 const typename V::AsArg v4, const typename V::AsArg v5,
1902 const typename V::AsArg v6)
1903 {
1904 interleave(data, i, v0, v1, v2, v3);
1905 interleave(data + 4, i, v4, v5, v6);
1906 }
1907 template <typename I>
1908 static inline void interleave(typename V::EntryType *const data, const I &i,
1909 const typename V::AsArg v0, const typename V::AsArg v1,
1910 const typename V::AsArg v2, const typename V::AsArg v3,
1911 const typename V::AsArg v4, const typename V::AsArg v5,
1912 const typename V::AsArg v6, const typename V::AsArg v7)
1913 {
1914 interleave(data, i, v0, v1, v2, v3);
1915 interleave(data + 4, i, v4, v5, v6, v7);
1916 }
1917
1918
1919 template <typename I>
1920 static inline void deinterleave(typename V::EntryType const *const data, const I &i,
1921 V &v0, V &v1)
1922 {
1923 using namespace AVX;
1924 const m128 il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[0]]));
1925 const m128 il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[2]]));
1926 const m128 il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[4]]));
1927 const m128 il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[6]]));
1928 const m128 il01 = _mm_loadh_pi( il0, reinterpret_cast<__m64 const *>(&data[i[1]]));
1929 const m128 il23 = _mm_loadh_pi( il2, reinterpret_cast<__m64 const *>(&data[i[3]]));
1930 const m128 il45 = _mm_loadh_pi( il4, reinterpret_cast<__m64 const *>(&data[i[5]]));
1931 const m128 il67 = _mm_loadh_pi( il6, reinterpret_cast<__m64 const *>(&data[i[7]]));
1932
1933 const m256 tmp2 = concat(il01, il45);
1934 const m256 tmp3 = concat(il23, il67);
1935
1936 const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
1937 const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3);
1938
1939 v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
1940 v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
1941 }
1942
1943 static inline void deinterleave(typename V::EntryType const *const data,
1944 const Common::SuccessiveEntries<2> &i, V &v0, V &v1)
1945 {
1946 using namespace AVX;
1947 const m256 il0123 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]]));
1948 const m256 il4567 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[4]]));
1949
1950 const m256 tmp2 = Mem::shuffle128<X0, Y0>(il0123, il4567);
1951 const m256 tmp3 = Mem::shuffle128<X1, Y1>(il0123, il4567);
1952
1953 const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
1954 const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3);
1955
1956 v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
1957 v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
1958 }
1959
1960 template <typename I>
1961 static inline void deinterleave(typename V::EntryType const *const data, const I &i,
1962 V &v0, V &v1, V &v2)
1963 {
1964 using namespace AVX;
1965 const m128 il0 = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
1966 const m128 il1 = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
1967 const m128 il2 = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
1968 const m128 il3 = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
1969 const m128 il4 = _mm_loadu_ps(aliasing_cast<float>(&data[i[4]]));
1970 const m128 il5 = _mm_loadu_ps(aliasing_cast<float>(&data[i[5]]));
1971 const m128 il6 = _mm_loadu_ps(aliasing_cast<float>(&data[i[6]]));
1972 const m128 il7 = _mm_loadu_ps(aliasing_cast<float>(&data[i[7]]));
1973
1974 const m256 il04 = concat(il0, il4);
1975 const m256 il15 = concat(il1, il5);
1976 const m256 il26 = concat(il2, il6);
1977 const m256 il37 = concat(il3, il7);
1978 const m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
1979 const m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
1980 const m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
1981 const m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
1982 v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
1983 v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
1984 v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
1985 }
1986
1987 static inline void deinterleave(typename V::EntryType const *const data,
1988 const Common::SuccessiveEntries<3> &i, V &v0, V &v1,
1989 V &v2)
1990 {
1991
1992 __m256 in0 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 0));
1993
1994 __m256 in1 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 8));
1995
1996 __m256 in2 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 16));
1997 // With a packed x y z x y z ... source the three loads hold (a..h = structs 0..7):
1998 //   in0 = x0 y0 z0 x1 | y1 z1 x2 y2
1999 //   in1 = z2 x3 y3 z3 | x4 y4 z4 x5
2000 //   in2 = y5 z5 x6 y6 | z6 x7 y7 z7
2001 // so the names aaabffgg/cdddeeef/bbccghhh below spell out which struct each element belongs to.
2002 const __m256 aaabffgg = _mm256_permute2f128_ps(in0, in2, 0x20);
2003 const __m256 cdddeeef = in1;
2004 const __m256 bbccghhh = _mm256_permute2f128_ps(in0, in2, 0x31);
2005
2006
2007
2008
2009 const __m256 x0 = _mm256_blend_ps(
2010 _mm256_blend_ps(aaabffgg, cdddeeef, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80),
2011 bbccghhh, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0);
2012 const __m256 x1 = _mm256_blend_ps(
2013 _mm256_blend_ps(aaabffgg, cdddeeef, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0),
2014 bbccghhh, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0);
2015 const __m256 x2 = _mm256_blend_ps(
2016 _mm256_blend_ps(aaabffgg, cdddeeef, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0),
2017 bbccghhh, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80);
2018
2019
2020
2021 v0 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x0, x0, 0x6c));
2022 v1 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x1, x1, 0xb1));
2023 v2 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x2, x2, 0xc6));
2024 }
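// This is the inverse of the SuccessiveEntries<3> interleave above: the same three
// per-lane shuffles (0x6c, 0xb1 and 0xc6 are self-inverse permutations) are applied
// after the blends instead of before them.  The summed constants are the
// _mm256_blend_ps immediates written out bit by bit; bit k set means "take element k
// from the second operand".  For example (illustration only):
//
//     // 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80 == 0x92  -> elements 1, 4, 7 from arg 2
//     // 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0    == 0x24  -> elements 2, 5
//     // 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0    == 0x49  -> elements 0, 3, 6
//     static_assert((0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80) == 0x92, "blend immediate");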
2025
2026 template <typename I>
2027 static inline void deinterleave(typename V::EntryType const *const data, const I &i,
2028 V &v0, V &v1, V &v2, V &v3)
2029 {
2030 using namespace AVX;
2031 const m128 il0 = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]]));
2032 const m128 il1 = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]]));
2033 const m128 il2 = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]]));
2034 const m128 il3 = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]]));
2035 const m128 il4 = _mm_loadu_ps(aliasing_cast<float>(&data[i[4]]));
2036 const m128 il5 = _mm_loadu_ps(aliasing_cast<float>(&data[i[5]]));
2037 const m128 il6 = _mm_loadu_ps(aliasing_cast<float>(&data[i[6]]));
2038 const m128 il7 = _mm_loadu_ps(aliasing_cast<float>(&data[i[7]]));
2039
2040 const m256 il04 = concat(il0, il4);
2041 const m256 il15 = concat(il1, il5);
2042 const m256 il26 = concat(il2, il6);
2043 const m256 il37 = concat(il3, il7);
2044 const m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
2045 const m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
2046 const m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
2047 const m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
2048 v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
2049 v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
2050 v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
2051 v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(cd0246, cd1357));
2052 }
2053
2054 static inline void deinterleave(typename V::EntryType const *const data,
2055 const Common::SuccessiveEntries<4> &i, V &v0, V &v1,
2056 V &v2, V &v3)
2057 {
2058 using namespace AVX;
2059 const __m256 il01 = _mm256_loadu_ps(
2060 aliasing_cast<float>(&data[i[0]]));
2061 const __m256 il23 = _mm256_loadu_ps(
2062 aliasing_cast<float>(&data[i[2]]));
2063 const __m256 il45 = _mm256_loadu_ps(
2064 aliasing_cast<float>(&data[i[4]]));
2065 const __m256 il67 = _mm256_loadu_ps(
2066 aliasing_cast<float>(&data[i[6]]));
2067
2068 const __m256 il04 = _mm256_permute2f128_ps(il01, il45, 0x20);
2069 const __m256 il15 = _mm256_permute2f128_ps(il01, il45, 0x31);
2070 const __m256 il26 = _mm256_permute2f128_ps(il23, il67, 0x20);
2071 const __m256 il37 = _mm256_permute2f128_ps(il23, il67, 0x31);
2072 const __m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
2073 const __m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
2074 const __m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
2075 const __m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
2076 v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
2077 v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
2078 v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
2079 v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(cd0246, cd1357));
2080 }
2081 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2082 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
2083 {
2084 v4.gather(data + 4, i);
2085 deinterleave(data, i, v0, v1, v2, v3);
2086 }
2087 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2088 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
2089 {
2090 deinterleave(data, i, v0, v1, v2, v3);
2091 deinterleave(data + 4, i, v4, v5);
2092 }
2093 static inline void deinterleave(typename V::EntryType const *const data,
2094 const Common::SuccessiveEntries<6> &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
2095 {
2096 using namespace AVX;
2097 const m256 a = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]]));
2098 const m256 b = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 1 * V::Size]));
2099 const m256 c = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 2 * V::Size]));
2100 const m256 d = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 3 * V::Size]));
2101 const m256 e = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 4 * V::Size]));
2102 const m256 f = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 5 * V::Size]));
2103 const __m256 tmp2 = Mem::shuffle128<X0, Y0>(a, d);
2104 const __m256 tmp3 = Mem::shuffle128<X1, Y1>(b, e);
2105 const __m256 tmp4 = Mem::shuffle128<X1, Y1>(a, d);
2106 const __m256 tmp5 = Mem::shuffle128<X0, Y0>(c, f);
2107 const __m256 tmp8 = Mem::shuffle128<X0, Y0>(b, e);
2108 const __m256 tmp9 = Mem::shuffle128<X1, Y1>(c, f);
2109 const __m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
2110 const __m256 tmp1 = _mm256_unpackhi_ps(tmp4, tmp5);
2111 const __m256 tmp6 = _mm256_unpackhi_ps(tmp2, tmp3);
2112 const __m256 tmp7 = _mm256_unpacklo_ps(tmp8, tmp9);
2113 const __m256 tmp10 = _mm256_unpacklo_ps(tmp4, tmp5);
2114 const __m256 tmp11 = _mm256_unpackhi_ps(tmp8, tmp9);
2115 v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
2116 v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
2117 v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp6, tmp7));
2118 v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp6, tmp7));
2119 v4.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp10, tmp11));
2120 v5.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp10, tmp11));
2121 }
2122 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2123 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
2124 {
2125 deinterleave(data, i, v0, v1, v2, v3);
2126 deinterleave(data + 4, i, v4, v5, v6);
2127 }
2128 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2129 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
2130 {
2131 deinterleave(data, i, v0, v1, v2, v3);
2132 deinterleave(data + 4, i, v4, v5, v6, v7);
2133 }
2134 };
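// InterleaveImpl<V, 8, 32> covers vectors with eight 4-byte entries in 32 bytes
// (e.g. the AVX float_v; note the static_assert on the value_type size above).  The
// SuccessiveEntries<3> pair is the common packed-xyz case; a scalar reference of the
// layout both directions agree on (illustration only, P is a hypothetical AoS type):
//
//     struct P { float x, y, z; };
//     void deinterleave_reference(const P *p, float (&x)[8], float (&y)[8], float (&z)[8])
//     {
//         for (int k = 0; k < 8; ++k) { x[k] = p[k].x; y[k] = p[k].y; z[k] = p[k].z; }
//     }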
2135 template<typename V> struct InterleaveImpl<V, 4, 32> {
2136 template <typename I>
2137 static inline void interleave(typename V::EntryType *const data, const I &i,
2138 const typename V::AsArg v0, const typename V::AsArg v1)
2139 {
2140 using namespace AVX;
2141 const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
2142 const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
2143 _mm_storeu_pd(&data[i[0]], lo128(tmp0));
2144 _mm_storeu_pd(&data[i[1]], lo128(tmp1));
2145 _mm_storeu_pd(&data[i[2]], hi128(tmp0));
2146 _mm_storeu_pd(&data[i[3]], hi128(tmp1));
2147 }
2148 template <typename I>
2149 static inline void interleave(typename V::EntryType *const data, const I &i,
2150 const typename V::AsArg v0, const typename V::AsArg v1,
2151 const typename V::AsArg v2)
2152 {
2153 using namespace AVX;
2154 #ifdef Vc_USE_MASKMOV_SCATTER
2155 const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
2156 const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
2157 const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v2.data());
2158 const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v2.data());
2159
2160 #if defined(Vc_MSVC) && (Vc_MSVC < 170000000 || !defined(_WIN64))
2161 // _mm256_set_epi64x is not usable with MSVC before VS2012 or in 32-bit MSVC builds, so assemble the mask from two 128-bit halves instead.
2162 const m256i mask = concat(_mm_setallone_si128(), _mm_set_epi32(0, 0, -1, -1));
2163 #else
2164 const m256i mask = _mm256_set_epi64x(0, -1, -1, -1);
2165 #endif
2166 _mm256_maskstore_pd(&data[i[0]], mask, Mem::shuffle128<X0, Y0>(tmp0, tmp2));
2167 _mm256_maskstore_pd(&data[i[1]], mask, Mem::shuffle128<X0, Y0>(tmp1, tmp3));
2168 _mm256_maskstore_pd(&data[i[2]], mask, Mem::shuffle128<X1, Y1>(tmp0, tmp2));
2169 _mm256_maskstore_pd(&data[i[3]], mask, Mem::shuffle128<X1, Y1>(tmp1, tmp3));
2170 #else
2171 interleave(data, i, v0, v1);
2172 v2.scatter(data + 2, i);
2173 #endif
2174 }
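// Same masked-scatter idea as the float specialization, but with full 256-bit stores:
// the mask _mm256_set_epi64x(0, -1, -1, -1) clears only the highest element, so
// _mm256_maskstore_pd writes three doubles per index and skips the fourth.  Scalar
// sketch of one such store (illustration only):
//
//     data[i[k] + 0] = v0[k];
//     data[i[k] + 1] = v1[k];
//     data[i[k] + 2] = v2[k];
//     // data[i[k] + 3] stays untouched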
2175 template <typename I>
2176 static inline void interleave(typename V::EntryType *const data, const I &i,
2177 const typename V::AsArg v0, const typename V::AsArg v1,
2178 const typename V::AsArg v2, const typename V::AsArg v3)
2179 {
2180 using namespace AVX;
2181 // tmp0 = v0[0] v1[0] | v0[2] v1[2]
2182 const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
2183 // tmp1 = v0[1] v1[1] | v0[3] v1[3]
2184 const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
2185 // tmp2 = v2[0] v3[0] | v2[2] v3[2]
2186 const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v3.data());
2187 // tmp3 = v2[1] v3[1] | v2[3] v3[3]
2188 const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v3.data());
2189
2190
2191
2192
2193
2194
2195
2196 _mm_storeu_pd(&data[i[0] ], lo128(tmp0));
2197 _mm_storeu_pd(&data[i[0]+2], lo128(tmp2));
2198 _mm_storeu_pd(&data[i[1] ], lo128(tmp1));
2199 _mm_storeu_pd(&data[i[1]+2], lo128(tmp3));
2200 _mm_storeu_pd(&data[i[2] ], hi128(tmp0));
2201 _mm_storeu_pd(&data[i[2]+2], hi128(tmp2));
2202 _mm_storeu_pd(&data[i[3] ], hi128(tmp1));
2203 _mm_storeu_pd(&data[i[3]+2], hi128(tmp3));
2204 }
2205 template <typename I>
2206 static inline void interleave(typename V::EntryType *const data, const I &i,
2207 const typename V::AsArg v0, const typename V::AsArg v1,
2208 const typename V::AsArg v2, const typename V::AsArg v3,
2209 const typename V::AsArg v4)
2210 {
2211 interleave(data, i, v0, v1, v2, v3);
2212 v4.scatter(data + 4, i);
2213 }
2214 template <typename I>
2215 static inline void interleave(typename V::EntryType *const data, const I &i,
2216 const typename V::AsArg v0, const typename V::AsArg v1,
2217 const typename V::AsArg v2, const typename V::AsArg v3,
2218 const typename V::AsArg v4, const typename V::AsArg v5)
2219 {
2220 interleave(data, i, v0, v1, v2, v3);
2221 interleave(data + 4, i, v4, v5);
2222 }
2223 template <typename I>
2224 static inline void interleave(typename V::EntryType *const data, const I &i,
2225 const typename V::AsArg v0, const typename V::AsArg v1,
2226 const typename V::AsArg v2, const typename V::AsArg v3,
2227 const typename V::AsArg v4, const typename V::AsArg v5,
2228 const typename V::AsArg v6)
2229 {
2230 interleave(data, i, v0, v1, v2, v3);
2231 interleave(data + 4, i, v4, v5, v6);
2232 }
2233 template <typename I>
2234 static inline void interleave(typename V::EntryType *const data, const I &i,
2235 const typename V::AsArg v0, const typename V::AsArg v1,
2236 const typename V::AsArg v2, const typename V::AsArg v3,
2237 const typename V::AsArg v4, const typename V::AsArg v5,
2238 const typename V::AsArg v6, const typename V::AsArg v7)
2239 {
2240 interleave(data, i, v0, v1, v2, v3);
2241 interleave(data + 4, i, v4, v5, v6, v7);
2242 }
2243
2244 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2245 const I &i, V &v0, V &v1)
2246 {
2247 using namespace Vc::AVX;
2248 const m256d ab02 = concat(_mm_loadu_pd(&data[i[0]]), _mm_loadu_pd(&data[i[2]]));
2249 const m256d ab13 = concat(_mm_loadu_pd(&data[i[1]]), _mm_loadu_pd(&data[i[3]]));
2250
2251 v0.data() = _mm256_unpacklo_pd(ab02, ab13);
2252 v1.data() = _mm256_unpackhi_pd(ab02, ab13);
2253 }
2254 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2255 const I &i, V &v0, V &v1, V &v2)
2256 {
2257 v2.gather(data + 2, i);
2258 deinterleave(data, i, v0, v1);
2259 }
2260 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2261 const I &i, V &v0, V &v1, V &v2, V &v3)
2262 {
2263 deinterleave(data, i, v0, v1);
2264 deinterleave(data + 2, i, v2, v3);
2265 }
2266 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2267 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
2268 {
2269 v4.gather(data + 4, i);
2270 deinterleave(data, i, v0, v1);
2271 deinterleave(data + 2, i, v2, v3);
2272 }
2273 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2274 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
2275 {
2276 deinterleave(data, i, v0, v1);
2277 deinterleave(data + 2, i, v2, v3);
2278 deinterleave(data + 4, i, v4, v5);
2279 }
2280 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2281 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
2282 {
2283 v6.gather(data + 6, i);
2284 deinterleave(data, i, v0, v1);
2285 deinterleave(data + 2, i, v2, v3);
2286 deinterleave(data + 4, i, v4, v5);
2287 }
2288 template<typename I> static inline void deinterleave(typename V::EntryType const *const data,
2289 const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
2290 {
2291 deinterleave(data, i, v0, v1);
2292 deinterleave(data + 2, i, v2, v3);
2293 deinterleave(data + 4, i, v4, v5);
2294 deinterleave(data + 6, i, v6, v7);
2295 }
2296 };
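// InterleaveImpl<V, 4, 32> is the four-entry, 32-byte case (double_v): a single
// unpacklo_pd/unpackhi_pd pair already yields complete (v0[k], v1[k]) pairs per
// 128-bit lane, and the overloads with more members are composed from the two- and
// four-member forms plus gather/scatter.  Scalar reference of the two-member
// deinterleave (illustration only):
//
//     for (size_t k = 0; k < 4; ++k) {
//         v0[k] = data[i[k] + 0];
//         v1[k] = data[i[k] + 1];
//     }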
2297
2298 }
2299 }
2300
2301 #endif
2302
2303