0001 /*  This file is part of the Vc library. {{{
0002 Copyright © 2015 Matthias Kretz <kretz@kde.org>
0003 
0004 Redistribution and use in source and binary forms, with or without
0005 modification, are permitted provided that the following conditions are met:
0006     * Redistributions of source code must retain the above copyright
0007       notice, this list of conditions and the following disclaimer.
0008     * Redistributions in binary form must reproduce the above copyright
0009       notice, this list of conditions and the following disclaimer in the
0010       documentation and/or other materials provided with the distribution.
0011     * Neither the names of contributing organizations nor the
0012       names of its contributors may be used to endorse or promote products
0013       derived from this software without specific prior written permission.
0014 
0015 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
0016 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
0017 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0018 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
0019 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
0020 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
0021 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0022 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
0023 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
0024 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0025 
0026 }}}*/
0027 
0028 #ifndef VC_AVX_DETAIL_H_
0029 #define VC_AVX_DETAIL_H_
0030 
0031 #include "../sse/detail.h"
0032 #include "macros.h"
0033 
0034 namespace Vc_VERSIONED_NAMESPACE
0035 {
0036 namespace Detail
0037 {
0038 // (converting) load functions {{{1
0039 template <typename Flags>
0040 Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
0041                                  typename Flags::EnableIfAligned = nullptr)
0042 {
0043     return _mm256_load_ps(x);
0044 }
0045 template <typename Flags>
0046 Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
0047                                  typename Flags::EnableIfUnaligned = nullptr)
0048 {
0049     return _mm256_loadu_ps(x);
0050 }
0051 template <typename Flags>
0052 Vc_INTRINSIC Vc_PURE __m256 load(const float *x, Flags, LoadTag<__m256, float>,
0053                                  typename Flags::EnableIfStreaming = nullptr)
0054 {
0055     return AvxIntrinsics::stream_load<__m256>(x);
0056 }
0057 
0058 template <typename Flags>
0059 Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
0060                                   typename Flags::EnableIfAligned = nullptr)
0061 {
0062     return _mm256_load_pd(x);
0063 }
0064 template <typename Flags>
0065 Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
0066                                   typename Flags::EnableIfUnaligned = nullptr)
0067 {
0068     return _mm256_loadu_pd(x);
0069 }
0070 template <typename Flags>
0071 Vc_INTRINSIC Vc_PURE __m256d load(const double *x, Flags, LoadTag<__m256d, double>,
0072                                   typename Flags::EnableIfStreaming = nullptr)
0073 {
0074     return AvxIntrinsics::stream_load<__m256d>(x);
0075 }
0076 
0077 template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
0078 Vc_INTRINSIC Vc_PURE __m256i
0079 load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfAligned = nullptr)
0080 {
0081     return _mm256_load_si256(reinterpret_cast<const __m256i *>(x));
0082 }
0083 template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
0084 Vc_INTRINSIC Vc_PURE __m256i
0085 load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfUnaligned = nullptr)
0086 {
0087     return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(x));
0088 }
0089 template <typename Flags, typename T, typename = enable_if<std::is_integral<T>::value>>
0090 Vc_INTRINSIC Vc_PURE __m256i
0091 load(const T *x, Flags, LoadTag<__m256i, T>, typename Flags::EnableIfStreaming = nullptr)
0092 {
0093     return AvxIntrinsics::stream_load<__m256i>(x);
0094 }
0095 
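// A minimal usage sketch (not part of this header): the overloads above are selected purely
// through SFINAE on the flag type, i.e. Flags::EnableIfAligned, ::EnableIfUnaligned or
// ::EnableIfStreaming is a valid type for exactly one kind of flag. Assuming the public flag
// objects Vc::Aligned and Vc::Unaligned, a hypothetical caller would dispatch like this:
/*
__m256 aligned_example(const float *p)
{
    return load(p, Vc::Aligned, LoadTag<__m256, float>());    // -> _mm256_load_ps
}
__m256 unaligned_example(const float *p)
{
    return load(p, Vc::Unaligned, LoadTag<__m256, float>());  // -> _mm256_loadu_ps
}
*/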
0096 // load32{{{2
0097 Vc_INTRINSIC __m256 load32(const float *mem, when_aligned)
0098 {
0099     return _mm256_load_ps(mem);
0100 }
0101 Vc_INTRINSIC __m256 load32(const float *mem, when_unaligned)
0102 {
0103     return _mm256_loadu_ps(mem);
0104 }
0105 Vc_INTRINSIC __m256 load32(const float *mem, when_streaming)
0106 {
0107     return AvxIntrinsics::stream_load<__m256>(mem);
0108 }
0109 Vc_INTRINSIC __m256d load32(const double *mem, when_aligned)
0110 {
0111     return _mm256_load_pd(mem);
0112 }
0113 Vc_INTRINSIC __m256d load32(const double *mem, when_unaligned)
0114 {
0115     return _mm256_loadu_pd(mem);
0116 }
0117 Vc_INTRINSIC __m256d load32(const double *mem, when_streaming)
0118 {
0119     return AvxIntrinsics::stream_load<__m256d>(mem);
0120 }
0121 template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_aligned)
0122 {
0123     static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
0124     return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
0125 }
0126 template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_unaligned)
0127 {
0128     static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
0129     return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
0130 }
0131 template <class T> Vc_INTRINSIC __m256i load32(const T *mem, when_streaming)
0132 {
0133     static_assert(std::is_integral<T>::value, "load32<T> is only intended for integral T");
0134     return AvxIntrinsics::stream_load<__m256i>(mem);
0135 }
0136 
0137 // MSVC workarounds{{{2
0138 #ifdef Vc_MSVC
0139 // work around: "fatal error C1001: An internal error has occurred in the compiler."
0140 Vc_INTRINSIC __m256i load(const uint *mem, when_aligned, LoadTag<__m256i, int>)
0141 {
0142     return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
0143 }
0144 
0145 Vc_INTRINSIC __m256d load(const double *mem, when_unaligned, LoadTag<__m256d, double>)
0146 {
0147     return _mm256_loadu_pd(mem);
0148 }
0149 
0150 template <typename V, typename DstT>
0151 Vc_INTRINSIC __m256 load(const float *mem, when_aligned,
0152                          enable_if<(std::is_same<DstT, float>::value &&
0153                                     std::is_same<V, __m256>::value)> = nullarg)
0154 {
0155     return _mm256_load_ps(mem);
0156 }
0157 
0158 template <typename V, typename DstT>
0159 Vc_INTRINSIC __m256 load(const float *mem, when_unaligned,
0160                          enable_if<(std::is_same<DstT, float>::value &&
0161                                     std::is_same<V, __m256>::value)> = nullarg)
0162 {
0163     return _mm256_loadu_ps(mem);
0164 }
0165 
0166 template <typename V, typename DstT>
0167 Vc_INTRINSIC __m256 load(const float *mem, when_streaming,
0168                          enable_if<(std::is_same<DstT, float>::value &&
0169                                     std::is_same<V, __m256>::value)> = nullarg)
0170 {
0171     return AvxIntrinsics::stream_load<__m256>(mem);
0172 }
0173 
0174 template <typename V, typename DstT>
0175 Vc_INTRINSIC __m256d load(const double *mem, when_aligned,
0176                           enable_if<(std::is_same<DstT, double>::value &&
0177                                      std::is_same<V, __m256d>::value)> = nullarg)
0178 {
0179     return _mm256_load_pd(mem);
0180 }
0181 
0182 template <typename V, typename DstT>
0183 Vc_INTRINSIC __m256d load(const double *mem, when_unaligned,
0184                           enable_if<(std::is_same<DstT, double>::value &&
0185                                      std::is_same<V, __m256d>::value)> = nullarg)
0186 {
0187     return _mm256_loadu_pd(mem);
0188 }
0189 
0190 template <typename V, typename DstT>
0191 Vc_INTRINSIC __m256d load(const double *mem, when_streaming,
0192                           enable_if<(std::is_same<DstT, double>::value &&
0193                                      std::is_same<V, __m256d>::value)> = nullarg)
0194 {
0195     return AvxIntrinsics::stream_load<__m256d>(mem);
0196 }
0197 
0198 template <typename V, typename DstT>
0199 Vc_INTRINSIC __m256i load(const uint *mem, when_aligned,
0200                           enable_if<(std::is_same<DstT, uint>::value &&
0201                                      std::is_same<V, __m256i>::value)> = nullarg)
0202 {
0203     return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
0204 }
0205 
0206 template <typename V, typename DstT>
0207 Vc_INTRINSIC __m256i load(const uint *mem, when_unaligned,
0208                           enable_if<(std::is_same<DstT, uint>::value &&
0209                                      std::is_same<V, __m256i>::value)> = nullarg)
0210 {
0211     return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
0212 }
0213 
0214 template <typename V, typename DstT>
0215 Vc_INTRINSIC __m256i load(const uint *mem, when_streaming,
0216                           enable_if<(std::is_same<DstT, uint>::value &&
0217                                      std::is_same<V, __m256i>::value)> = nullarg)
0218 {
0219     return AvxIntrinsics::stream_load<__m256i>(mem);
0220 }
0221 
0222 template <typename V, typename DstT>
0223 Vc_INTRINSIC __m256i load(const int *mem, when_unaligned,
0224                           enable_if<(std::is_same<DstT, int>::value &&
0225                                      std::is_same<V, __m256i>::value)> = nullarg)
0226 {
0227     return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
0228 }
0229 
0230 template <typename V, typename DstT>
0231 Vc_INTRINSIC __m256i load(const int *mem, when_aligned,
0232                           enable_if<(std::is_same<DstT, int>::value &&
0233                                      std::is_same<V, __m256i>::value)> = nullarg)
0234 {
0235     return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
0236 }
0237 
0238 template <typename V, typename DstT>
0239 Vc_INTRINSIC __m256i load(const int *mem, when_streaming,
0240                           enable_if<(std::is_same<DstT, int>::value &&
0241                                      std::is_same<V, __m256i>::value)> = nullarg)
0242 {
0243     return AvxIntrinsics::stream_load<__m256i>(mem);
0244 }
0245 
0246 template <typename V, typename DstT>
0247 Vc_INTRINSIC __m256i load(const short *mem, when_unaligned,
0248                           enable_if<(std::is_same<DstT, short>::value &&
0249                                      std::is_same<V, __m256i>::value)> = nullarg)
0250 {
0251     return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
0252 }
0253 
0254 template <typename V, typename DstT>
0255 Vc_INTRINSIC __m256i load(const short *mem, when_aligned,
0256                           enable_if<(std::is_same<DstT, short>::value &&
0257                                      std::is_same<V, __m256i>::value)> = nullarg)
0258 {
0259     return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
0260 }
0261 
0262 template <typename V, typename DstT>
0263 Vc_INTRINSIC __m256i load(const short *mem, when_streaming,
0264                           enable_if<(std::is_same<DstT, short>::value &&
0265                                      std::is_same<V, __m256i>::value)> = nullarg)
0266 {
0267     return AvxIntrinsics::stream_load<__m256i>(mem);
0268 }
0269 
0270 template <typename V, typename DstT>
0271 Vc_INTRINSIC __m256i load(const ushort *mem, when_unaligned,
0272                           enable_if<(std::is_same<DstT, ushort>::value &&
0273                                      std::is_same<V, __m256i>::value)> = nullarg)
0274 {
0275     return _mm256_loadu_si256(reinterpret_cast<const __m256i *>(mem));
0276 }
0277 
0278 template <typename V, typename DstT>
0279 Vc_INTRINSIC __m256i load(const ushort *mem, when_aligned,
0280                           enable_if<(std::is_same<DstT, ushort>::value &&
0281                                      std::is_same<V, __m256i>::value)> = nullarg)
0282 {
0283     return _mm256_load_si256(reinterpret_cast<const __m256i *>(mem));
0284 }
0285 
0286 template <typename V, typename DstT>
0287 Vc_INTRINSIC __m256i load(const ushort *mem, when_streaming,
0288                           enable_if<(std::is_same<DstT, ushort>::value &&
0289                                      std::is_same<V, __m256i>::value)> = nullarg)
0290 {
0291     return AvxIntrinsics::stream_load<__m256i>(mem);
0292 }
0293 
0294 #endif  // Vc_MSVC
0295 
0296 // short {{{2
0297 template <typename Flags>
0298 Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, short>)
0299 {
0300     return load32(mem, f);
0301 }
0302 template <typename Flags>
0303 Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, short>)
0304 {
0305     return AVX::cvtepu8_epi16(load16(mem, f));
0306 }
0307 template <typename Flags>
0308 Vc_INTRINSIC __m256i load(const schar *mem, Flags f, LoadTag<__m256i, short>)
0309 {
0310     return AVX::cvtepi8_epi16(load16(mem, f));
0311 }
0312 
0313 // ushort {{{2
0314 template <typename Flags>
0315 Vc_INTRINSIC __m256i load(const uchar *mem, Flags f, LoadTag<__m256i, ushort>)
0316 {
0317     return AVX::cvtepu8_epi16(load16(mem, f));
0318 }
0319 
0320 // int {{{2
0321 template <typename Flags>
0322 Vc_INTRINSIC __m256i load(const uint *mem, Flags f, LoadTag<__m256i, int>)
0323 {
0324     return load32(mem, f);
0325 }
0326 template <typename Flags>
0327 Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, int>)
0328 {
0329     return AVX::cvtepu16_epi32(load16(mem, f));
0330 }
0331 template <typename Flags>
0332 Vc_INTRINSIC __m256i load(const short *mem, Flags f, LoadTag<__m256i, int>)
0333 {
0334     return AVX::cvtepi16_epi32(load16(mem, f));
0335 }
0336 template <typename Flags>
0337 Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, int>)
0338 {
0339     return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
0340 }
0341 template <typename Flags>
0342 Vc_INTRINSIC __m256i load(const schar *mem, Flags, LoadTag<__m256i, int>)
0343 {
0344     return AVX::cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
0345 }
0346 
0347 // uint {{{2
0348 template <typename Flags>
0349 Vc_INTRINSIC __m256i load(const ushort *mem, Flags f, LoadTag<__m256i, uint>)
0350 {
0351     return AVX::cvtepu16_epi32(load16(mem, f));
0352 }
0353 template <typename Flags>
0354 Vc_INTRINSIC __m256i load(const uchar *mem, Flags, LoadTag<__m256i, uint>)
0355 {
0356     return AVX::cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
0357 }
0358 
0359 // double {{{2
0360 template <typename Flags>
0361 Vc_INTRINSIC __m256d load(const float *mem, Flags f, LoadTag<__m256d, double>)
0362 {
0363     return AVX::convert<float, double>(load16(mem, f));
0364 }
0365 template <typename Flags>
0366 Vc_INTRINSIC __m256d load(const uint *mem, Flags f, LoadTag<__m256d, double>)
0367 {
0368     return AVX::convert<uint, double>(load16(mem, f));
0369 }
0370 template <typename Flags>
0371 Vc_INTRINSIC __m256d load(const int *mem, Flags f, LoadTag<__m256d, double>)
0372 {
0373     return AVX::convert<int, double>(load16(mem, f));
0374 }
0375 template <typename Flags>
0376 Vc_INTRINSIC __m256d load(const ushort *mem, Flags f, LoadTag<__m256d, double>)
0377 {
0378     return AVX::convert<int, double>(load16(mem, f));
0379 }
0380 template <typename Flags>
0381 Vc_INTRINSIC __m256d load(const short *mem, Flags f, LoadTag<__m256d, double>)
0382 {
0383     return AVX::convert<int, double>(load16(mem, f));
0384 }
0385 template <typename Flags>
0386 Vc_INTRINSIC __m256d load(const uchar *mem, Flags f, LoadTag<__m256d, double>)
0387 {
0388     return AVX::convert<int, double>(load16(mem, f));
0389 }
0390 template <typename Flags>
0391 Vc_INTRINSIC __m256d load(const schar *mem, Flags f, LoadTag<__m256d, double>)
0392 {
0393     return AVX::convert<int, double>(load16(mem, f));
0394 }
0395 
0396 // float {{{2
0397 template <typename Flags>
0398 Vc_INTRINSIC __m256 load(const double *mem, Flags f, LoadTag<__m256, float>)
0399 {
0400     return AVX::concat(_mm256_cvtpd_ps(load32(&mem[0], f)),
0401                        _mm256_cvtpd_ps(load32(&mem[4], f)));
0402 }
0403 template <typename Flags>
0404 Vc_INTRINSIC __m256 load(const uint *mem, Flags f, LoadTag<__m256, float>)
0405 {
0406     const auto v = load32(mem, f);
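    // cvtepi32_ps would interpret lanes with the sign bit set (values >= 2^31) as negative.
    // For those lanes: subtract 2^31 in the integer domain, convert, then add 2^31 back as a
    // float. blendv picks the corrected lanes using the sign bit of v as the mask.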
0407     return _mm256_blendv_ps(
0408         _mm256_cvtepi32_ps(v),
0409         _mm256_add_ps(_mm256_cvtepi32_ps(AVX::sub_epi32(v, AVX::set2power31_epu32())),
0410                       AVX::set2power31_ps()),
0411         _mm256_castsi256_ps(AVX::cmplt_epi32(v, _mm256_setzero_si256())));
0412 }
0413 template <typename Flags>
0414 Vc_INTRINSIC __m256 load(const int *mem, Flags f, LoadTag<__m256, float>)
0415 {
0416     return AVX::convert<int, float>(load32(mem, f));
0417 }
0418 template <typename T, typename Flags,
0419           typename = enable_if<!std::is_same<T, float>::value>>
0420 Vc_INTRINSIC __m256 load(const T *mem, Flags f, LoadTag<__m256, float>)
0421 {
0422     return _mm256_cvtepi32_ps(load<__m256i, int>(mem, f));
0423 }
0424 template <typename Flags>
0425 Vc_INTRINSIC __m256 load(const ushort *mem, Flags f, LoadTag<__m256, float>)
0426 {
0427     return AVX::convert<ushort, float>(load16(mem, f));
0428 }
0429 template <typename Flags>
0430 Vc_INTRINSIC __m256 load(const short *mem, Flags f, LoadTag<__m256, float>)
0431 {
0432     return AVX::convert<short, float>(load16(mem, f));
0433 }
0434 /*
0435 template<typename Flags> struct LoadHelper<float, unsigned char, Flags> {
0436     static __m256 load(const unsigned char *mem, Flags)
0437     {
0438         return _mm256_cvtepi32_ps(
0439             cvtepu8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem))));
0440     }
0441 };
0442 template<typename Flags> struct LoadHelper<float, signed char, Flags> {
0443     static __m256 load(const signed char *mem, Flags)
0444     {
0445         return _mm256_cvtepi32_ps(
0446             cvtepi8_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem))));
0447     }
0448 };
0449 */
0450 
0451 // shifted{{{1
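// The compile-time shifts below operate on whole 256-bit vectors but have to be assembled
// from 128-bit operations: shifts of 16 bytes or more reduce to shifting a single half into
// an otherwise zero vector, while smaller shifts combine the two halves with alignr after a
// permute128 that moves the bytes crossing the lane boundary into place.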
0452 template <int amount, typename T>
0453 Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount >= 16), T> shifted(T k)
0454 {
0455     return AVX::avx_cast<T>(AVX::zeroExtend(
0456         _mm_srli_si128(AVX::hi128(AVX::avx_cast<__m256i>(k)), amount - 16)));
0457 }
0458 template <int amount, typename T>
0459 Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > 0 && amount < 16), T>
0460 shifted(T k)
0461 {
0462     return AVX::avx_cast<T>(
0463         AVX::alignr<amount>(Mem::permute128<X1, Const0>(AVX::avx_cast<__m256i>(k)),
0464                             AVX::avx_cast<__m256i>(k)));
0465 }
0466 template <int amount, typename T>
0467 Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount <= -16), T> shifted(T k)
0468 {
0469     return AVX::avx_cast<T>(Mem::permute128<Const0, X0>(AVX::avx_cast<__m256i>(
0470         _mm_slli_si128(AVX::lo128(AVX::avx_cast<__m256i>(k)), -16 - amount))));
0471 }
0472 template <int amount, typename T>
0473 Vc_INTRINSIC Vc_CONST enable_if<(sizeof(T) == 32 && amount > -16 && amount < 0), T>
0474 shifted(T k)
0475 {
0476     return AVX::avx_cast<T>(
0477         AVX::alignr<16 + amount>(AVX::avx_cast<__m256i>(k),
0478                                  Mem::permute128<Const0, X0>(AVX::avx_cast<__m256i>(k))));
0479 }
0480 // mask_cast{{{1
0481 template<size_t From, size_t To, typename R> Vc_INTRINSIC Vc_CONST R mask_cast(__m256i k)
0482 {
0483     static_assert(From == To, "Incorrect mask cast.");
0484     static_assert(std::is_same<R, __m256>::value, "Incorrect mask cast.");
0485     return AVX::avx_cast<__m256>(k);
0486 }
0487 
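// In the specializations below, From and To are the number of mask entries. The letter
// patterns in the comments show the per-entry layout in 16/32-bit chunks, e.g.
// "aabb ccdd -> abcd 0000" turns four 64-bit mask entries into four 32-bit entries followed
// by zeros.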
0488 // 4 -> 4
0489 template <> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 4, __m128>(__m256i k)
0490 {
0491     return AVX::avx_cast<__m128>(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)));
0492 }
0493 
0494 template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 4, __m256>(__m128i k)
0495 {
0496     const auto kk = _mm_castsi128_ps(k);
0497     return AVX::concat(_mm_unpacklo_ps(kk, kk), _mm_unpackhi_ps(kk, kk));
0498 }
0499 
0500 // 4 -> 8
0501 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m256i k)
0502 {
0503     // aabb ccdd -> abcd 0000
0504     return AVX::avx_cast<__m256>(AVX::concat(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)),
0505                                  _mm_setzero_si128()));
0506 }
0507 
0508 template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<4, 8, __m128>(__m256i k)
0509 {
0510     // aaaa bbbb cccc dddd -> abcd 0000
0511     return AVX::avx_cast<__m128>(_mm_packs_epi16(_mm_packs_epi32(AVX::lo128(k), AVX::hi128(k)), _mm_setzero_si128()));
0512 }
0513 
0514 template <> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 8, __m256>(__m128i k)
0515 {
0516     return AVX::zeroExtend(AVX::avx_cast<__m128>(k));
0517 }
0518 
0519 // 4 -> 16
0520 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<4, 16, __m256>(__m256i k)
0521 {
0522     // aaaa bbbb cccc dddd -> abcd 0000 0000 0000
0523     return AVX::zeroExtend(mask_cast<4, 8, __m128>(k));
0524 }
0525 
0526 // 8 -> 4
0527 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m256i k)
0528 {
0529     // aabb ccdd eeff gghh -> aaaa bbbb cccc dddd
0530     const auto lo = AVX::lo128(AVX::avx_cast<__m256>(k));
0531     return AVX::concat(_mm_unpacklo_ps(lo, lo),
0532                   _mm_unpackhi_ps(lo, lo));
0533 }
0534 
0535 template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 4, __m128>(__m256i k)
0536 {
0537     return AVX::avx_cast<__m128>(AVX::lo128(k));
0538 }
0539 
0540 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 4, __m256>(__m128i k)
0541 {
0542     // abcd efgh -> aaaa bbbb cccc dddd
0543     const auto tmp = _mm_unpacklo_epi16(k, k); // aa bb cc dd
0544     return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi32(tmp, tmp), // aaaa bbbb
0545                                  _mm_unpackhi_epi32(tmp, tmp))); // cccc dddd
0546 }
0547 
0548 // 8 -> 8
0549 template<> Vc_INTRINSIC Vc_CONST __m128 mask_cast<8, 8, __m128>(__m256i k)
0550 {
0551     // aabb ccdd eeff gghh -> abcd efgh
0552     return AVX::avx_cast<__m128>(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
0553 }
0554 
0555 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 8, __m256>(__m128i k)
0556 {
0557     return AVX::avx_cast<__m256>(AVX::concat(_mm_unpacklo_epi16(k, k),
0558                                  _mm_unpackhi_epi16(k, k)));
0559 }
0560 
0561 // 8 -> 16
0562 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<8, 16, __m256>(__m256i k)
0563 {
0564     // aabb ccdd eeff gghh -> abcd efgh 0000 0000
0565     return AVX::zeroExtend(mask_cast<8, 8, __m128>(k));
0566 }
0567 
0568 // 16 -> 8
0569 #ifdef Vc_IMPL_AVX2
0570 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 8, __m256>(__m256i k)
0571 {
0572     // abcd efgh ijkl mnop -> aabb ccdd eeff gghh
0573     const auto flipped = Mem::permute4x64<X0, X2, X1, X3>(k);
0574     return _mm256_castsi256_ps(AVX::unpacklo_epi16(flipped, flipped));
0575 }
0576 #endif
0577 
0578 // 16 -> 4
0579 template<> Vc_INTRINSIC Vc_CONST __m256 mask_cast<16, 4, __m256>(__m256i k)
0580 {
0581     // abcd efgh ijkl mnop -> aaaa bbbb cccc dddd
0582     const auto tmp = _mm_unpacklo_epi16(AVX::lo128(k), AVX::lo128(k)); // aabb ccdd
0583     return _mm256_castsi256_ps(AVX::concat(_mm_unpacklo_epi32(tmp, tmp), _mm_unpackhi_epi32(tmp, tmp)));
0584 }
0585 
0586 // allone{{{1
0587 template<> Vc_INTRINSIC Vc_CONST __m256  allone<__m256 >() { return AVX::setallone_ps(); }
0588 template<> Vc_INTRINSIC Vc_CONST __m256i allone<__m256i>() { return AVX::setallone_si256(); }
0589 template<> Vc_INTRINSIC Vc_CONST __m256d allone<__m256d>() { return AVX::setallone_pd(); }
0590 
0591 // zero{{{1
0592 template<> Vc_INTRINSIC Vc_CONST __m256  zero<__m256 >() { return _mm256_setzero_ps(); }
0593 template<> Vc_INTRINSIC Vc_CONST __m256i zero<__m256i>() { return _mm256_setzero_si256(); }
0594 template<> Vc_INTRINSIC Vc_CONST __m256d zero<__m256d>() { return _mm256_setzero_pd(); }
0595 
0596 // one{{{1
0597 Vc_INTRINSIC Vc_CONST __m256  one( float) { return AVX::setone_ps   (); }
0598 Vc_INTRINSIC Vc_CONST __m256d one(double) { return AVX::setone_pd   (); }
0599 Vc_INTRINSIC Vc_CONST __m256i one(   int) { return AVX::setone_epi32(); }
0600 Vc_INTRINSIC Vc_CONST __m256i one(  uint) { return AVX::setone_epu32(); }
0601 Vc_INTRINSIC Vc_CONST __m256i one( short) { return AVX::setone_epi16(); }
0602 Vc_INTRINSIC Vc_CONST __m256i one(ushort) { return AVX::setone_epu16(); }
0603 Vc_INTRINSIC Vc_CONST __m256i one( schar) { return AVX::setone_epi8 (); }
0604 Vc_INTRINSIC Vc_CONST __m256i one( uchar) { return AVX::setone_epu8 (); }
0605 
0606 // negate{{{1
0607 Vc_ALWAYS_INLINE Vc_CONST __m256 negate(__m256 v, std::integral_constant<std::size_t, 4>)
0608 {
0609     return _mm256_xor_ps(v, AVX::setsignmask_ps());
0610 }
0611 Vc_ALWAYS_INLINE Vc_CONST __m256d negate(__m256d v, std::integral_constant<std::size_t, 8>)
0612 {
0613     return _mm256_xor_pd(v, AVX::setsignmask_pd());
0614 }
0615 Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 4>)
0616 {
0617     return AVX::sign_epi32(v, Detail::allone<__m256i>());
0618 }
0619 Vc_ALWAYS_INLINE Vc_CONST __m256i negate(__m256i v, std::integral_constant<std::size_t, 2>)
0620 {
0621     return AVX::sign_epi16(v, Detail::allone<__m256i>());
0622 }
0623 
0624 // xor_{{{1
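// Without AVX2 there are no 256-bit integer logic instructions, so the __m256i overloads in
// this and the following sections fall back to the float-domain instructions through
// bit-preserving casts.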
0625 Vc_INTRINSIC __m256 xor_(__m256 a, __m256 b) { return _mm256_xor_ps(a, b); }
0626 Vc_INTRINSIC __m256d xor_(__m256d a, __m256d b) { return _mm256_xor_pd(a, b); }
0627 Vc_INTRINSIC __m256i xor_(__m256i a, __m256i b)
0628 {
0629 #ifdef Vc_IMPL_AVX2
0630     return _mm256_xor_si256(a, b);
0631 #else
0632     return _mm256_castps_si256(
0633         _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
0634 #endif
0635 }
0636 
0637 // or_{{{1
0638 Vc_INTRINSIC __m256 or_(__m256 a, __m256 b) { return _mm256_or_ps(a, b); }
0639 Vc_INTRINSIC __m256d or_(__m256d a, __m256d b) { return _mm256_or_pd(a, b); }
0640 Vc_INTRINSIC __m256i or_(__m256i a, __m256i b)
0641 {
0642 #ifdef Vc_IMPL_AVX2
0643     return _mm256_or_si256(a, b);
0644 #else
0645     return _mm256_castps_si256(
0646         _mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
0647 #endif
0648 }
0649 
0650 // and_{{{1
0651 Vc_INTRINSIC __m256 and_(__m256 a, __m256 b) { return _mm256_and_ps(a, b); }
0652 Vc_INTRINSIC __m256d and_(__m256d a, __m256d b) { return _mm256_and_pd(a, b); }
0653 Vc_INTRINSIC __m256i and_(__m256i a, __m256i b) {
0654 #ifdef Vc_IMPL_AVX2
0655     return _mm256_and_si256(a, b);
0656 #else
0657     return _mm256_castps_si256(
0658         _mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
0659 #endif
0660 }
0661 
0662 // andnot_{{{1
0663 Vc_INTRINSIC __m256 andnot_(__m256 a, __m256 b) { return _mm256_andnot_ps(a, b); }
0664 Vc_INTRINSIC __m256d andnot_(__m256d a, __m256d b) { return _mm256_andnot_pd(a, b); }
0665 Vc_INTRINSIC __m256i andnot_(__m256i a, __m256i b)
0666 {
0667 #ifdef Vc_IMPL_AVX2
0668     return _mm256_andnot_si256(a, b);
0669 #else
0670     return _mm256_castps_si256(
0671         _mm256_andnot_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
0672 #endif
0673 }
0674 
0675 // not_{{{1
0676 Vc_INTRINSIC __m256  not_(__m256  a) { return andnot_(a, allone<__m256 >()); }
0677 Vc_INTRINSIC __m256d not_(__m256d a) { return andnot_(a, allone<__m256d>()); }
0678 Vc_INTRINSIC __m256i not_(__m256i a) { return andnot_(a, allone<__m256i>()); }
0679 
0680 // blend{{{1
0681 Vc_INTRINSIC __m256  blend(__m256  a, __m256  b, __m256  c) { return _mm256_blendv_ps(a, b, c); }
0682 Vc_INTRINSIC __m256d blend(__m256d a, __m256d b, __m256d c) { return _mm256_blendv_pd(a, b, c); }
0683 Vc_INTRINSIC __m256i blend(__m256i a, __m256i b, __m256i c) { return AVX::blendv_epi8(a, b, c); }
0684 
0685 // abs{{{1
0686 Vc_INTRINSIC __m256  abs(__m256  a,  float) { return and_(a, AVX::setabsmask_ps()); }
0687 Vc_INTRINSIC __m256d abs(__m256d a, double) { return and_(a, AVX::setabsmask_pd()); }
0688 Vc_INTRINSIC __m256i abs(__m256i a,    int) { return AVX::abs_epi32(a); }
0689 Vc_INTRINSIC __m256i abs(__m256i a,   uint) { return a; }
0690 Vc_INTRINSIC __m256i abs(__m256i a,  short) { return AVX::abs_epi16(a); }
0691 Vc_INTRINSIC __m256i abs(__m256i a, ushort) { return a; }
0692 Vc_INTRINSIC __m256i abs(__m256i a,  schar) { return AVX::abs_epi8 (a); }
0693 Vc_INTRINSIC __m256i abs(__m256i a,  uchar) { return a; }
0694 
0695 // add{{{1
0696 Vc_INTRINSIC __m256  add(__m256  a, __m256  b,  float) { return _mm256_add_ps(a, b); }
0697 Vc_INTRINSIC __m256d add(__m256d a, __m256d b, double) { return _mm256_add_pd(a, b); }
0698 Vc_INTRINSIC __m256i add(__m256i a, __m256i b,    int) { return AVX::add_epi32(a, b); }
0699 Vc_INTRINSIC __m256i add(__m256i a, __m256i b,   uint) { return AVX::add_epi32(a, b); }
0700 Vc_INTRINSIC __m256i add(__m256i a, __m256i b,  short) { return AVX::add_epi16(a, b); }
0701 Vc_INTRINSIC __m256i add(__m256i a, __m256i b, ushort) { return AVX::add_epi16(a, b); }
0702 
0703 // sub{{{1
0704 Vc_INTRINSIC __m256  sub(__m256  a, __m256  b,  float) { return _mm256_sub_ps(a, b); }
0705 Vc_INTRINSIC __m256d sub(__m256d a, __m256d b, double) { return _mm256_sub_pd(a, b); }
0706 Vc_INTRINSIC __m256i sub(__m256i a, __m256i b,    int) { return AVX::sub_epi32(a, b); }
0707 Vc_INTRINSIC __m256i sub(__m256i a, __m256i b,   uint) { return AVX::sub_epi32(a, b); }
0708 Vc_INTRINSIC __m256i sub(__m256i a, __m256i b,  short) { return AVX::sub_epi16(a, b); }
0709 Vc_INTRINSIC __m256i sub(__m256i a, __m256i b, ushort) { return AVX::sub_epi16(a, b); }
0710 
0711 // mul{{{1
0712 Vc_INTRINSIC __m256  mul(__m256  a, __m256  b,  float) { return _mm256_mul_ps(a, b); }
0713 Vc_INTRINSIC __m256d mul(__m256d a, __m256d b, double) { return _mm256_mul_pd(a, b); }
0714 Vc_INTRINSIC __m256i mul(__m256i a, __m256i b,    int) { return AVX::mullo_epi32(a, b); }
0715 Vc_INTRINSIC __m256i mul(__m256i a, __m256i b,   uint) { return AVX::mullo_epi32(a, b); }
0716 Vc_INTRINSIC __m256i mul(__m256i a, __m256i b,  short) { return AVX::mullo_epi16(a, b); }
0717 Vc_INTRINSIC __m256i mul(__m256i a, __m256i b, ushort) { return AVX::mullo_epi16(a, b); }
0718 
0719 // div{{{1
0720 Vc_INTRINSIC __m256  div(__m256  a, __m256  b,  float) { return _mm256_div_ps(a, b); }
0721 Vc_INTRINSIC __m256d div(__m256d a, __m256d b, double) { return _mm256_div_pd(a, b); }
0722 Vc_INTRINSIC __m256i div(__m256i a, __m256i b,    int) {
0723     using namespace AVX;
0724     const __m256d lo1 = _mm256_cvtepi32_pd(lo128(a));
0725     const __m256d lo2 = _mm256_cvtepi32_pd(lo128(b));
0726     const __m256d hi1 = _mm256_cvtepi32_pd(hi128(a));
0727     const __m256d hi2 = _mm256_cvtepi32_pd(hi128(b));
0728     return concat(_mm256_cvttpd_epi32(_mm256_div_pd(lo1, lo2)),
0729                   _mm256_cvttpd_epi32(_mm256_div_pd(hi1, hi2)));
0730 }
0731 Vc_INTRINSIC __m256i div(__m256i a, __m256i b,   uint) {
0732     // SSE/AVX only provides signed int -> double conversion, so we bias the inputs before
0733     // the conversion and remove the bias again afterwards.
0734     // One could argue that this hardly matters for b, because dividing by a value >= 2^31 is
0735     // rarely useful. For full correctness, however, it cannot be ignored.
0736     using namespace AVX;
0737     const __m256i aa = add_epi32(a, set1_epi32(-2147483648));
0738     const __m256i bb = add_epi32(b, set1_epi32(-2147483648));
0739     const __m256d loa = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(aa)), set1_pd(2147483648.));
0740     const __m256d hia = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(aa)), set1_pd(2147483648.));
0741     const __m256d lob = _mm256_add_pd(_mm256_cvtepi32_pd(lo128(bb)), set1_pd(2147483648.));
0742     const __m256d hib = _mm256_add_pd(_mm256_cvtepi32_pd(hi128(bb)), set1_pd(2147483648.));
0743     // One remaining problem: for a >= 2^31 and b == 1 the quotient does not fit into int32 and
0744     // cvttpd_epi32 would return the indefinite value 0x80000000, so blend in a where b == 1.
0745     return avx_cast<__m256i>(_mm256_blendv_ps(
0746         avx_cast<__m256>(concat(_mm256_cvttpd_epi32(_mm256_div_pd(loa, lob)),
0747                                           _mm256_cvttpd_epi32(_mm256_div_pd(hia, hib)))),
0748         avx_cast<__m256>(a),
0749         avx_cast<__m256>(cmpeq_epi32(b, setone_epi32()))));
0750 }
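// A scalar sketch of the bias trick used above, for illustration only (plain C++, assuming
// <cstdint>; not part of this header): shift the unsigned value into signed range, convert,
// and add 2^31 back in the floating-point domain, where the result is exact because
// 2^31 < 2^53.
/*
double uint32_to_double(std::uint32_t u)
{
    // u - 2^31 lies in [-2^31, 2^31) and therefore fits into int32
    const std::int32_t biased =
        static_cast<std::int32_t>(static_cast<std::int64_t>(u) - (std::int64_t(1) << 31));
    return static_cast<double>(biased) + 2147483648.0;  // undo the bias
}
*/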
0751 Vc_INTRINSIC __m256i div(__m256i a, __m256i b,  short) {
0752     using namespace AVX;
0753     const __m256 lo =
0754         _mm256_div_ps(convert<short, float>(lo128(a)), convert<short, float>(lo128(b)));
0755     const __m256 hi =
0756         _mm256_div_ps(convert<short, float>(hi128(a)), convert<short, float>(hi128(b)));
0757     return concat(convert<float, short>(lo), convert<float, short>(hi));
0758 }
0759 
0760 // horizontal add{{{1
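// The horizontal reductions fold the two 128-bit halves with the 128-bit binary operation and
// then call the 128-bit horizontal reduction, both provided through ../sse/detail.h.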
0761 template <typename T> Vc_INTRINSIC T add(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
0762 {
0763     return {add(add(AVX::lo128(a), AVX::hi128(a), T()), T())};
0764 }
0765 
0766 // horizontal mul{{{1
0767 template <typename T> Vc_INTRINSIC T mul(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
0768 {
0769     return {mul(mul(AVX::lo128(a), AVX::hi128(a), T()), T())};
0770 }
0771 
0772 // horizontal min{{{1
0773 template <typename T> Vc_INTRINSIC T min(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
0774 {
0775     return {min(min(AVX::lo128(a), AVX::hi128(a), T()), T())};
0776 }
0777 
0778 // horizontal max{{{1
0779 template <typename T> Vc_INTRINSIC T max(Common::IntrinsicType<T, 32 / sizeof(T)> a, T)
0780 {
0781     return {max(max(AVX::lo128(a), AVX::hi128(a), T()), T())};
0782 }
0783 // cmpeq{{{1
0784 Vc_INTRINSIC __m256  cmpeq(__m256  a, __m256  b,  float) { return AvxIntrinsics::cmpeq_ps(a, b); }
0785 Vc_INTRINSIC __m256d cmpeq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpeq_pd(a, b); }
0786 Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b,    int) { return AvxIntrinsics::cmpeq_epi32(a, b); }
0787 Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b,   uint) { return AvxIntrinsics::cmpeq_epi32(a, b); }
0788 Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b,  short) { return AvxIntrinsics::cmpeq_epi16(a, b); }
0789 Vc_INTRINSIC __m256i cmpeq(__m256i a, __m256i b, ushort) { return AvxIntrinsics::cmpeq_epi16(a, b); }
0790 
0791 // cmpneq{{{1
0792 Vc_INTRINSIC __m256  cmpneq(__m256  a, __m256  b,  float) { return AvxIntrinsics::cmpneq_ps(a, b); }
0793 Vc_INTRINSIC __m256d cmpneq(__m256d a, __m256d b, double) { return AvxIntrinsics::cmpneq_pd(a, b); }
0794 Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b,    int) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); }
0795 Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b,   uint) { return not_(AvxIntrinsics::cmpeq_epi32(a, b)); }
0796 Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b,  short) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); }
0797 Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b, ushort) { return not_(AvxIntrinsics::cmpeq_epi16(a, b)); }
0798 Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b,  schar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); }
0799 Vc_INTRINSIC __m256i cmpneq(__m256i a, __m256i b,  uchar) { return not_(AvxIntrinsics::cmpeq_epi8 (a, b)); }
0800 
0801 // cmpgt{{{1
0802 Vc_INTRINSIC __m256  cmpgt(__m256  a, __m256  b,  float) { return AVX::cmpgt_ps(a, b); }
0803 Vc_INTRINSIC __m256d cmpgt(__m256d a, __m256d b, double) { return AVX::cmpgt_pd(a, b); }
0804 Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b,    int) { return AVX::cmpgt_epi32(a, b); }
0805 Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b,   uint) { return AVX::cmpgt_epu32(a, b); }
0806 Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b,  short) { return AVX::cmpgt_epi16(a, b); }
0807 Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(a, b); }
0808 Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b,  schar) { return AVX::cmpgt_epi8 (a, b); }
0809 Vc_INTRINSIC __m256i cmpgt(__m256i a, __m256i b,  uchar) { return AVX::cmpgt_epu8 (a, b); }
0810 
0811 // cmpge{{{1
0812 Vc_INTRINSIC __m256  cmpge(__m256  a, __m256  b,  float) { return AVX::cmpge_ps(a, b); }
0813 Vc_INTRINSIC __m256d cmpge(__m256d a, __m256d b, double) { return AVX::cmpge_pd(a, b); }
0814 Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b,    int) { return not_(AVX::cmpgt_epi32(b, a)); }
0815 Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b,   uint) { return not_(AVX::cmpgt_epu32(b, a)); }
0816 Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b,  short) { return not_(AVX::cmpgt_epi16(b, a)); }
0817 Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(b, a)); }
0818 Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b,  schar) { return not_(AVX::cmpgt_epi8 (b, a)); }
0819 Vc_INTRINSIC __m256i cmpge(__m256i a, __m256i b,  uchar) { return not_(AVX::cmpgt_epu8 (b, a)); }
0820 
0821 // cmple{{{1
0822 Vc_INTRINSIC __m256  cmple(__m256  a, __m256  b,  float) { return AVX::cmple_ps(a, b); }
0823 Vc_INTRINSIC __m256d cmple(__m256d a, __m256d b, double) { return AVX::cmple_pd(a, b); }
0824 Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b,    int) { return not_(AVX::cmpgt_epi32(a, b)); }
0825 Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b,   uint) { return not_(AVX::cmpgt_epu32(a, b)); }
0826 Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b,  short) { return not_(AVX::cmpgt_epi16(a, b)); }
0827 Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b, ushort) { return not_(AVX::cmpgt_epu16(a, b)); }
0828 Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b,  schar) { return not_(AVX::cmpgt_epi8 (a, b)); }
0829 Vc_INTRINSIC __m256i cmple(__m256i a, __m256i b,  uchar) { return not_(AVX::cmpgt_epu8 (a, b)); }
0830 
0831 // cmplt{{{1
0832 Vc_INTRINSIC __m256  cmplt(__m256  a, __m256  b,  float) { return AVX::cmplt_ps(a, b); }
0833 Vc_INTRINSIC __m256d cmplt(__m256d a, __m256d b, double) { return AVX::cmplt_pd(a, b); }
0834 Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b,    int) { return AVX::cmpgt_epi32(b, a); }
0835 Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b,   uint) { return AVX::cmpgt_epu32(b, a); }
0836 Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b,  short) { return AVX::cmpgt_epi16(b, a); }
0837 Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b, ushort) { return AVX::cmpgt_epu16(b, a); }
0838 Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b,  schar) { return AVX::cmpgt_epi8 (b, a); }
0839 Vc_INTRINSIC __m256i cmplt(__m256i a, __m256i b,  uchar) { return AVX::cmpgt_epu8 (b, a); }
0840 
0841 // fma{{{1
0842 Vc_INTRINSIC __m256 fma(__m256  a, __m256  b, __m256  c,  float) {
0843 #ifdef Vc_IMPL_FMA4
0844     return _mm256_macc_ps(a, b, c);
0845 #elif defined Vc_IMPL_FMA
0846     return _mm256_fmadd_ps(a, b, c);
0847 #else
0848     using namespace AVX;
0849     __m256d v1_0 = _mm256_cvtps_pd(lo128(a));
0850     __m256d v1_1 = _mm256_cvtps_pd(hi128(a));
0851     __m256d v2_0 = _mm256_cvtps_pd(lo128(b));
0852     __m256d v2_1 = _mm256_cvtps_pd(hi128(b));
0853     __m256d v3_0 = _mm256_cvtps_pd(lo128(c));
0854     __m256d v3_1 = _mm256_cvtps_pd(hi128(c));
0855     return concat(_mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_0, v2_0), v3_0)),
0856                   _mm256_cvtpd_ps(_mm256_add_pd(_mm256_mul_pd(v1_1, v2_1), v3_1)));
0857 #endif
0858 }
0859 Vc_INTRINSIC __m256d fma(__m256d a, __m256d b, __m256d c, double)
0860 {
0861 #ifdef Vc_IMPL_FMA4
0862     return _mm256_macc_pd(a, b, c);
0863 #elif defined Vc_IMPL_FMA
0864     return _mm256_fmadd_pd(a, b, c);
0865 #else
0866     using namespace AVX;
0867     __m256d h1 = and_(a, _mm256_broadcast_sd(reinterpret_cast<const double *>(
0868                              &c_general::highMaskDouble)));
0869     __m256d h2 = and_(b, _mm256_broadcast_sd(reinterpret_cast<const double *>(
0870                              &c_general::highMaskDouble)));
0871     const __m256d l1 = _mm256_sub_pd(a, h1);
0872     const __m256d l2 = _mm256_sub_pd(b, h2);
0873     const __m256d ll = mul(l1, l2, double());
0874     const __m256d lh = add(mul(l1, h2, double()), mul(h1, l2, double()), double());
0875     const __m256d hh = mul(h1, h2, double());
0876     // it is certain that ll < lh < hh for all entries
0877     const __m256d lh_lt_v3 = cmplt(abs(lh, double()), abs(c, double()), double());  // |lh| < |c|
0878     const __m256d x = _mm256_blendv_pd(c, lh, lh_lt_v3);
0879     const __m256d y = _mm256_blendv_pd(lh, c, lh_lt_v3);
0880     return add(add(ll, x, double()), add(y, hh, double()), double());
0881 #endif
0882 }
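// The non-FMA path above is based on splitting each operand into a "high" part (low mantissa
// bits masked off) and the exact remainder, so that the partial products hh, lh and ll carry
// far less rounding error than a plain a*b, and the ordered summation with c recovers most of
// the fused result. A scalar sketch of the split step only (illustration; the mask value is an
// assumption and need not match c_general::highMaskDouble exactly):
/*
#include <cstdint>
#include <cstring>
void split(double x, double &hi, double &lo)
{
    std::uint64_t bits;
    std::memcpy(&bits, &x, sizeof bits);
    bits &= 0xfffffffff8000000ull;  // keep sign, exponent and the upper mantissa bits
    std::memcpy(&hi, &bits, sizeof hi);
    lo = x - hi;                    // exact: lo is the part that was masked away
}
*/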
0883 template <typename T> Vc_INTRINSIC __m256i fma(__m256i a, __m256i b, __m256i c, T)
0884 {
0885     return add(mul(a, b, T()), c, T());
0886 }
0887 
0888 // shiftRight{{{1
0889 template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a,    int) { return AVX::srai_epi32<shift>(a); }
0890 template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a,   uint) { return AVX::srli_epi32<shift>(a); }
0891 template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a,  short) { return AVX::srai_epi16<shift>(a); }
0892 template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a, ushort) { return AVX::srli_epi16<shift>(a); }
0893 //template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a,  schar) { return AVX::srai_epi8 <shift>(a); }
0894 //template <int shift> Vc_INTRINSIC __m256i shiftRight(__m256i a,  uchar) { return AVX::srli_epi8 <shift>(a); }
0895 
0896 Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift,    int) { return AVX::sra_epi32(a, _mm_cvtsi32_si128(shift)); }
0897 Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift,   uint) { return AVX::srl_epi32(a, _mm_cvtsi32_si128(shift)); }
0898 Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift,  short) { return AVX::sra_epi16(a, _mm_cvtsi32_si128(shift)); }
0899 Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift, ushort) { return AVX::srl_epi16(a, _mm_cvtsi32_si128(shift)); }
0900 //Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift,  schar) { return AVX::sra_epi8 (a, _mm_cvtsi32_si128(shift)); }
0901 //Vc_INTRINSIC __m256i shiftRight(__m256i a, int shift,  uchar) { return AVX::srl_epi8 (a, _mm_cvtsi32_si128(shift)); }
0902 
0903 // shiftLeft{{{1
0904 template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a,    int) { return AVX::slli_epi32<shift>(a); }
0905 template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a,   uint) { return AVX::slli_epi32<shift>(a); }
0906 template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a,  short) { return AVX::slli_epi16<shift>(a); }
0907 template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a, ushort) { return AVX::slli_epi16<shift>(a); }
0908 //template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a,  schar) { return AVX::slli_epi8 <shift>(a); }
0909 //template <int shift> Vc_INTRINSIC __m256i shiftLeft(__m256i a,  uchar) { return AVX::slli_epi8 <shift>(a); }
0910 
0911 Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift,    int) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); }
0912 Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift,   uint) { return AVX::sll_epi32(a, _mm_cvtsi32_si128(shift)); }
0913 Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift,  short) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); }
0914 Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift, ushort) { return AVX::sll_epi16(a, _mm_cvtsi32_si128(shift)); }
0915 //Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift,  schar) { return AVX::sll_epi8 (a, _mm_cvtsi32_si128(shift)); }
0916 //Vc_INTRINSIC __m256i shiftLeft(__m256i a, int shift,  uchar) { return AVX::sll_epi8 (a, _mm_cvtsi32_si128(shift)); }
0917 
0918 // zeroExtendIfNeeded{{{1
0919 Vc_INTRINSIC __m256  zeroExtendIfNeeded(__m256  x) { return x; }
0920 Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m256d x) { return x; }
0921 Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m256i x) { return x; }
0922 Vc_INTRINSIC __m256  zeroExtendIfNeeded(__m128  x) { return AVX::zeroExtend(x); }
0923 Vc_INTRINSIC __m256d zeroExtendIfNeeded(__m128d x) { return AVX::zeroExtend(x); }
0924 Vc_INTRINSIC __m256i zeroExtendIfNeeded(__m128i x) { return AVX::zeroExtend(x); }
0925 
0926 // broadcast{{{1
0927 Vc_INTRINSIC __m256  avx_broadcast( float x) { return _mm256_set1_ps(x); }
0928 Vc_INTRINSIC __m256d avx_broadcast(double x) { return _mm256_set1_pd(x); }
0929 Vc_INTRINSIC __m256i avx_broadcast(   int x) { return _mm256_set1_epi32(x); }
0930 Vc_INTRINSIC __m256i avx_broadcast(  uint x) { return _mm256_set1_epi32(x); }
0931 Vc_INTRINSIC __m256i avx_broadcast( short x) { return _mm256_set1_epi16(x); }
0932 Vc_INTRINSIC __m256i avx_broadcast(ushort x) { return _mm256_set1_epi16(x); }
0933 Vc_INTRINSIC __m256i avx_broadcast(  char x) { return _mm256_set1_epi8(x); }
0934 Vc_INTRINSIC __m256i avx_broadcast( schar x) { return _mm256_set1_epi8(x); }
0935 Vc_INTRINSIC __m256i avx_broadcast( uchar x) { return _mm256_set1_epi8(x); }
0936 
0937 // sorted{{{1
0938 template <Vc::Implementation Impl, typename T,
0939           typename = enable_if<(Impl >= AVXImpl && Impl <= AVX2Impl)>>
0940 Vc_CONST_L AVX2::Vector<T> Vc_VDECL sorted(AVX2::Vector<T> x) Vc_CONST_R;
0941 template <typename T> Vc_INTRINSIC Vc_CONST AVX2::Vector<T> sorted(AVX2::Vector<T> x)
0942 {
0943     return sorted<CurrentImplementation::current()>(x);
0944 }
0945 
0946 // shifted{{{1
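// The compile-time shifted<amount>() overloads above require immediate shift amounts, so the
// run-time variants below dispatch through a switch that instantiates every shift that is
// valid for the element size; amounts outside that range yield a zero vector.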
0947 template <typename T, typename V>
0948 static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32), V> shifted(V v, int amount)
0949 {
0950     using namespace AVX;
0951     constexpr int S = sizeof(T);
0952     switch (amount) {
0953     case  0: return v;
0954     case  1: return shifted<sanitize<V>( 1 * S)>(v);
0955     case  2: return shifted<sanitize<V>( 2 * S)>(v);
0956     case  3: return shifted<sanitize<V>( 3 * S)>(v);
0957     case -1: return shifted<sanitize<V>(-1 * S)>(v);
0958     case -2: return shifted<sanitize<V>(-2 * S)>(v);
0959     case -3: return shifted<sanitize<V>(-3 * S)>(v);
0960     }
0961     if (sizeof(T) <= 4) {
0962         switch (amount) {
0963         case  4: return shifted<sanitize<V>( 4 * S)>(v);
0964         case  5: return shifted<sanitize<V>( 5 * S)>(v);
0965         case  6: return shifted<sanitize<V>( 6 * S)>(v);
0966         case  7: return shifted<sanitize<V>( 7 * S)>(v);
0967         case -4: return shifted<sanitize<V>(-4 * S)>(v);
0968         case -5: return shifted<sanitize<V>(-5 * S)>(v);
0969         case -6: return shifted<sanitize<V>(-6 * S)>(v);
0970         case -7: return shifted<sanitize<V>(-7 * S)>(v);
0971         }
0972         if (sizeof(T) <= 2) {
0973             switch (amount) {
0974             case   8: return shifted<sanitize<V>(  8 * S)>(v);
0975             case   9: return shifted<sanitize<V>(  9 * S)>(v);
0976             case  10: return shifted<sanitize<V>( 10 * S)>(v);
0977             case  11: return shifted<sanitize<V>( 11 * S)>(v);
0978             case  12: return shifted<sanitize<V>( 12 * S)>(v);
0979             case  13: return shifted<sanitize<V>( 13 * S)>(v);
0980             case  14: return shifted<sanitize<V>( 14 * S)>(v);
0981             case  15: return shifted<sanitize<V>( 15 * S)>(v);
0982             case  -8: return shifted<sanitize<V>(- 8 * S)>(v);
0983             case  -9: return shifted<sanitize<V>(- 9 * S)>(v);
0984             case -10: return shifted<sanitize<V>(-10 * S)>(v);
0985             case -11: return shifted<sanitize<V>(-11 * S)>(v);
0986             case -12: return shifted<sanitize<V>(-12 * S)>(v);
0987             case -13: return shifted<sanitize<V>(-13 * S)>(v);
0988             case -14: return shifted<sanitize<V>(-14 * S)>(v);
0989             case -15: return shifted<sanitize<V>(-15 * S)>(v);
0990             }
0991             if (sizeof(T) == 1) {
0992                 switch (amount) {
0993                 case  16: return shifted<sanitize<V>( 16)>(v);
0994                 case  17: return shifted<sanitize<V>( 17)>(v);
0995                 case  18: return shifted<sanitize<V>( 18)>(v);
0996                 case  19: return shifted<sanitize<V>( 19)>(v);
0997                 case  20: return shifted<sanitize<V>( 20)>(v);
0998                 case  21: return shifted<sanitize<V>( 21)>(v);
0999                 case  22: return shifted<sanitize<V>( 22)>(v);
1000                 case  23: return shifted<sanitize<V>( 23)>(v);
1001                 case  24: return shifted<sanitize<V>( 24)>(v);
1002                 case  25: return shifted<sanitize<V>( 25)>(v);
1003                 case  26: return shifted<sanitize<V>( 26)>(v);
1004                 case  27: return shifted<sanitize<V>( 27)>(v);
1005                 case  28: return shifted<sanitize<V>( 28)>(v);
1006                 case  29: return shifted<sanitize<V>( 29)>(v);
1007                 case  30: return shifted<sanitize<V>( 30)>(v);
1008                 case  31: return shifted<sanitize<V>( 31)>(v);
1009                 case -16: return shifted<sanitize<V>(-16)>(v);
1010                 case -17: return shifted<sanitize<V>(-17)>(v);
1011                 case -18: return shifted<sanitize<V>(-18)>(v);
1012                 case -19: return shifted<sanitize<V>(-19)>(v);
1013                 case -20: return shifted<sanitize<V>(-20)>(v);
1014                 case -21: return shifted<sanitize<V>(-21)>(v);
1015                 case -22: return shifted<sanitize<V>(-22)>(v);
1016                 case -23: return shifted<sanitize<V>(-23)>(v);
1017                 case -24: return shifted<sanitize<V>(-24)>(v);
1018                 case -25: return shifted<sanitize<V>(-25)>(v);
1019                 case -26: return shifted<sanitize<V>(-26)>(v);
1020                 case -27: return shifted<sanitize<V>(-27)>(v);
1021                 case -28: return shifted<sanitize<V>(-28)>(v);
1022                 case -29: return shifted<sanitize<V>(-29)>(v);
1023                 case -30: return shifted<sanitize<V>(-30)>(v);
1024                 case -31: return shifted<sanitize<V>(-31)>(v);
1025                 }
1026             }
1027         }
1028     }
1029     return avx_cast<V>(_mm256_setzero_ps());
1030 }
1031 
1032 template <typename T, typename V>
1033 static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 16), V> shifted(V v, int amount)
1034 {
1035     using namespace AVX;
1036     switch (amount) {
1037     case  0: return v;
1038     case  1: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(1 * sizeof(T))));
1039     case  2: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(2 * sizeof(T))));
1040     case  3: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(3 * sizeof(T))));
1041     case -1: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(1 * sizeof(T))));
1042     case -2: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(2 * sizeof(T))));
1043     case -3: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(3 * sizeof(T))));
1044     }
1045     if (sizeof(T) <= 2) {
1046         switch (amount) {
1047         case  4: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(4 * sizeof(T))));
1048         case  5: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(5 * sizeof(T))));
1049         case  6: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(6 * sizeof(T))));
1050         case  7: return avx_cast<V>(_mm_srli_si128(avx_cast<__m128i>(v), sanitize<V>(7 * sizeof(T))));
1051         case -4: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(4 * sizeof(T))));
1052         case -5: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(5 * sizeof(T))));
1053         case -6: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(6 * sizeof(T))));
1054         case -7: return avx_cast<V>(_mm_slli_si128(avx_cast<__m128i>(v), sanitize<V>(7 * sizeof(T))));
1055         }
1056     }
1057     return avx_cast<V>(_mm_setzero_ps());
1058 }
1059 // rotated{{{1
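// Rotation across the two 128-bit lanes is stitched together from SSE::alignr_epi8 on the
// swapped half-registers: for N == 4, rotating [a b c d] by 1 yields [b c d a], i.e.
// alignr(hi, lo) for the new lower half and alignr(lo, hi) for the new upper half; a rotation
// by N/2 is simply a swap of the two halves.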
1060 template <typename T, size_t N, typename V>
1061 static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 4), V> rotated(V v,
1062                                                                                int amount)
1063 {
1064     using namespace AVX;
1065     const __m128i vLo = avx_cast<__m128i>(lo128(v));
1066     const __m128i vHi = avx_cast<__m128i>(hi128(v));
1067     switch (static_cast<unsigned int>(amount) % N) {
1068     case 0:
1069         return v;
1070     case 1:
1071         return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vHi, vLo),
1072                                   SSE::alignr_epi8<sizeof(T)>(vLo, vHi)));
1073     case 2:
1074         return Mem::permute128<X1, X0>(v);
1075     case 3:
1076         return avx_cast<V>(concat(SSE::alignr_epi8<sizeof(T)>(vLo, vHi),
1077                                   SSE::alignr_epi8<sizeof(T)>(vHi, vLo)));
1078     }
1079     return avx_cast<V>(_mm256_setzero_ps());
1080 }
1081 
1082 template <typename T, size_t N, typename V>
1083 static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 8), V> rotated(V v,
1084                                                                                int amount)
1085 {
1086     using namespace AVX;
1087     const __m128i vLo = avx_cast<__m128i>(lo128(v));
1088     const __m128i vHi = avx_cast<__m128i>(hi128(v));
1089     switch (static_cast<unsigned int>(amount) % N) {
1090     case 0:
1091         return v;
1092     case 1:
1093         return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
1094                                   SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
1095     case 2:
1096         return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
1097                                   SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
1098     case 3:
1099         return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
1100                                   SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
1101     case 4:
1102         return Mem::permute128<X1, X0>(v);
1103     case 5:
1104         return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
1105                                   SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
1106     case 6:
1107         return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
1108                                   SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
1109     case 7:
1110         return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
1111                                   SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
1112     }
1113     return avx_cast<V>(_mm256_setzero_ps());
1114 }
1115 
1116 #ifdef Vc_IMPL_AVX2
1117 template <typename T, size_t N, typename V>
1118 static Vc_INTRINSIC Vc_CONST enable_if<(sizeof(V) == 32 && N == 16), V> rotated(
1119     V v, int amount)
1120 {
1121     using namespace AVX;
1122     const __m128i vLo = avx_cast<__m128i>(lo128(v));
1123     const __m128i vHi = avx_cast<__m128i>(hi128(v));
1124     switch (static_cast<unsigned int>(amount) % N) {
1125     case 0:
1126         return v;
1127     case 1:
1128         return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo),
1129                                   SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi)));
1130     case 2:
1131         return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo),
1132                                   SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi)));
1133     case 3:
1134         return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo),
1135                                   SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi)));
1136     case 4:
1137         return Mem::permute4x64<X1, X2, X3, X0>(v);
1138     case 5:
1139         return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo),
1140                                   SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi)));
1141     case 6:
1142         return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo),
1143                                   SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi)));
1144     case 7:
1145         return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo),
1146                                   SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi)));
1147     case 8:
1148         return Mem::permute128<X1, X0>(v);
1149     case 9:
1150         return avx_cast<V>(concat(SSE::alignr_epi8<1 * sizeof(T)>(vLo, vHi),
1151                                   SSE::alignr_epi8<1 * sizeof(T)>(vHi, vLo)));
1152     case 10:
1153         return avx_cast<V>(concat(SSE::alignr_epi8<2 * sizeof(T)>(vLo, vHi),
1154                                   SSE::alignr_epi8<2 * sizeof(T)>(vHi, vLo)));
1155     case 11:
1156         return avx_cast<V>(concat(SSE::alignr_epi8<3 * sizeof(T)>(vLo, vHi),
1157                                   SSE::alignr_epi8<3 * sizeof(T)>(vHi, vLo)));
1158     case 12:
1159         return Mem::permute4x64<X3, X0, X1, X2>(v);
1160     case 13:
1161         return avx_cast<V>(concat(SSE::alignr_epi8<5 * sizeof(T)>(vLo, vHi),
1162                                   SSE::alignr_epi8<5 * sizeof(T)>(vHi, vLo)));
1163     case 14:
1164         return avx_cast<V>(concat(SSE::alignr_epi8<6 * sizeof(T)>(vLo, vHi),
1165                                   SSE::alignr_epi8<6 * sizeof(T)>(vHi, vLo)));
1166     case 15:
1167         return avx_cast<V>(concat(SSE::alignr_epi8<7 * sizeof(T)>(vLo, vHi),
1168                                   SSE::alignr_epi8<7 * sizeof(T)>(vHi, vLo)));
1169     }
1170     return avx_cast<V>(_mm256_setzero_ps());
1171 }
1172 #endif  // Vc_IMPL_AVX2
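// The AVX2 variant for 16 2-byte entries follows the same scheme; only the multiples
// of four differ: amounts 4 and 12 become single cross-lane 64-bit permutes
// (Mem::permute4x64) and amount 8 remains the 128-bit half swap. For example,
//     rotated(v, 9) = concat(alignr<1 * 2>(vLo, vHi), alignr<1 * 2>(vHi, vLo)),
// i.e. swap the halves first, then rotate by one entry.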
1173 
1174 // testc{{{1
1175 Vc_INTRINSIC Vc_CONST int testc(__m128  a, __m128  b) { return _mm_testc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
1176 Vc_INTRINSIC Vc_CONST int testc(__m256  a, __m256  b) { return _mm256_testc_ps(a, b); }
1177 Vc_INTRINSIC Vc_CONST int testc(__m256d a, __m256d b) { return _mm256_testc_pd(a, b); }
1178 Vc_INTRINSIC Vc_CONST int testc(__m256i a, __m256i b) { return _mm256_testc_si256(a, b); }
1179 
1180 // testz{{{1
1181 Vc_INTRINSIC Vc_CONST int testz(__m128  a, __m128  b) { return _mm_testz_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
1182 Vc_INTRINSIC Vc_CONST int testz(__m256  a, __m256  b) { return _mm256_testz_ps(a, b); }
1183 Vc_INTRINSIC Vc_CONST int testz(__m256d a, __m256d b) { return _mm256_testz_pd(a, b); }
1184 Vc_INTRINSIC Vc_CONST int testz(__m256i a, __m256i b) { return _mm256_testz_si256(a, b); }
1185 
1186 // testnzc{{{1
1187 Vc_INTRINSIC Vc_CONST int testnzc(__m128 a, __m128 b) { return _mm_testnzc_si128(_mm_castps_si128(a), _mm_castps_si128(b)); }
1188 Vc_INTRINSIC Vc_CONST int testnzc(__m256  a, __m256  b) { return _mm256_testnzc_ps(a, b); }
1189 Vc_INTRINSIC Vc_CONST int testnzc(__m256d a, __m256d b) { return _mm256_testnzc_pd(a, b); }
1190 Vc_INTRINSIC Vc_CONST int testnzc(__m256i a, __m256i b) { return _mm256_testnzc_si256(a, b); }
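// The test{c,z,nzc} wrappers map onto the PTEST/VTESTPS/VTESTPD instructions:
//   testz(a, b)   -> 1 iff (a & b) is all-zero  (ZF)
//   testc(a, b)   -> 1 iff (~a & b) is all-zero (CF), i.e. a covers every bit of b
//   testnzc(a, b) -> 1 iff neither of the above holds
// The __m256/__m256d overloads only inspect the sign bit of each element, while the
// __m128 overload casts to integers and tests all 128 bits. In mask code this is
// typically used along the lines of (illustrative sketch, names assumed):
//   bool isEmpty = testz(k, k);         // no element selected
//   bool isFull  = testc(k, allOnes);   // every element selected
//   bool isMix   = testnzc(k, allOnes); // some, but not all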
1191 
1192 // movemask{{{1
1193 Vc_INTRINSIC Vc_CONST int movemask(__m256i a) { return AVX::movemask_epi8(a); }
1194 Vc_INTRINSIC Vc_CONST int movemask(__m128i a) { return _mm_movemask_epi8(a); }
1195 Vc_INTRINSIC Vc_CONST int movemask(__m256d a) { return _mm256_movemask_pd(a); }
1196 Vc_INTRINSIC Vc_CONST int movemask(__m128d a) { return _mm_movemask_pd(a); }
1197 Vc_INTRINSIC Vc_CONST int movemask(__m256  a) { return _mm256_movemask_ps(a); }
1198 Vc_INTRINSIC Vc_CONST int movemask(__m128  a) { return _mm_movemask_ps(a); }
1199 
1200 // mask_store{{{1
1201 template <size_t N, typename Flags>
1202 Vc_INTRINSIC void mask_store(__m256i k, bool *mem, Flags)
1203 {
1204     static_assert(
1205         N == 4 || N == 8 || N == 16,
1206         "mask_store(__m256i, bool *) is only implemented for 4, 8, and 16 entries");
1207     switch (N) {
1208     case 4:
1209         *aliasing_cast<int32_t>(mem) = (_mm_movemask_epi8(AVX::lo128(k)) |
1210                                         (_mm_movemask_epi8(AVX::hi128(k)) << 16)) &
1211                                        0x01010101;
1212         break;
1213     case 8: {
1214         const auto k2 = _mm_srli_epi16(_mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)), 15);
1215         const auto k3 = _mm_packs_epi16(k2, _mm_setzero_si128());
1216 #ifdef __x86_64__
1217         *aliasing_cast<int64_t>(mem) = _mm_cvtsi128_si64(k3);
1218 #else
1219         *aliasing_cast<int32_t>(mem) = _mm_cvtsi128_si32(k3);
1220         *aliasing_cast<int32_t>(mem + 4) = _mm_extract_epi32(k3, 1);
1221 #endif
1222     } break;
1223     case 16: {
1224         const auto bools = Detail::and_(_mm_set1_epi8(1),
1225                                         _mm_packs_epi16(AVX::lo128(k), AVX::hi128(k)));
1226         if (Flags::IsAligned) {
1227             _mm_store_si128(reinterpret_cast<__m128i *>(mem), bools);
1228         } else {
1229             _mm_storeu_si128(reinterpret_cast<__m128i *>(mem), bools);
1230         }
1231     } break;
1232     default:
1233         Vc_UNREACHABLE();
1234     }
1235 }
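// mask_store writes a mask register (each element all-ones or all-zero) out as N
// bool bytes holding exactly 0 or 1. E.g. for N == 8, a mask selecting elements
// {0, 2, 3} is stored as the bytes {1, 0, 1, 1, 0, 0, 0, 0}. Only the N == 16 path
// consults Flags::IsAligned; the narrower cases use plain scalar stores.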
1236 
1237 // mask_load{{{1
1238 template <typename R, size_t N, typename Flags>
1239 Vc_INTRINSIC R mask_load(const bool *mem, Flags,
1240                          enable_if<std::is_same<R, __m128>::value> = nullarg)
1241 {
1242     static_assert(N == 4 || N == 8,
1243                   "mask_load<__m128>(const bool *) is only implemented for 4 and 8 entries");
1244     switch (N) {
1245     case 4: {
1246         __m128i k = _mm_cvtsi32_si128(*aliasing_cast<int32_t>(mem));
1247         k = _mm_unpacklo_epi8(k, k);
1248         k = _mm_unpacklo_epi16(k, k);
1249         k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
1250         return AVX::avx_cast<__m128>(k);
1251     }
1252     case 8: {
1253 #ifdef __x86_64__
1254         __m128i k = _mm_cvtsi64_si128(*aliasing_cast<int64_t>(mem));
1255 #else
1256         __m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast<double>(mem)));
1257 #endif
1258         return AVX::avx_cast<__m128>(
1259             _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128()));
1260     }
1261     default:
1262         Vc_UNREACHABLE();
1263     }
1264 }
1265 
1266 template <typename R, size_t N, typename Flags>
1267 Vc_INTRINSIC R mask_load(const bool *mem, Flags,
1268                          enable_if<std::is_same<R, __m256>::value> = nullarg)
1269 {
1270     static_assert(
1271         N == 4 || N == 8 || N == 16,
1272         "mask_load<__m256>(const bool *) is only implemented for 4, 8, and 16 entries");
1273     switch (N) {
1274     case 4: {
1275         __m128i k = AVX::avx_cast<__m128i>(_mm_and_ps(
1276             _mm_set1_ps(*aliasing_cast<float>(mem)),
1277             AVX::avx_cast<__m128>(_mm_setr_epi32(0x1, 0x100, 0x10000, 0x1000000))));
1278         k = _mm_cmpgt_epi32(k, _mm_setzero_si128());
1279         return AVX::avx_cast<__m256>(
1280             AVX::concat(_mm_unpacklo_epi32(k, k), _mm_unpackhi_epi32(k, k)));
1281     }
1282     case 8: {
1283 #ifdef __x86_64__
1284         __m128i k = _mm_cvtsi64_si128(*aliasing_cast<int64_t>(mem));
1285 #else
1286         __m128i k = _mm_castpd_si128(_mm_load_sd(aliasing_cast<double>(mem)));
1287 #endif
1288         k = _mm_cmpgt_epi16(_mm_unpacklo_epi8(k, k), _mm_setzero_si128());
1289         return AVX::avx_cast<__m256>(
1290             AVX::concat(_mm_unpacklo_epi16(k, k), _mm_unpackhi_epi16(k, k)));
1291     }
1292     case 16: {
1293         const auto k128 = _mm_cmpgt_epi8(
1294             Flags::IsAligned ? _mm_load_si128(reinterpret_cast<const __m128i *>(mem))
1295                              : _mm_loadu_si128(reinterpret_cast<const __m128i *>(mem)),
1296             _mm_setzero_si128());
1297         return AVX::avx_cast<__m256>(
1298             AVX::concat(_mm_unpacklo_epi8(k128, k128), _mm_unpackhi_epi8(k128, k128)));
1299     }
1300     default:
1301         Vc_UNREACHABLE();
1302         return R();
1303     }
1304 }
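// mask_load is the inverse of mask_store above: it reads N bool bytes and widens
// them back to full-width mask elements by comparing against zero, so a true byte
// becomes an all-ones element. The __x86_64__ split only changes how the 8 source
// bytes reach the XMM register (cvtsi64 versus a 64-bit _mm_load_sd).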
1305 
1306 // mask_to_int{{{1
1307 template <size_t Size>
1308 Vc_INTRINSIC_L Vc_CONST_L int mask_to_int(__m256i x) Vc_INTRINSIC_R Vc_CONST_R;
1309 template <> Vc_INTRINSIC Vc_CONST int mask_to_int<4>(__m256i k)
1310 {
1311     return movemask(AVX::avx_cast<__m256d>(k));
1312 }
1313 template <> Vc_INTRINSIC Vc_CONST int mask_to_int<8>(__m256i k)
1314 {
1315     return movemask(AVX::avx_cast<__m256>(k));
1316 }
1317 #ifdef Vc_IMPL_BMI2
1318 template <> Vc_INTRINSIC Vc_CONST int mask_to_int<16>(__m256i k)
1319 {
1320     return _pext_u32(movemask(k), 0x55555555u);
1321 }
1322 #endif
1323 template <> Vc_INTRINSIC Vc_CONST int mask_to_int<32>(__m256i k)
1324 {
1325     return movemask(k);
1326 }
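// mask_to_int compresses a mask to one bit per element: the 4- and 8-element cases
// use the pd/ps movemask (one sign bit per 64-/32-bit element) and the 32-element
// case uses the byte movemask directly. For 16 2-byte elements the byte movemask
// yields two identical bits per element, so the BMI2 specialization keeps every
// other bit with a single pext over 0x55555555.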
1327 
1328 //InterleaveImpl{{{1
1329 template<typename V> struct InterleaveImpl<V, 16, 32> {
1330     template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
1331             const typename V::AsArg v0, // a0 a1 a2 a3 a4 a5 a6 a7 | a8 a9 ...
1332             const typename V::AsArg v1) // b0 b1 b2 b3 b4 b5 b6 b7 | b8 b9 ...
1333     {
1334         const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data()); // a0 b0 a1 b1 a2 b2 a3 b3 | a8 b8 a9 ...
1335         const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data()); // a4 b4 a5 ...
1336         using namespace AVX;
1337         *aliasing_cast<uint32_t>(&data[i[ 0]]) = _mm_cvtsi128_si32(lo128(tmp0));
1338         *aliasing_cast<uint32_t>(&data[i[ 1]]) = _mm_extract_epi32(lo128(tmp0), 1);
1339         *aliasing_cast<uint32_t>(&data[i[ 2]]) = _mm_extract_epi32(lo128(tmp0), 2);
1340         *aliasing_cast<uint32_t>(&data[i[ 3]]) = _mm_extract_epi32(lo128(tmp0), 3);
1341         *aliasing_cast<uint32_t>(&data[i[ 4]]) = _mm_cvtsi128_si32(lo128(tmp1));
1342         *aliasing_cast<uint32_t>(&data[i[ 5]]) = _mm_extract_epi32(lo128(tmp1), 1);
1343         *aliasing_cast<uint32_t>(&data[i[ 6]]) = _mm_extract_epi32(lo128(tmp1), 2);
1344         *aliasing_cast<uint32_t>(&data[i[ 7]]) = _mm_extract_epi32(lo128(tmp1), 3);
1345         *aliasing_cast<uint32_t>(&data[i[ 8]]) = _mm_cvtsi128_si32(hi128(tmp0));
1346         *aliasing_cast<uint32_t>(&data[i[ 9]]) = _mm_extract_epi32(hi128(tmp0), 1);
1347         *aliasing_cast<uint32_t>(&data[i[10]]) = _mm_extract_epi32(hi128(tmp0), 2);
1348         *aliasing_cast<uint32_t>(&data[i[11]]) = _mm_extract_epi32(hi128(tmp0), 3);
1349         *aliasing_cast<uint32_t>(&data[i[12]]) = _mm_cvtsi128_si32(hi128(tmp1));
1350         *aliasing_cast<uint32_t>(&data[i[13]]) = _mm_extract_epi32(hi128(tmp1), 1);
1351         *aliasing_cast<uint32_t>(&data[i[14]]) = _mm_extract_epi32(hi128(tmp1), 2);
1352         *aliasing_cast<uint32_t>(&data[i[15]]) = _mm_extract_epi32(hi128(tmp1), 3);
1353     }/*}}}*/
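    // InterleaveImpl<V, 16, 32> handles vectors of 16 2-byte entries. The 2-vector
    // interleave above emits one 32-bit store per index: with v0 = a0 ... a15 and
    // v1 = b0 ... b15, the memory at data[i[n]] receives the pair { a_n, b_n }. The
    // unpacklo/unpackhi pair produces these pairs lane-wise, which is why the
    // extracts alternate between lo128/hi128 and between tmp0/tmp1.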
1354     static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/
1355             const typename V::AsArg v0, const typename V::AsArg v1)
1356     {
1357         const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v1.data()); // a0 b0 a1 b1 a2 b2 a3 b3 | a8 b8 a9 ...
1358         const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v1.data()); // a4 b4 a5 ...
1359         V(Mem::shuffle128<X0, Y0>(tmp0, tmp1)).store(&data[i[0]], Vc::Unaligned);
1360         V(Mem::shuffle128<X1, Y1>(tmp0, tmp1)).store(&data[i[8]], Vc::Unaligned);
1361     }/*}}}*/
1362     template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
1363             const typename V::AsArg v0, const typename V::AsArg v1, const typename V::AsArg v2)
1364     {
1365         interleave(data, i, v0, v1);
1366         v2.scatter(data + 2, i);
1367     }/*}}}*/
1368     template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
1369             const typename V::AsArg v0, const typename V::AsArg v1,
1370             const typename V::AsArg v2, const typename V::AsArg v3)
1371     {
1372         const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data()); // a0 c0 a1 c1 a2 c2 a3 c3 | a8 c8 a9 c9 ...
1373         const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data()); // a4 c4 a5 c5 a6 c6 a7 c7 | a12 c12 ...
1374         const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data()); // b0 d0 b1 d1 b2 d2 b3 d3 | b8 d8 b9 d9 ...
1375         const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data()); // b4 d4 b5 ...
1376 
1377         const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 b0 c0 d0 a1 b1 c1 d1 | a8 b8 c8 d8 a9 b9 ...
1378         const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // [abcd]2 [abcd]3 | [abcd]10 [abcd]11
1379         const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // [abcd]4 [abcd]5 | [abcd]12 [abcd]13
1380         const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // [abcd]6 [abcd]7 | [abcd]14 [abcd]15
1381 
1382         using namespace AVX;
1383         auto &&store = [&](__m256i x, int offset) {
1384             _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 0]]), lo128(x));
1385             _mm_storel_epi64(reinterpret_cast<__m128i *>(&data[i[offset + 8]]), hi128(x));
1386             _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 1]]), avx_cast<__m128>(x));
1387             _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[offset + 9]]), avx_cast<__m128>(hi128(x)));
1388         };
1389         store(tmp4, 0);
1390         store(tmp5, 2);
1391         store(tmp6, 4);
1392         store(tmp7, 6);
1393     }/*}}}*/
1394     static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<4> &i,/*{{{*/
1395             const typename V::AsArg v0, const typename V::AsArg v1,
1396             const typename V::AsArg v2, const typename V::AsArg v3)
1397     {
1398         const __m256i tmp0 = AVX::unpacklo_epi16(v0.data(), v2.data()); // a0 c0 a1 c1 a2 c2 a3 c3 | a8 c8 a9 c9 ...
1399         const __m256i tmp1 = AVX::unpackhi_epi16(v0.data(), v2.data()); // a4 c4 a5 c5 a6 c6 a7 c7 | a12 c12 ...
1400         const __m256i tmp2 = AVX::unpacklo_epi16(v1.data(), v3.data()); // b0 d0 b1 d1 b2 d2 b3 d3 | b8 d8 b9 d9 ...
1401         const __m256i tmp3 = AVX::unpackhi_epi16(v1.data(), v3.data()); // b4 d4 b5 ...
1402 
1403         const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 b0 c0 d0 a1 b1 c1 d1 | a8 b8 c8 d8 a9 b9 ...
1404         const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // [abcd]2 [abcd]3 | [abcd]10 [abcd]11
1405         const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // [abcd]4 [abcd]5 | [abcd]12 [abcd]13
1406         const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // [abcd]6 [abcd]7 | [abcd]14 [abcd]15
1407 
1408         V(Mem::shuffle128<X0, Y0>(tmp4, tmp5)).store(&data[i[0]], ::Vc::Unaligned);
1409         V(Mem::shuffle128<X0, Y0>(tmp6, tmp7)).store(&data[i[4]], ::Vc::Unaligned);
1410         V(Mem::shuffle128<X1, Y1>(tmp4, tmp5)).store(&data[i[8]], ::Vc::Unaligned);
1411         V(Mem::shuffle128<X1, Y1>(tmp6, tmp7)).store(&data[i[12]], ::Vc::Unaligned);
1412     }/*}}}*/
1413     template <typename I>  // interleave 5 args {{{2
1414     static inline void interleave(typename V::EntryType *const data, const I &i,
1415                                   const typename V::AsArg v0, const typename V::AsArg v1,
1416                                   const typename V::AsArg v2, const typename V::AsArg v3,
1417                                   const typename V::AsArg v4)
1418     {
1419         interleave(data, i, v0, v1, v2, v3);
1420         v4.scatter(data + 4, i);
1421     }
1422     template <typename I>  // interleave 6 args {{{2
1423     static inline void interleave(typename V::EntryType *const data, const I &i,
1424                                   const typename V::AsArg v0, const typename V::AsArg v1,
1425                                   const typename V::AsArg v2, const typename V::AsArg v3,
1426                                   const typename V::AsArg v4, const typename V::AsArg v5)
1427     {
1428         interleave(data, i, v0, v1, v2, v3);
1429         interleave(data + 4, i, v4, v5);
1430     }
1431     template <typename I>  // interleave 7 args {{{2
1432     static inline void interleave(typename V::EntryType *const data, const I &i,
1433                                   const typename V::AsArg v0, const typename V::AsArg v1,
1434                                   const typename V::AsArg v2, const typename V::AsArg v3,
1435                                   const typename V::AsArg v4, const typename V::AsArg v5,
1436                                   const typename V::AsArg v6)
1437     {
1438         interleave(data, i, v0, v1, v2, v3);
1439         interleave(data + 4, i, v4, v5, v6);
1440     }
1441     template <typename I>  // interleave 8 args {{{2
1442     static inline void interleave(typename V::EntryType *const data, const I &i,
1443                                   const typename V::AsArg v0, const typename V::AsArg v1,
1444                                   const typename V::AsArg v2, const typename V::AsArg v3,
1445                                   const typename V::AsArg v4, const typename V::AsArg v5,
1446                                   const typename V::AsArg v6, const typename V::AsArg v7)
1447     {
1448         interleave(data, i, v0, v1, v2, v3);
1449         interleave(data + 4, i, v4, v5, v6, v7);
1450     }
1451     //}}}2
1452     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
1453             const I &i, V &v0, V &v1)
1454     {
1455         const __m256i tmp4 =  // a0 b0 a1 b1 a2 b2 a3 b3 | a8 b8 a9 b9 a10 b10 a11 b11
1456             _mm256_setr_epi32(
1457                 *aliasing_cast<int>(&data[i[0]]), *aliasing_cast<int>(&data[i[1]]),
1458                 *aliasing_cast<int>(&data[i[2]]), *aliasing_cast<int>(&data[i[3]]),
1459                 *aliasing_cast<int>(&data[i[8]]), *aliasing_cast<int>(&data[i[9]]),
1460                 *aliasing_cast<int>(&data[i[10]]), *aliasing_cast<int>(&data[i[11]]));
1461         const __m256i tmp5 =  // a4 b4 a5 b5 a6 b6 a7 b7 | a12 b12 a13 b13 a14 b14 a15 b15
1462             _mm256_setr_epi32(
1463                 *aliasing_cast<int>(&data[i[4]]), *aliasing_cast<int>(&data[i[5]]),
1464                 *aliasing_cast<int>(&data[i[6]]), *aliasing_cast<int>(&data[i[7]]),
1465                 *aliasing_cast<int>(&data[i[12]]), *aliasing_cast<int>(&data[i[13]]),
1466                 *aliasing_cast<int>(&data[i[14]]), *aliasing_cast<int>(&data[i[15]]));
1467 
1468         const __m256i tmp2 = AVX::unpacklo_epi16(tmp4, tmp5);  // a0 a4 b0 b4 a1 a5 b1 b5 | a8 a12 b8 b12 a9 a13 b9 b13
1469         const __m256i tmp3 = AVX::unpackhi_epi16(tmp4, tmp5);  // a2 a6 b2 b6 a3 a7 b3 b7 | a10 a14 b10 b14 a11 a15 b11 b15
1470 
1471         const __m256i tmp0 = AVX::unpacklo_epi16(tmp2, tmp3);  // a0 a2 a4 a6 b0 b2 b4 b6 | a8 a10 a12 a14 b8 ...
1472         const __m256i tmp1 = AVX::unpackhi_epi16(tmp2, tmp3);  // a1 a3 a5 a7 b1 b3 b5 b7 | a9 a11 a13 a15 b9 ...
1473 
1474         v0.data() = AVX::unpacklo_epi16(tmp0, tmp1); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 a9 ...
1475         v1.data() = AVX::unpackhi_epi16(tmp0, tmp1); // b0 b1 b2 b3 b4 b5 b6 b7 | b8 b9 ...
1476     }/*}}}*/
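    // The matching 2-vector deinterleave gathers the 16 (a_n, b_n) pairs as 32-bit
    // ints (indices 0-3 and 8-11 into tmp4, 4-7 and 12-15 into tmp5, so the 128-bit
    // lane boundaries line up) and then untangles them with three unpacklo/unpackhi
    // rounds until v0 holds all a's and v1 all b's, as traced in the comments above.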
1477     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
1478             const I &i, V &v0, V &v1, V &v2)
1479     {
1480         using namespace AVX;
1481         const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(
1482             *aliasing_cast<double>(&data[i[0]]), *aliasing_cast<double>(&data[i[1]]),
1483             *aliasing_cast<double>(&data[i[8]]), *aliasing_cast<double>(&data[i[9]])));
1484         const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(
1485             *aliasing_cast<double>(&data[i[2]]), *aliasing_cast<double>(&data[i[3]]),
1486             *aliasing_cast<double>(&data[i[10]]), *aliasing_cast<double>(&data[i[11]])));
1487         const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(
1488             *aliasing_cast<double>(&data[i[4]]), *aliasing_cast<double>(&data[i[5]]),
1489             *aliasing_cast<double>(&data[i[12]]), *aliasing_cast<double>(&data[i[13]])));
1490         const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(
1491             *aliasing_cast<double>(&data[i[6]]), *aliasing_cast<double>(&data[i[7]]),
1492             *aliasing_cast<double>(&data[i[14]]), *aliasing_cast<double>(&data[i[15]])));
1493         const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 a4 b0 b4 c0 c4 XX XX | a8 a12 b8 ...
1494         const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // a1 a5 ...
1495         const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // a2 a6 ...
1496         const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // a3 a7 ...
1497 
1498         const __m256i tmp8  = AVX::unpacklo_epi16(tmp4, tmp6); // a0 a2 a4 a6 b0 ...
1499         const __m256i tmp9  = AVX::unpackhi_epi16(tmp4, tmp6); // c0 c2 c4 c6 XX ...
1500         const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7); // a1 a3 a5 a7 b1 ...
1501         const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7); // c1 c3 c5 c7 XX ...
1502 
1503         v0.data() = AVX::unpacklo_epi16(tmp8, tmp10); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 ...
1504         v1.data() = AVX::unpackhi_epi16(tmp8, tmp10);
1505         v2.data() = AVX::unpacklo_epi16(tmp9, tmp11);
1506     }/*}}}*/
1507     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
1508             const I &i, V &v0, V &v1, V &v2, V &v3)
1509     {
1510         using namespace AVX;
1511         const __m256i tmp0 = avx_cast<__m256i>(_mm256_setr_pd(
1512             *aliasing_cast<double>(&data[i[0]]), *aliasing_cast<double>(&data[i[1]]),
1513             *aliasing_cast<double>(&data[i[8]]), *aliasing_cast<double>(&data[i[9]])));
1514         const __m256i tmp1 = avx_cast<__m256i>(_mm256_setr_pd(
1515             *aliasing_cast<double>(&data[i[2]]), *aliasing_cast<double>(&data[i[3]]),
1516             *aliasing_cast<double>(&data[i[10]]), *aliasing_cast<double>(&data[i[11]])));
1517         const __m256i tmp2 = avx_cast<__m256i>(_mm256_setr_pd(
1518             *aliasing_cast<double>(&data[i[4]]), *aliasing_cast<double>(&data[i[5]]),
1519             *aliasing_cast<double>(&data[i[12]]), *aliasing_cast<double>(&data[i[13]])));
1520         const __m256i tmp3 = avx_cast<__m256i>(_mm256_setr_pd(
1521             *aliasing_cast<double>(&data[i[6]]), *aliasing_cast<double>(&data[i[7]]),
1522             *aliasing_cast<double>(&data[i[14]]), *aliasing_cast<double>(&data[i[15]])));
1523         const __m256i tmp4 = AVX::unpacklo_epi16(tmp0, tmp2); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 a12 b8 ...
1524         const __m256i tmp5 = AVX::unpackhi_epi16(tmp0, tmp2); // a1 a5 ...
1525         const __m256i tmp6 = AVX::unpacklo_epi16(tmp1, tmp3); // a2 a6 ...
1526         const __m256i tmp7 = AVX::unpackhi_epi16(tmp1, tmp3); // a3 a7 ...
1527 
1528         const __m256i tmp8  = AVX::unpacklo_epi16(tmp4, tmp6); // a0 a2 a4 a6 b0 ...
1529         const __m256i tmp9  = AVX::unpackhi_epi16(tmp4, tmp6); // c0 c2 c4 c6 d0 ...
1530         const __m256i tmp10 = AVX::unpacklo_epi16(tmp5, tmp7); // a1 a3 a5 a7 b1 ...
1531         const __m256i tmp11 = AVX::unpackhi_epi16(tmp5, tmp7); // c1 c3 c5 c7 d1 ...
1532 
1533         v0.data() = AVX::unpacklo_epi16(tmp8, tmp10); // a0 a1 a2 a3 a4 a5 a6 a7 | a8 ...
1534         v1.data() = AVX::unpackhi_epi16(tmp8, tmp10);
1535         v2.data() = AVX::unpacklo_epi16(tmp9, tmp11);
1536         v3.data() = AVX::unpackhi_epi16(tmp9, tmp11);
1537     }/*}}}*/
1538     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
1539             const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
1540     {
1541         using namespace AVX;
1542         const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
1543                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
1544         const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
1545                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
1546         const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
1547                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
1548         const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
1549                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
1550         const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
1551                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
1552         const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
1553                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
1554         const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
1555                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
1556         const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
1557                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
1558 
1559         const __m256i tmp2  = AVX::unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ...
1560         const __m256i tmp4  = AVX::unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
1561         const __m256i tmp3  = AVX::unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
1562         const __m256i tmp5  = AVX::unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
1563         const __m256i tmp10 = AVX::unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
1564         const __m256i tmp11 = AVX::unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
1565         const __m256i tmp12 = AVX::unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
1566         const __m256i tmp13 = AVX::unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
1567 
1568         const __m256i tmp0  = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ...
1569         const __m256i tmp1  = AVX::unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
1570         const __m256i tmp6  = AVX::unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
1571         const __m256i tmp7  = AVX::unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
1572         const __m256i tmp8  = AVX::unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
1573         const __m256i tmp9  = AVX::unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
1574 
1575         v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1576         v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1577         v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
1578         v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
1579         v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
1580     }/*}}}*/
1581     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
1582             const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
1583     {
1584         using namespace AVX;
1585         const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
1586                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
1587         const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
1588                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
1589         const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
1590                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
1591         const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
1592                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
1593         const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
1594                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
1595         const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
1596                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
1597         const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
1598                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
1599         const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
1600                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
1601 
1602         const __m256i tmp2  = AVX::unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ...
1603         const __m256i tmp4  = AVX::unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
1604         const __m256i tmp3  = AVX::unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
1605         const __m256i tmp5  = AVX::unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
1606         const __m256i tmp10 = AVX::unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
1607         const __m256i tmp11 = AVX::unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
1608         const __m256i tmp12 = AVX::unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
1609         const __m256i tmp13 = AVX::unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
1610 
1611         const __m256i tmp0  = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ...
1612         const __m256i tmp1  = AVX::unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
1613         const __m256i tmp6  = AVX::unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
1614         const __m256i tmp7  = AVX::unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
1615         const __m256i tmp8  = AVX::unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
1616         const __m256i tmp9  = AVX::unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
1617 
1618         v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1619         v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1620         v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
1621         v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
1622         v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
1623         v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
1624     }/*}}}*/
1625     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
1626             const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
1627     {
1628         using namespace AVX;
1629         const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
1630                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
1631         const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
1632                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
1633         const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
1634                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
1635         const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
1636                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
1637         const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
1638                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
1639         const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
1640                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
1641         const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
1642                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
1643         const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
1644                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
1645 
1646         const __m256i tmp2  = AVX::unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ...
1647         const __m256i tmp4  = AVX::unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
1648         const __m256i tmp3  = AVX::unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
1649         const __m256i tmp5  = AVX::unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
1650         const __m256i tmp10 = AVX::unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
1651         const __m256i tmp11 = AVX::unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
1652         const __m256i tmp12 = AVX::unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
1653         const __m256i tmp13 = AVX::unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
1654 
1655         const __m256i tmp0  = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ...
1656         const __m256i tmp1  = AVX::unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
1657         const __m256i tmp6  = AVX::unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
1658         const __m256i tmp7  = AVX::unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
1659         const __m256i tmp8  = AVX::unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
1660         const __m256i tmp9  = AVX::unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
1661         const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6
1662         const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7
1663 
1664         v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1665         v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1666         v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
1667         v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
1668         v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
1669         v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
1670         v6.data() = AVX::unpacklo_epi16(tmp14, tmp15);
1671     }/*}}}*/
1672     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
1673             const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
1674     {
1675         using namespace AVX;
1676         const __m256i a = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[0]])),
1677                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[8]])));
1678         const __m256i b = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[1]])),
1679                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[9]])));
1680         const __m256i c = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[2]])),
1681                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[10]])));
1682         const __m256i d = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[3]])),
1683                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[11]])));
1684         const __m256i e = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[4]])),
1685                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[12]])));
1686         const __m256i f = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[5]])),
1687                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[13]])));
1688         const __m256i g = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[6]])),
1689                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[14]])));
1690         const __m256i h = concat(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[7]])),
1691                                  _mm_loadu_si128(reinterpret_cast<const __m128i *>(&data[i[15]])));
1692 
1693         const __m256i tmp2  = AVX::unpacklo_epi16(a, e); // a0 a4 b0 b4 c0 c4 d0 d4 | a8 ...
1694         const __m256i tmp4  = AVX::unpacklo_epi16(b, f); // a1 a5 b1 b5 c1 c5 d1 d5
1695         const __m256i tmp3  = AVX::unpacklo_epi16(c, g); // a2 a6 b2 b6 c2 c6 d2 d6
1696         const __m256i tmp5  = AVX::unpacklo_epi16(d, h); // a3 a7 b3 b7 c3 c7 d3 d7
1697         const __m256i tmp10 = AVX::unpackhi_epi16(a, e); // e0 e4 f0 f4 g0 g4 h0 h4
1698         const __m256i tmp11 = AVX::unpackhi_epi16(c, g); // e2 e6 f2 f6 g2 g6 h2 h6
1699         const __m256i tmp12 = AVX::unpackhi_epi16(b, f); // e1 e5 f1 f5 g1 g5 h1 h5
1700         const __m256i tmp13 = AVX::unpackhi_epi16(d, h); // e3 e7 f3 f7 g3 g7 h3 h7
1701 
1702         const __m256i tmp0  = AVX::unpacklo_epi16(tmp2, tmp3); // a0 a2 a4 a6 b0 b2 b4 b6 | a8 ...
1703         const __m256i tmp1  = AVX::unpacklo_epi16(tmp4, tmp5); // a1 a3 a5 a7 b1 b3 b5 b7
1704         const __m256i tmp6  = AVX::unpackhi_epi16(tmp2, tmp3); // c0 c2 c4 c6 d0 d2 d4 d6
1705         const __m256i tmp7  = AVX::unpackhi_epi16(tmp4, tmp5); // c1 c3 c5 c7 d1 d3 d5 d7
1706         const __m256i tmp8  = AVX::unpacklo_epi16(tmp10, tmp11); // e0 e2 e4 e6 f0 f2 f4 f6
1707         const __m256i tmp9  = AVX::unpacklo_epi16(tmp12, tmp13); // e1 e3 e5 e7 f1 f3 f5 f7
1708         const __m256i tmp14 = AVX::unpackhi_epi16(tmp10, tmp11); // g0 g2 g4 g6 h0 h2 h4 h6
1709         const __m256i tmp15 = AVX::unpackhi_epi16(tmp12, tmp13); // g1 g3 g5 g7 h1 h3 h5 h7
1710 
1711         v0.data() = AVX::unpacklo_epi16(tmp0, tmp1);
1712         v1.data() = AVX::unpackhi_epi16(tmp0, tmp1);
1713         v2.data() = AVX::unpacklo_epi16(tmp6, tmp7);
1714         v3.data() = AVX::unpackhi_epi16(tmp6, tmp7);
1715         v4.data() = AVX::unpacklo_epi16(tmp8, tmp9);
1716         v5.data() = AVX::unpackhi_epi16(tmp8, tmp9);
1717         v6.data() = AVX::unpacklo_epi16(tmp14, tmp15);
1718         v7.data() = AVX::unpackhi_epi16(tmp14, tmp15);
1719     }/*}}}*/
1720 };
1721 template<typename V> struct InterleaveImpl<V, 8, 32> {
1722     static_assert(sizeof(typename V::value_type) == 4, "");
1723     template<typename I> static inline void interleave(typename V::EntryType *const data, const I &i,/*{{{*/
1724             const typename V::AsArg v0, const typename V::AsArg v1)
1725     {
1726         using namespace AVX;
1727         // [0a 1a 0b 1b 0e 1e 0f 1f]:
1728         const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
1729         // [0c 1c 0d 1d 0g 1g 0h 1h]:
1730         const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
1731         _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[0]]), lo128(tmp0));
1732         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[1]]), lo128(tmp0));
1733         _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[2]]), lo128(tmp1));
1734         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[3]]), lo128(tmp1));
1735         _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[4]]), hi128(tmp0));
1736         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[5]]), hi128(tmp0));
1737         _mm_storel_pi(reinterpret_cast<__m64 *>(&data[i[6]]), hi128(tmp1));
1738         _mm_storeh_pi(reinterpret_cast<__m64 *>(&data[i[7]]), hi128(tmp1));
1739     }/*}}}*/
1740     static inline void interleave(typename V::EntryType *const data, const Common::SuccessiveEntries<2> &i,/*{{{*/
1741             const typename V::AsArg v0, const typename V::AsArg v1)
1742     {
1743         using namespace AVX;
1744         // [0a 1a 0b 1b 0e 1e 0f 1f]:
1745         const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
1746         // [0c 1c 0d 1d 0g 1g 0h 1h]:
1747         const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v1.data()));
1748         _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), lo128(tmp0));
1749         _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), lo128(tmp1));
1750         _mm_storeu_ps(aliasing_cast<float>(&data[i[4]]), hi128(tmp0));
1751         _mm_storeu_ps(aliasing_cast<float>(&data[i[6]]), hi128(tmp1));
1752     }/*}}}*/
1753     // interleave scatter 3 {{{
1754     template <typename I>
1755     static inline void interleave(typename V::EntryType *const data, const I &i,
1756                                   const typename V::AsArg v0, const typename V::AsArg v1,
1757                                   const typename V::AsArg v2)
1758     {
1759         using namespace AVX;
1760 #ifdef Vc_USE_MASKMOV_SCATTER
1761         // [0a 2a 0b 2b 0e 2e 0f 2f]:
1762         const m256 tmp0 = _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1763         // [0c 2c 0d 2d 0g 2g 0h 2h]:
1764         const m256 tmp1 = _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1765         // [1a __ 1b __ 1e __ 1f __]:
1766         const m256 tmp2 = _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
1767         // [1c __ 1d __ 1g __ 1h __]:
1768         const m256 tmp3 = _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v1.data()));
1769         const m256 tmp4 = _mm256_unpacklo_ps(tmp0, tmp2);
1770         const m256 tmp5 = _mm256_unpackhi_ps(tmp0, tmp2);
1771         const m256 tmp6 = _mm256_unpacklo_ps(tmp1, tmp3);
1772         const m256 tmp7 = _mm256_unpackhi_ps(tmp1, tmp3);
1773         const m128i mask = _mm_set_epi32(0, -1, -1, -1);
1774         _mm_maskstore_ps(aliasing_cast<float>(&data[i[0]]), mask, lo128(tmp4));
1775         _mm_maskstore_ps(aliasing_cast<float>(&data[i[1]]), mask, lo128(tmp5));
1776         _mm_maskstore_ps(aliasing_cast<float>(&data[i[2]]), mask, lo128(tmp6));
1777         _mm_maskstore_ps(aliasing_cast<float>(&data[i[3]]), mask, lo128(tmp7));
1778         _mm_maskstore_ps(aliasing_cast<float>(&data[i[4]]), mask, hi128(tmp4));
1779         _mm_maskstore_ps(aliasing_cast<float>(&data[i[5]]), mask, hi128(tmp5));
1780         _mm_maskstore_ps(aliasing_cast<float>(&data[i[6]]), mask, hi128(tmp6));
1781         _mm_maskstore_ps(aliasing_cast<float>(&data[i[7]]), mask, hi128(tmp7));
1782 #else
1783         interleave(data, i, v0, v1);
1784         v2.scatter(data + 2, i);
1785 #endif
1786     }  // }}}
1787     // interleave successive 3 {{{
1788     static inline void interleave(typename V::EntryType *const data,
1789                                   const Common::SuccessiveEntries<3> &i,
1790                                   const typename V::AsArg v0_,
1791                                   const typename V::AsArg v1_,
1792                                   const typename V::AsArg v2_)
1793     {
1794         __m256 v0 = AVX::avx_cast<__m256>(v0_.data());  // a0 a1 a2 a3|a4 a5 a6 a7
1795         __m256 v1 = AVX::avx_cast<__m256>(v1_.data());  // b0 b1 b2 b3|b4 b5 b6 b7
1796         __m256 v2 = AVX::avx_cast<__m256>(v2_.data());  // c0 c1 c2 c3|c4 c5 c6 c7
1797 
1798         v0 = _mm256_shuffle_ps(v0, v0, 0x6c);  // a0 a3 a2 a1|a4 a7 a6 a5
1799         v1 = _mm256_shuffle_ps(v1, v1, 0xb1);  // b1 b0 b3 b2|b5 b4 b7 b6
1800         v2 = _mm256_shuffle_ps(v2, v2, 0xc6);  // c2 c1 c0 c3|c6 c5 c4 c7
1801 
1802         // a0 b0 c0 a1|c6 a7 b7 c7:
1803         __m256 w0 = Mem::blend<X0, X1, Y2, X3, Y4, X5, X6, Y7>(
1804                     Mem::blend<X0, Y1, X2, X3, X4, X5, Y6, X7>(v0, v1), v2);
1805         // b1 c1 a2 b2|b5 c5 a6 b6:
1806         __m256 w1 = Mem::blend<X0, Y1, X2, X3, X4, Y5, X6, X7>(
1807                     Mem::blend<Y0, X1, X2, Y3, Y4, X5, X6, Y7>(v0, v1), v2);
1808         // c2 a3 b3 c3|a4 b4 c4 a5:
1809         __m256 w2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(
1810                     Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(v0, v1), v2);
1811 
1812         // a0 b0 c0 a1|b1 c1 a2 b2:
1813         _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]]),
1814                          _mm256_permute2f128_ps(w0, w1, 0x20));
1815         // c2 a3 b3 c3|a4 b4 c4 a5: w2
1816         _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 8), w2);
1817         // b5 c5 a6 b6|c6 a7 b7 c7:
1818         _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 16),
1819                          _mm256_permute2f128_ps(w1, w0, 0x31));
1820 
1821     }  //}}}
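    // The SuccessiveEntries<3> overload above is the contiguous struct-of-three
    // store: record n starts at data[i[0] + 3 * n], so the eight records occupy 24
    // consecutive entries laid out as a0 b0 c0 a1 b1 c1 ... a7 b7 c7. The per-lane
    // shuffles plus blends already build that order, and the 128-bit permutes pair
    // up the right halves so three unaligned 256-bit stores cover all 24 values.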
1822     // interleave scatter 4 {{{
1823     template <typename I>
1824     static inline void interleave(typename V::EntryType *const data, const I &i,
1825                                   const typename V::AsArg v0, const typename V::AsArg v1,
1826                                   const typename V::AsArg v2, const typename V::AsArg v3)
1827     {
1828         using namespace AVX;
1829         const __m256 tmp0 =
1830             _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1831         const __m256 tmp1 =
1832             _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1833         const __m256 tmp2 =
1834             _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
1835         const __m256 tmp3 =
1836             _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
1837         const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2);
1838         const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2);
1839         const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3);
1840         const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3);
1841         _mm_storeu_ps(aliasing_cast<float>(&data[i[0]]), lo128(_04));
1842         _mm_storeu_ps(aliasing_cast<float>(&data[i[1]]), lo128(_15));
1843         _mm_storeu_ps(aliasing_cast<float>(&data[i[2]]), lo128(_26));
1844         _mm_storeu_ps(aliasing_cast<float>(&data[i[3]]), lo128(_37));
1845         _mm_storeu_ps(aliasing_cast<float>(&data[i[4]]), hi128(_04));
1846         _mm_storeu_ps(aliasing_cast<float>(&data[i[5]]), hi128(_15));
1847         _mm_storeu_ps(aliasing_cast<float>(&data[i[6]]), hi128(_26));
1848         _mm_storeu_ps(aliasing_cast<float>(&data[i[7]]), hi128(_37));
1849     }  // }}}
1850     // interleave successive 4 {{{
1851     // same as above except for the stores, which can be combined into 256-bit stores
1852     static inline void interleave(typename V::EntryType *const data,
1853                                   const Common::SuccessiveEntries<4> &i,
1854                                   const typename V::AsArg v0, const typename V::AsArg v1,
1855                                   const typename V::AsArg v2, const typename V::AsArg v3)
1856     {
1857         using namespace AVX;
1858         const __m256 tmp0 =
1859             _mm256_unpacklo_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1860         const __m256 tmp1 =
1861             _mm256_unpackhi_ps(avx_cast<m256>(v0.data()), avx_cast<m256>(v2.data()));
1862         const __m256 tmp2 =
1863             _mm256_unpacklo_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
1864         const __m256 tmp3 =
1865             _mm256_unpackhi_ps(avx_cast<m256>(v1.data()), avx_cast<m256>(v3.data()));
1866         const __m256 _04 = _mm256_unpacklo_ps(tmp0, tmp2);
1867         const __m256 _15 = _mm256_unpackhi_ps(tmp0, tmp2);
1868         const __m256 _26 = _mm256_unpacklo_ps(tmp1, tmp3);
1869         const __m256 _37 = _mm256_unpackhi_ps(tmp1, tmp3);
1870         _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]]),
1871                          _mm256_permute2f128_ps(_04, _15, 0x20));
1872         _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 8),
1873                          _mm256_permute2f128_ps(_26, _37, 0x20));
1874         _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 16),
1875                          _mm256_permute2f128_ps(_04, _15, 0x31));
1876         _mm256_storeu_ps(aliasing_cast<float>(&data[i[0]] + 24),
1877                          _mm256_permute2f128_ps(_26, _37, 0x31));
1878     }                      // }}}
1879     template <typename I>  // interleave 5 args {{{2
1880     static inline void interleave(typename V::EntryType *const data, const I &i,
1881                                   const typename V::AsArg v0, const typename V::AsArg v1,
1882                                   const typename V::AsArg v2, const typename V::AsArg v3,
1883                                   const typename V::AsArg v4)
1884     {
1885         interleave(data, i, v0, v1, v2, v3);
1886         v4.scatter(data + 4, i);
1887     }
1888     template <typename I>  // interleave 6 args {{{2
1889     static inline void interleave(typename V::EntryType *const data, const I &i,
1890                                   const typename V::AsArg v0, const typename V::AsArg v1,
1891                                   const typename V::AsArg v2, const typename V::AsArg v3,
1892                                   const typename V::AsArg v4, const typename V::AsArg v5)
1893     {
1894         interleave(data, i, v0, v1, v2, v3);
1895         interleave(data + 4, i, v4, v5);
1896     }
1897     template <typename I>  // interleave 7 args {{{2
1898     static inline void interleave(typename V::EntryType *const data, const I &i,
1899                                   const typename V::AsArg v0, const typename V::AsArg v1,
1900                                   const typename V::AsArg v2, const typename V::AsArg v3,
1901                                   const typename V::AsArg v4, const typename V::AsArg v5,
1902                                   const typename V::AsArg v6)
1903     {
1904         interleave(data, i, v0, v1, v2, v3);
1905         interleave(data + 4, i, v4, v5, v6);
1906     }
1907     template <typename I>  // interleave 8 args {{{2
1908     static inline void interleave(typename V::EntryType *const data, const I &i,
1909                                   const typename V::AsArg v0, const typename V::AsArg v1,
1910                                   const typename V::AsArg v2, const typename V::AsArg v3,
1911                                   const typename V::AsArg v4, const typename V::AsArg v5,
1912                                   const typename V::AsArg v6, const typename V::AsArg v7)
1913     {
1914         interleave(data, i, v0, v1, v2, v3);
1915         interleave(data + 4, i, v4, v5, v6, v7);
1916     }
1917     //}}}2
1918     // deinterleave scatter 2 {{{
1919     template <typename I>
1920     static inline void deinterleave(typename V::EntryType const *const data, const I &i,
1921                                     V &v0, V &v1)
1922     {
1923         using namespace AVX;
1924         const m128  il0 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[0]])); // a0 b0
1925         const m128  il2 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[2]])); // a2 b2
1926         const m128  il4 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[4]])); // a4 b4
1927         const m128  il6 = _mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const *>(&data[i[6]])); // a6 b6
1928         const m128 il01 = _mm_loadh_pi(             il0, reinterpret_cast<__m64 const *>(&data[i[1]])); // a0 b0 a1 b1
1929         const m128 il23 = _mm_loadh_pi(             il2, reinterpret_cast<__m64 const *>(&data[i[3]])); // a2 b2 a3 b3
1930         const m128 il45 = _mm_loadh_pi(             il4, reinterpret_cast<__m64 const *>(&data[i[5]])); // a4 b4 a5 b5
1931         const m128 il67 = _mm_loadh_pi(             il6, reinterpret_cast<__m64 const *>(&data[i[7]])); // a6 b6 a7 b7
1932 
1933         const m256 tmp2 = concat(il01, il45);
1934         const m256 tmp3 = concat(il23, il67);
1935 
1936         const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
1937         const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3);
1938 
1939         v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
1940         v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
1941     }  // }}}
1942     // deinterleave successive 2 {{{
1943     static inline void deinterleave(typename V::EntryType const *const data,
1944                                     const Common::SuccessiveEntries<2> &i, V &v0, V &v1)
1945     {
1946         using namespace AVX;
1947         const m256 il0123 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]])); // a0 b0 a1 b1 a2 b2 a3 b3
1948         const m256 il4567 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[4]])); // a4 b4 a5 b5 a6 b6 a7 b7
1949 
1950         const m256 tmp2 = Mem::shuffle128<X0, Y0>(il0123, il4567);
1951         const m256 tmp3 = Mem::shuffle128<X1, Y1>(il0123, il4567);
1952 
1953         const m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
1954         const m256 tmp1 = _mm256_unpackhi_ps(tmp2, tmp3);
1955 
1956         v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
1957         v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
1958     }  // }}}
1959     // deinterleave scatter 3 {{{
1960     template <typename I>
1961     static inline void deinterleave(typename V::EntryType const *const data, const I &i,
1962                                     V &v0, V &v1, V &v2)
1963     {
1964         using namespace AVX;
1965         const m128  il0 = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]])); // a0 b0 c0 d0
1966         const m128  il1 = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]])); // a1 b1 c1 d1
1967         const m128  il2 = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]])); // a2 b2 c2 d2
1968         const m128  il3 = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]])); // a3 b3 c3 d3
1969         const m128  il4 = _mm_loadu_ps(aliasing_cast<float>(&data[i[4]])); // a4 b4 c4 d4
1970         const m128  il5 = _mm_loadu_ps(aliasing_cast<float>(&data[i[5]])); // a5 b5 c5 d5
1971         const m128  il6 = _mm_loadu_ps(aliasing_cast<float>(&data[i[6]])); // a6 b6 c6 d6
1972         const m128  il7 = _mm_loadu_ps(aliasing_cast<float>(&data[i[7]])); // a7 b7 c7 d7
1973 
1974         const m256 il04 = concat(il0, il4);
1975         const m256 il15 = concat(il1, il5);
1976         const m256 il26 = concat(il2, il6);
1977         const m256 il37 = concat(il3, il7);
1978         const m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
1979         const m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
1980         const m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
1981         const m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
1982         v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
1983         v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
1984         v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
1985     }  // }}}
1986     // deinterleave successive 3 {{{
1987     static inline void deinterleave(typename V::EntryType const *const data,
1988                                     const Common::SuccessiveEntries<3> &i, V &v0, V &v1,
1989                                     V &v2)
1990     {
1991         // 0a 1a 2a 0b 1b 2b 0c 1c
1992         __m256 in0 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 0));
1993         // 2c 0d 1d 2d 0e 1e 2e 0f
1994         __m256 in1 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 8));
1995         // 1f 2f 0g 1g 2g 0h 1h 2h
1996         __m256 in2 = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]] + 16));
1997 
1998         // swap(v0.hi, v2.lo):
1999         //      [0a 1a 2a 0b 1f 2f 0g 1g]
2000         //      [2c 0d 1d 2d 0e 1e 2e 0f]
2001         //      [1b 2b 0c 1c 2g 0h 1h 2h]
2002         const __m256 aaabffgg = _mm256_permute2f128_ps(in0, in2, 0x20);
2003         const __m256 cdddeeef = in1;
2004         const __m256 bbccghhh = _mm256_permute2f128_ps(in0, in2, 0x31);
2005         // blend:
2006         // 0: [a d c b e h g f]
2007         // 1: [b a d c f e h g]
2008         // 2: [c b a d g f e h]
2009         const __m256 x0 = _mm256_blend_ps(
2010             _mm256_blend_ps(aaabffgg, cdddeeef, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80),
2011             bbccghhh, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0);
2012         const __m256 x1 = _mm256_blend_ps(
2013             _mm256_blend_ps(aaabffgg, cdddeeef, 0 + 0 + 4 + 0 + 0 + 0x20 + 0 + 0),
2014             bbccghhh, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0);
2015         const __m256 x2 = _mm256_blend_ps(
2016             _mm256_blend_ps(aaabffgg, cdddeeef, 1 + 0 + 0 + 8 + 0 + 0 + 0x40 + 0),
2017             bbccghhh, 0 + 2 + 0 + 0 + 0x10 + 0 + 0 + 0x80);
2018         // 0: [a d c b e h g f] >-perm(0, 3, 2, 1)-> [a b c d e f g h]
2019         // 1: [b a d c f e h g] >-perm(1, 0, 3, 2)-> [a b c d e f g h]
2020         // 2: [c b a d g f e h] >-perm(2, 1, 0, 3)-> [a b c d e f g h]
2021         v0 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x0, x0, 0x6c));
2022         v1 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x1, x1, 0xb1));
2023         v2 = AVX::avx_cast<typename V::VectorType>(_mm256_shuffle_ps(x2, x2, 0xc6));
2024     }  // }}}
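    // The SuccessiveEntries<3> deinterleave reverses that store: three 256-bit loads
    // fetch the 24 contiguous values, a 128-bit permute regroups the lanes, the
    // blends separate the a/b/c components, and the final per-lane shuffles
    // (0x6c, 0xb1, 0xc6) restore natural element order; they are their own inverses,
    // so they exactly undo the shuffles used on the interleave path.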
2025     // deinterleave scatter 4 {{{
2026     template <typename I>
2027     static inline void deinterleave(typename V::EntryType const *const data, const I &i,
2028                                     V &v0, V &v1, V &v2, V &v3)
2029     {
2030         using namespace AVX;
2031         const m128  il0 = _mm_loadu_ps(aliasing_cast<float>(&data[i[0]])); // a0 b0 c0 d0
2032         const m128  il1 = _mm_loadu_ps(aliasing_cast<float>(&data[i[1]])); // a1 b1 c1 d1
2033         const m128  il2 = _mm_loadu_ps(aliasing_cast<float>(&data[i[2]])); // a2 b2 c2 d2
2034         const m128  il3 = _mm_loadu_ps(aliasing_cast<float>(&data[i[3]])); // a3 b3 c3 d3
2035         const m128  il4 = _mm_loadu_ps(aliasing_cast<float>(&data[i[4]])); // a4 b4 c4 d4
2036         const m128  il5 = _mm_loadu_ps(aliasing_cast<float>(&data[i[5]])); // a5 b5 c5 d5
2037         const m128  il6 = _mm_loadu_ps(aliasing_cast<float>(&data[i[6]])); // a6 b6 c6 d6
2038         const m128  il7 = _mm_loadu_ps(aliasing_cast<float>(&data[i[7]])); // a7 b7 c7 d7
2039 
2040         const m256 il04 = concat(il0, il4);
2041         const m256 il15 = concat(il1, il5);
2042         const m256 il26 = concat(il2, il6);
2043         const m256 il37 = concat(il3, il7);
2044         const m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
2045         const m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
2046         const m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
2047         const m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
2048         v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
2049         v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
2050         v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
2051         v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(cd0246, cd1357));
2052     }  // }}}
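         // Reference semantics of the indexed overload above (illustration
         // only): vK[n] = data[i[n] + K] for K = 0..3.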
2053     // deinterleave successive 4 {{{
2054     static inline void deinterleave(typename V::EntryType const *const data,
2055                                     const Common::SuccessiveEntries<4> &i, V &v0, V &v1,
2056                                     V &v2, V &v3)
2057     {
2058         using namespace AVX;
2059         const __m256 il01 = _mm256_loadu_ps(
2060             aliasing_cast<float>(&data[i[0]]));  // a0 b0 c0 d0 | a1 b1 c1 d1
2061         const __m256 il23 = _mm256_loadu_ps(
2062             aliasing_cast<float>(&data[i[2]]));  // a2 b2 c2 d2 | a3 b3 c3 d3
2063         const __m256 il45 = _mm256_loadu_ps(
2064             aliasing_cast<float>(&data[i[4]]));  // a4 b4 c4 d4 | a5 b5 c5 d5
2065         const __m256 il67 = _mm256_loadu_ps(
2066             aliasing_cast<float>(&data[i[6]]));  // a6 b6 c6 d6 | a7 b7 c7 d7
2067 
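             // Regroup the 128-bit halves so that record k is paired with
             // record k+4, e.g. il04 = a0 b0 c0 d0 | a4 b4 c4 d4; the same two
             // unpack passes as in the indexed variant then transpose the 8x4
             // block.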
2068         const __m256 il04 = _mm256_permute2f128_ps(il01, il45, 0x20);
2069         const __m256 il15 = _mm256_permute2f128_ps(il01, il45, 0x31);
2070         const __m256 il26 = _mm256_permute2f128_ps(il23, il67, 0x20);
2071         const __m256 il37 = _mm256_permute2f128_ps(il23, il67, 0x31);
2072         const __m256 ab0246 = _mm256_unpacklo_ps(il04, il26);
2073         const __m256 ab1357 = _mm256_unpacklo_ps(il15, il37);
2074         const __m256 cd0246 = _mm256_unpackhi_ps(il04, il26);
2075         const __m256 cd1357 = _mm256_unpackhi_ps(il15, il37);
2076         v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(ab0246, ab1357));
2077         v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(ab0246, ab1357));
2078         v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(cd0246, cd1357));
2079         v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(cd0246, cd1357));
2080     }  // }}}
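         // The 5- to 8-vector overloads below are composed from the smaller
         // deinterleave overloads (plus a gather for a single trailing vector),
         // offset by the position of the first field they handle in each record.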
2081     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
2082             const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
2083     {
2084         v4.gather(data + 4, i);
2085         deinterleave(data, i, v0, v1, v2, v3);
2086     }/*}}}*/
2087     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
2088             const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
2089     {
2090         deinterleave(data, i, v0, v1, v2, v3);
2091         deinterleave(data + 4, i, v4, v5);
2092     }/*}}}*/
2093     static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
2094             const Common::SuccessiveEntries<6> &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
2095     {
2096         using namespace AVX;
2097         const m256 a = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0]]));
2098         const m256 b = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 1 * V::Size]));
2099         const m256 c = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 2 * V::Size]));
2100         const m256 d = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 3 * V::Size]));
2101         const m256 e = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 4 * V::Size]));
2102         const m256 f = _mm256_loadu_ps(aliasing_cast<float>(&data[i[0] + 5 * V::Size]));
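             // Six 256-bit loads cover the 48 entries of the 8 records with
             // stride 6; the shuffle128/unpack cascade below transposes them so
             // that v0..v5 each hold one field of all eight records.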
2103         const __m256 tmp2 = Mem::shuffle128<X0, Y0>(a, d);
2104         const __m256 tmp3 = Mem::shuffle128<X1, Y1>(b, e);
2105         const __m256 tmp4 = Mem::shuffle128<X1, Y1>(a, d);
2106         const __m256 tmp5 = Mem::shuffle128<X0, Y0>(c, f);
2107         const __m256 tmp8 = Mem::shuffle128<X0, Y0>(b, e);
2108         const __m256 tmp9 = Mem::shuffle128<X1, Y1>(c, f);
2109         const __m256 tmp0 = _mm256_unpacklo_ps(tmp2, tmp3);
2110         const __m256 tmp1 = _mm256_unpackhi_ps(tmp4, tmp5);
2111         const __m256 tmp6 = _mm256_unpackhi_ps(tmp2, tmp3);
2112         const __m256 tmp7 = _mm256_unpacklo_ps(tmp8, tmp9);
2113         const __m256 tmp10 = _mm256_unpacklo_ps(tmp4, tmp5);
2114         const __m256 tmp11 = _mm256_unpackhi_ps(tmp8, tmp9);
2115         v0.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp0, tmp1));
2116         v1.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp0, tmp1));
2117         v2.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp6, tmp7));
2118         v3.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp6, tmp7));
2119         v4.data() = avx_cast<typename V::VectorType>(_mm256_unpacklo_ps(tmp10, tmp11));
2120         v5.data() = avx_cast<typename V::VectorType>(_mm256_unpackhi_ps(tmp10, tmp11));
2121     }/*}}}*/
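         // Reference semantics (illustration only):
         //   vK[n] = data[i[0] + 6 * n + K] for K = 0..5.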
2122     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
2123             const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
2124     {
2125         deinterleave(data, i, v0, v1, v2, v3);
2126         deinterleave(data + 4, i, v4, v5, v6);
2127     }/*}}}*/
2128     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
2129             const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
2130     {
2131         deinterleave(data, i, v0, v1, v2, v3);
2132         deinterleave(data + 4, i, v4, v5, v6, v7);
2133     }/*}}}*/
2134 };
2135 template<typename V> struct InterleaveImpl<V, 4, 32> {
2136     template <typename I>  // interleave 2 args{{{2
2137     static inline void interleave(typename V::EntryType *const data, const I &i,
2138                                   const typename V::AsArg v0, const typename V::AsArg v1)
2139     {
2140         using namespace AVX;
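             // tmp0 = 0a 1a | 0c 1c, tmp1 = 0b 1b | 0d 1d: each 128-bit half is
             // one interleaved (v0, v1) record and is stored to its own
             // data[i[n]] below.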
2141         const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
2142         const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
2143         _mm_storeu_pd(&data[i[0]], lo128(tmp0));
2144         _mm_storeu_pd(&data[i[1]], lo128(tmp1));
2145         _mm_storeu_pd(&data[i[2]], hi128(tmp0));
2146         _mm_storeu_pd(&data[i[3]], hi128(tmp1));
2147     }
2148     template <typename I>  // interleave 3 args{{{2
2149     static inline void interleave(typename V::EntryType *const data, const I &i,
2150                                   const typename V::AsArg v0, const typename V::AsArg v1,
2151                                   const typename V::AsArg v2)
2152     {
2153         using namespace AVX;
2154 #ifdef Vc_USE_MASKMOV_SCATTER
2155         const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
2156         const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
2157         const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v2.data());
2158         const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v2.data());
2159 
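             // Store only the first three doubles of each record: mask element
             // 3 is zero, so the fourth lane is left untouched and neighbouring
             // data is not overwritten.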
2160 #if defined(Vc_MSVC) && (Vc_MSVC < 170000000 || !defined(_WIN64))
2161         // _mm256_set_epi64x requires MSVC 2012 or later and a 64-bit target; otherwise build the mask from two 128-bit halves
2162         const m256i mask = concat(_mm_setallone_si128(), _mm_set_epi32(0, 0, -1, -1));
2163 #else
2164         const m256i mask = _mm256_set_epi64x(0, -1, -1, -1);
2165 #endif
2166         _mm256_maskstore_pd(&data[i[0]], mask, Mem::shuffle128<X0, Y0>(tmp0, tmp2));
2167         _mm256_maskstore_pd(&data[i[1]], mask, Mem::shuffle128<X0, Y0>(tmp1, tmp3));
2168         _mm256_maskstore_pd(&data[i[2]], mask, Mem::shuffle128<X1, Y1>(tmp0, tmp2));
2169         _mm256_maskstore_pd(&data[i[3]], mask, Mem::shuffle128<X1, Y1>(tmp1, tmp3));
2170 #else
2171         interleave(data, i, v0, v1);
2172         v2.scatter(data + 2, i);
2173 #endif
2174     }
2175     template <typename I>  // interleave 4 args{{{2
2176     static inline void interleave(typename V::EntryType *const data, const I &i,
2177                                   const typename V::AsArg v0, const typename V::AsArg v1,
2178                                   const typename V::AsArg v2, const typename V::AsArg v3)
2179     {
2180         using namespace AVX;
2181         // 0a 1a 0c 1c:
2182         const m256d tmp0 = _mm256_unpacklo_pd(v0.data(), v1.data());
2183         // 0b 1b 0d 1d:
2184         const m256d tmp1 = _mm256_unpackhi_pd(v0.data(), v1.data());
2185         // 2a 3a 2c 3c:
2186         const m256d tmp2 = _mm256_unpacklo_pd(v2.data(), v3.data());
2187         // 2b 3b 2d 3d:
2188         const m256d tmp3 = _mm256_unpackhi_pd(v2.data(), v3.data());
2189         /* The following would be more compact, and might become more efficient once 256-bit
2190          * stores are no longer split internally into two 128-bit stores.
2191         _mm256_storeu_pd(&data[i[0]], Mem::shuffle128<X0, Y0>(tmp0, tmp2));
2192         _mm256_storeu_pd(&data[i[1]], Mem::shuffle128<X0, Y0>(tmp1, tmp3));
2193         _mm256_storeu_pd(&data[i[2]], Mem::shuffle128<X1, Y1>(tmp0, tmp2));
2194         _mm256_storeu_pd(&data[i[3]], Mem::shuffle128<X1, Y1>(tmp1, tmp3));
2195         */
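             // Until then, write each record as two 128-bit halves: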
2196         _mm_storeu_pd(&data[i[0]  ], lo128(tmp0));
2197         _mm_storeu_pd(&data[i[0]+2], lo128(tmp2));
2198         _mm_storeu_pd(&data[i[1]  ], lo128(tmp1));
2199         _mm_storeu_pd(&data[i[1]+2], lo128(tmp3));
2200         _mm_storeu_pd(&data[i[2]  ], hi128(tmp0));
2201         _mm_storeu_pd(&data[i[2]+2], hi128(tmp2));
2202         _mm_storeu_pd(&data[i[3]  ], hi128(tmp1));
2203         _mm_storeu_pd(&data[i[3]+2], hi128(tmp3));
2204     }
2205     template <typename I>  // interleave 5 args {{{2
2206     static inline void interleave(typename V::EntryType *const data, const I &i,
2207                                   const typename V::AsArg v0, const typename V::AsArg v1,
2208                                   const typename V::AsArg v2, const typename V::AsArg v3,
2209                                   const typename V::AsArg v4)
2210     {
2211         interleave(data, i, v0, v1, v2, v3);
2212         v4.scatter(data + 4, i);
2213     }
2214     template <typename I>  // interleave 6 args {{{2
2215     static inline void interleave(typename V::EntryType *const data, const I &i,
2216                                   const typename V::AsArg v0, const typename V::AsArg v1,
2217                                   const typename V::AsArg v2, const typename V::AsArg v3,
2218                                   const typename V::AsArg v4, const typename V::AsArg v5)
2219     {
2220         interleave(data, i, v0, v1, v2, v3);
2221         interleave(data + 4, i, v4, v5);
2222     }
2223     template <typename I>  // interleave 7 args {{{2
2224     static inline void interleave(typename V::EntryType *const data, const I &i,
2225                                   const typename V::AsArg v0, const typename V::AsArg v1,
2226                                   const typename V::AsArg v2, const typename V::AsArg v3,
2227                                   const typename V::AsArg v4, const typename V::AsArg v5,
2228                                   const typename V::AsArg v6)
2229     {
2230         interleave(data, i, v0, v1, v2, v3);
2231         interleave(data + 4, i, v4, v5, v6);
2232     }
2233     template <typename I>  // interleave 8 args {{{2
2234     static inline void interleave(typename V::EntryType *const data, const I &i,
2235                                   const typename V::AsArg v0, const typename V::AsArg v1,
2236                                   const typename V::AsArg v2, const typename V::AsArg v3,
2237                                   const typename V::AsArg v4, const typename V::AsArg v5,
2238                                   const typename V::AsArg v6, const typename V::AsArg v7)
2239     {
2240         interleave(data, i, v0, v1, v2, v3);
2241         interleave(data + 4, i, v4, v5, v6, v7);
2242     }
2243     //}}}2
2244     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
2245             const I &i, V &v0, V &v1)
2246     {
2247         using namespace Vc::AVX;
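             // ab02 = 0a 1a | 0c 1c and ab13 = 0b 1b | 0d 1d; unpacklo/hi then
             // regroup them into v0 = 0a 0b 0c 0d and v1 = 1a 1b 1c 1d.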
2248         const m256d ab02 = concat(_mm_loadu_pd(&data[i[0]]), _mm_loadu_pd(&data[i[2]]));
2249         const m256d ab13 = concat(_mm_loadu_pd(&data[i[1]]), _mm_loadu_pd(&data[i[3]]));
2250 
2251         v0.data() = _mm256_unpacklo_pd(ab02, ab13);
2252         v1.data() = _mm256_unpackhi_pd(ab02, ab13);
2253     }/*}}}*/
2254     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
2255             const I &i, V &v0, V &v1, V &v2)
2256     {
2257         v2.gather(data + 2, i);
2258         deinterleave(data, i, v0, v1);
2259     }/*}}}*/
2260     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
2261             const I &i, V &v0, V &v1, V &v2, V &v3)
2262     {
2263         deinterleave(data, i, v0, v1);
2264         deinterleave(data + 2, i, v2, v3);
2265     }/*}}}*/
2266     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
2267             const I &i, V &v0, V &v1, V &v2, V &v3, V &v4)
2268     {
2269         v4.gather(data + 4, i);
2270         deinterleave(data, i, v0, v1);
2271         deinterleave(data + 2, i, v2, v3);
2272     }/*}}}*/
2273     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
2274             const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5)
2275     {
2276         deinterleave(data, i, v0, v1);
2277         deinterleave(data + 2, i, v2, v3);
2278         deinterleave(data + 4, i, v4, v5);
2279     }/*}}}*/
2280     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
2281             const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6)
2282     {
2283         v6.gather(data + 6, i);
2284         deinterleave(data, i, v0, v1);
2285         deinterleave(data + 2, i, v2, v3);
2286         deinterleave(data + 4, i, v4, v5);
2287     }/*}}}*/
2288     template<typename I> static inline void deinterleave(typename V::EntryType const *const data,/*{{{*/
2289             const I &i, V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7)
2290     {
2291         deinterleave(data, i, v0, v1);
2292         deinterleave(data + 2, i, v2, v3);
2293         deinterleave(data + 4, i, v4, v5);
2294         deinterleave(data + 6, i, v6, v7);
2295     }/*}}}*/
2296 };
2297 //}}}1
2298 }  // namespace Detail
2299 }  // namespace Vc
2300 
2301 #endif  // VC_AVX_DETAIL_H_
2302 
2303 // vim: foldmethod=marker