/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX2_HPP
#define XSIMD_AVX2_HPP

#include <cassert> // for assert in the zip_lo/zip_hi fallbacks
#include <complex>
#include <cstring> // for std::memcpy in the 32-bit reduce_add path
#include <type_traits>

#include "../types/xsimd_avx2_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;
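
        // Dispatch sketch (illustrative, not part of this header): each kernel
        // below is selected through its requires_arch<avx2> tag parameter, so a
        // call such as
        //     xsimd::batch<int32_t, xsimd::avx2> a, b;
        //     auto c = a + b; // resolves to kernel::add(a, b, requires_arch<avx2>{})
        // picks the AVX2 overload when available and falls back to the avx or
        // generic kernels otherwise.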

        // abs
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_abs_epi8(self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_abs_epi16(self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_abs_epi32(self);
                }
                else
                {
                    return abs(self, avx {});
                }
            }
            return self;
        }

        // add
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_add_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_add_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_add_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_add_epi64(self, other);
            }
            else
            {
                return add(self, other, avx {});
            }
        }

        // avgr
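        // vpavgb/vpavgw compute the rounded average (a + b + 1) >> 1 without
        // intermediate overflow; wider element sizes fall back to the generic
        // kernel.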
        template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_avg_epu8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_avg_epu16(self, other);
            }
            else
            {
                return avgr(self, other, generic {});
            }
        }

        // avg
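        // The identity used below: avgr rounds up, and
        // avgr(a, b) - avg(a, b) == (a ^ b) & 1, so isolating the low bit of
        // a ^ b with the shift pair (the `adj` term) and subtracting it turns
        // the rounded average into the truncating one.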
        template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                auto adj = ((self ^ other) << 7) >> 7;
                return avgr(self, other, A {}) - adj;
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                auto adj = ((self ^ other) << 15) >> 15;
                return avgr(self, other, A {}) - adj;
            }
            else
            {
                return avg(self, other, generic {});
            }
        }

        // bitwise_and
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_and_si256(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_and_si256(self, other);
        }

        // bitwise_andnot
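        // Note: _mm256_andnot_si256(a, b) computes ~a & b, so the arguments are
        // passed in reverse to obtain self & ~other.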
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_andnot_si256(other, self);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_andnot_si256(other, self);
        }

        // bitwise_not
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
        }

        // bitwise_lshift
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_slli_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_slli_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_slli_epi64(self, other);
            }
            else
            {
                return bitwise_lshift(self, other, avx {});
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_sllv_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_sllv_epi64(self, other);
            }
            else
            {
                return bitwise_lshift(self, other, avx {});
            }
        }

        // bitwise_or
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_or_si256(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_or_si256(self, other);
        }

        // bitwise_rshift
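        // AVX2 has no 8-bit shifts, so the signed 8-bit path below shifts whole
        // 16-bit words arithmetically, masks off the bits that leaked across
        // byte boundaries (sign_mask), and re-fills the sign bits of lanes that
        // compare negative (cmp_is_negative).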
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF);
                    __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self);
                    __m256i res = _mm256_srai_epi16(self, other);
                    return _mm256_or_si256(
                        detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
                                           { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
                                           sign_mask, cmp_is_negative),
                        _mm256_andnot_si256(sign_mask, res));
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_srai_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srai_epi32(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_srli_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srli_epi32(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm256_srli_epi64(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srav_epi32(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srlv_epi32(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm256_srlv_epi64(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
        }

        // bitwise_xor
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, other);
        }

        // complex_low
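        // complex_low/complex_high rebuild the interleaved (real, imag) memory
        // layout from the split representation; the blend mask 10 (0b1010)
        // selects the permuted imaginary parts into the odd lanes.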
        template <class A>
        XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
        {
            __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0));
            __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0));
            return _mm256_blend_pd(tmp0, tmp1, 10);
        }

        // complex_high
        template <class A>
        XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
        {
            __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2));
            __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0));
            return _mm256_blend_pd(tmp0, tmp1, 10);
        }

        // fast_cast
        namespace detail
        {

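            // Sketch of the bias trick used by both conversions below: the
            // 64-bit integer is split into 32-bit halves, each half is ORed
            // into the mantissa of a large power-of-two double (2^84 for the
            // high half, 2^52 for the low half), and subtracting the combined
            // bias leaves the exactly-converted double value.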
            template <class A>
            XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                // adapted to avx
                __m256i xH = _mm256_srli_epi64(x, 32);
                xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); //  2^84
                __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
                                                 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
                __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); //  2^52
                __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); //  2^84 + 2^52
                return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
            }

            template <class A>
            XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                // adapted to avx
                __m256i xH = _mm256_srai_epi32(x, 16);
                xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
                xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); //  3*2^67
                __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
                                                 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
                __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); //  2^52
                __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); //  3*2^67 + 2^52
                return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
            }
        }

        // eq
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_cmpeq_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_cmpeq_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_cmpeq_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_cmpeq_epi64(self, other);
            }
            else
            {
                return eq(self, other, avx {});
            }
        }

        // gather
        template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
        XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                                        kernel::requires_arch<avx2>) noexcept
        {
            // scatter for this one is AVX512F+AVX512VL
            return _mm256_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
        }

        template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
        XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                                        kernel::requires_arch<avx2>) noexcept
        {
            // scatter for this one is AVX512F+AVX512VL
            return _mm256_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
        }

        template <class A, class U,
                  detail::enable_sized_integral_t<U, 4> = 0>
        XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, float const* src,
                                            batch<U, A> const& index,
                                            kernel::requires_arch<avx2>) noexcept
        {
            // scatter for this one is AVX512F+AVX512VL
            return _mm256_i32gather_ps(src, index, sizeof(float));
        }

        template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
        XSIMD_INLINE batch<double, A> gather(batch<double, A> const&, double const* src,
                                             batch<U, A> const& index,
                                             requires_arch<avx2>) noexcept
        {
            // scatter for this one is AVX512F+AVX512VL
            return _mm256_i64gather_pd(src, index, sizeof(double));
        }

        // gather: handmade conversions
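        // These overloads gather doubles through 32-bit indices in two 4-lane
        // halves (low and high 128 bits of the index vector), convert each half
        // to the narrower element type, and merge the two 128-bit results.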
        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
        XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, double const* src,
                                            batch<V, A> const& index,
                                            requires_arch<avx2>) noexcept
        {
            const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
            const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
            return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data));
        }

        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
        XSIMD_INLINE batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
                                              batch<V, A> const& index,
                                              requires_arch<avx2>) noexcept
        {
            const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
            const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
            return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data));
        }

        // lt
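        // AVX2 only provides signed integer compares, so a < b is computed as
        // b > a for signed types; unsigned comparisons fall back to the avx
        // kernel.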
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_cmpgt_epi8(other, self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_cmpgt_epi16(other, self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_cmpgt_epi32(other, self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm256_cmpgt_epi64(other, self);
                }
                else
                {
                    return lt(self, other, avx {});
                }
            }
            else
            {
                return lt(self, other, avx {});
            }
        }

        // load_complex
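        // load_complex deinterleaves two batches holding (real, imag) pairs
        // into separate real and imaginary batches; the shuffle/unpack step
        // picks the even and odd elements per 128-bit lane, and the cross-lane
        // permute fixes the lane ordering.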
        template <class A>
        XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
        {
            using batch_type = batch<float, A>;
            batch_type real = _mm256_castpd_ps(
                _mm256_permute4x64_pd(
                    _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))),
                    _MM_SHUFFLE(3, 1, 2, 0)));
            batch_type imag = _mm256_castpd_ps(
                _mm256_permute4x64_pd(
                    _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))),
                    _MM_SHUFFLE(3, 1, 2, 0)));
            return { real, imag };
        }
        template <class A>
        XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx2>) noexcept
        {
            using batch_type = batch<double, A>;
            batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
            batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
            return { real, imag };
        }
        // mask
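        // For 16-bit lanes, _mm256_movemask_epi8 yields two identical mask bits
        // per lane; detail::mask_lut appears to compress each group of 8
        // byte-mask bits down to 4 lane bits, one byte of the mask at a time.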
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
                return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12);
            }
            else
            {
                return mask(self, avx {});
            }
        }

        // max
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_max_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_max_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_max_epi32(self, other);
                }
                else
                {
                    return max(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_max_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_max_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_max_epu32(self, other);
                }
                else
                {
                    return max(self, other, avx {});
                }
            }
        }

        // min
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_min_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_min_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_min_epi32(self, other);
                }
                else
                {
                    return min(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_min_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_min_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_min_epu32(self, other);
                }
                else
                {
                    return min(self, other, avx {});
                }
            }
        }

        // mul
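        // There is no 8-bit multiply in AVX2. The byte path below performs two
        // 16-bit multiplies: one produces the even result bytes in the low half
        // of each word, the other produces the odd result bytes in the high
        // half after shifting the operands; a per-byte blend stitches them
        // together.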
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                __m256i mask_hi = _mm256_set1_epi32(0xFF00FF00);
                __m256i res_lo = _mm256_mullo_epi16(self, other);
                __m256i other_hi = _mm256_srli_epi16(other, 8);
                __m256i self_hi = _mm256_and_si256(self, mask_hi);
                __m256i res_hi = _mm256_mullo_epi16(self_hi, other_hi);
                __m256i res = _mm256_blendv_epi8(res_lo, res_hi, mask_hi);
                return res;
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_mullo_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_mullo_epi32(self, other);
            }
            else
            {
                return mul(self, other, avx {});
            }
        }

        // reduce_add
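        // The 32-bit reduction folds the batch with two horizontal adds (each
        // operates within 128-bit lanes), then adds the two lanes; the 64-bit
        // reduction swaps the qword pairs instead, since there is no 64-bit
        // horizontal add.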
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                __m256i tmp1 = _mm256_hadd_epi32(self, self);
                __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
                __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
                __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
                return _mm_cvtsi128_si32(tmp4);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
                __m256i tmp2 = _mm256_add_epi64(self, tmp1);
                __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
                __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
#if defined(__x86_64__)
                return _mm_cvtsi128_si64(res);
#else
                __m128i m;
                _mm_storel_epi64(&m, res);
                int64_t i;
                std::memcpy(&i, &m, sizeof(i));
                return i;
#endif
            }
            else
            {
                return reduce_add(self, avx {});
            }
        }

        // rotate_left
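        // Note that _mm256_alignr_epi8 concatenates and shifts within each
        // 128-bit lane; with both operands equal, this rotates the two lanes
        // independently by N bytes.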
        template <size_t N, class A>
        XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<avx2>) noexcept
        {
            return _mm256_alignr_epi8(self, self, N);
        }
        template <size_t N, class A>
        XSIMD_INLINE batch<int16_t, A> rotate_left(batch<int16_t, A> const& self, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<int16_t>(rotate_left<N, A>(bitwise_cast<uint16_t>(self), avx2 {}));
        }

        // sadd
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_adds_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_adds_epi16(self, other);
                }
                else
                {
                    return sadd(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_adds_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_adds_epu16(self, other);
                }
                else
                {
                    return sadd(self, other, avx {});
                }
            }
        }

        // select
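        // _mm256_blendv_epi8 selects per byte from the sign bit of the
        // condition; since batch_bool lanes are all-zeros or all-ones, the same
        // intrinsic is valid for every element width.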
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else
            {
                return select(cond, true_br, false_br, avx {});
            }
        }
        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
        {
            constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
            // FIXME: for some reason mask here is not considered as an immediate,
            // but it's okay for _mm256_blend_epi32
            // case 2: return _mm256_blend_epi16(false_br, true_br, mask);
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_blend_epi32(false_br, true_br, mask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                constexpr int imask = detail::interleave(mask);
                return _mm256_blend_epi32(false_br, true_br, imask);
            }
            else
            {
                return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
            }
        }

        // slide_left
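        // slide_left/slide_right handle four regimes: a zero shift returns the
        // input, shifts of 256 bits or more return zero, shifts above 128 bits
        // reduce to a per-lane byte shift plus a lane permute, and shifts below
        // 128 bits combine a per-lane byte shift with the bytes carried across
        // the lane boundary via _mm256_permute2x128_si256.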
        template <size_t N, class A, class T>
        XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
        {
            constexpr unsigned BitCount = N * 8;
            if (BitCount == 0)
            {
                return x;
            }
            if (BitCount >= 256)
            {
                return batch<T, A>(T(0));
            }
            if (BitCount > 128)
            {
                constexpr unsigned M = (BitCount - 128) / 8;
                auto y = _mm256_bslli_epi128(x, M);
                return _mm256_permute2x128_si256(y, y, 0x28);
            }
            if (BitCount == 128)
            {
                return _mm256_permute2x128_si256(x, x, 0x28);
            }
            // shifting by [0, 128[ bits
            constexpr unsigned M = BitCount / 8;
            auto y = _mm256_bslli_epi128(x, M);
            auto z = _mm256_bsrli_epi128(x, 16 - M);
            auto w = _mm256_permute2x128_si256(z, z, 0x28);
            return _mm256_or_si256(y, w);
        }

        // slide_right
        template <size_t N, class A, class T>
        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
        {
            constexpr unsigned BitCount = N * 8;
            if (BitCount == 0)
            {
                return x;
            }
            if (BitCount >= 256)
            {
                return batch<T, A>(T(0));
            }
            if (BitCount > 128)
            {
                constexpr unsigned M = (BitCount - 128) / 8;
                auto y = _mm256_bsrli_epi128(x, M);
                return _mm256_permute2x128_si256(y, y, 0x81);
            }
            if (BitCount == 128)
            {
                return _mm256_permute2x128_si256(x, x, 0x81);
            }
            // shifting by [0, 128[ bits
            constexpr unsigned M = BitCount / 8;
            auto y = _mm256_bsrli_epi128(x, M);
            auto z = _mm256_bslli_epi128(x, 16 - M);
            auto w = _mm256_permute2x128_si256(z, z, 0x81);
            return _mm256_or_si256(y, w);
        }

        // ssub
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_subs_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_subs_epi16(self, other);
                }
                else
                {
                    return ssub(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_subs_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_subs_epu16(self, other);
                }
                else
                {
                    return ssub(self, other, avx {});
                }
            }
        }

        // sub
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_sub_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_sub_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_sub_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_sub_epi64(self, other);
            }
            else
            {
                return sub(self, other, avx {});
            }
        }

        // swizzle (dynamic mask)
        template <class A>
        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
        {
            return _mm256_permutevar8x32_ps(self, mask);
        }

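        // AVX2 has no dynamic 64-bit permute, so the double swizzle below maps
        // each 64-bit index k to the 32-bit index pair (2k, 2k + 1): mask * comb
        // replicates 2k into both 32-bit halves of each qword, and broadcaster
        // adds 1 to the odd halves.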
        template <class A>
        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
        {
            batch<uint32_t, A> broadcaster = { 0, 1, 0, 1, 0, 1, 0, 1 };
            constexpr uint64_t comb = 0x0000000100000001ul * 2;
            return bitwise_cast<double>(swizzle(bitwise_cast<float>(self), bitwise_cast<uint32_t>(mask * comb) + broadcaster, avx2 {}));
        }

        template <class A>
        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<uint64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
        }
        template <class A>
        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<int64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
        }
        template <class A>
        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
        {
            return _mm256_permutevar8x32_epi32(self, mask);
        }
        template <class A>
        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
        }

        // swizzle (constant mask)
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
        {
            return _mm256_permutevar8x32_ps(self, mask.as_batch());
        }

        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
        {
            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
            return _mm256_permute4x64_pd(self, mask);
        }

        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
        {
            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
            return _mm256_permute4x64_epi64(self, mask);
        }
        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
        }
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
        {
            return _mm256_permutevar8x32_epi32(self, mask.as_batch());
        }
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
        }

        // zip_hi
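        // zip_lo/zip_hi interleave elements from the two inputs. The unpack
        // intrinsics interleave within each 128-bit lane, so a cross-lane step
        // follows: zip_hi gathers the two high half-results with
        // _mm256_permute2f128_si256, while zip_lo stitches the two low
        // half-results together with _mm256_inserti128_si256.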
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                auto lo = _mm256_unpacklo_epi8(self, other);
                auto hi = _mm256_unpackhi_epi8(self, other);
                return _mm256_permute2f128_si256(lo, hi, 0x31);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                auto lo = _mm256_unpacklo_epi16(self, other);
                auto hi = _mm256_unpackhi_epi16(self, other);
                return _mm256_permute2f128_si256(lo, hi, 0x31);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                auto lo = _mm256_unpacklo_epi32(self, other);
                auto hi = _mm256_unpackhi_epi32(self, other);
                return _mm256_permute2f128_si256(lo, hi, 0x31);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                auto lo = _mm256_unpacklo_epi64(self, other);
                auto hi = _mm256_unpackhi_epi64(self, other);
                return _mm256_permute2f128_si256(lo, hi, 0x31);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

        // zip_lo
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                auto lo = _mm256_unpacklo_epi8(self, other);
                auto hi = _mm256_unpackhi_epi8(self, other);
                return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                auto lo = _mm256_unpacklo_epi16(self, other);
                auto hi = _mm256_unpackhi_epi16(self, other);
                return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                auto lo = _mm256_unpacklo_epi32(self, other);
                auto hi = _mm256_unpackhi_epi32(self, other);
                return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                auto lo = _mm256_unpacklo_epi64(self, other);
                auto hi = _mm256_unpackhi_epi64(self, other);
                return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
    }
}

#endif