0001 /***************************************************************************
0002  * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
0003  * Martin Renou                                                             *
0004  * Copyright (c) QuantStack                                                 *
0005  * Copyright (c) Serge Guelton                                              *
0006  *                                                                          *
0007  * Distributed under the terms of the BSD 3-Clause License.                 *
0008  *                                                                          *
0009  * The full license is in the file LICENSE, distributed with this software. *
0010  ****************************************************************************/
0011 
0012 #ifndef XSIMD_AVX512F_HPP
0013 #define XSIMD_AVX512F_HPP
0014 
0015 #include <complex>
0016 #include <limits>
0017 #include <type_traits>
0018 
0019 #include "../types/xsimd_avx512f_register.hpp"
0020 
0021 namespace xsimd
0022 {
0023 
0024     namespace kernel
0025     {
0026         using namespace types;
0027 
0028         namespace detail
0029         {
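                 // Helpers to view a 512-bit register as two 256-bit halves and back:
                 // split_avx512 gets the low half with a cast (no extract needed) and the
                 // high half with an extract; merge_avx re-inserts a high half above a low one.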
0030             XSIMD_INLINE void split_avx512(__m512 val, __m256& low, __m256& high) noexcept
0031             {
0032                 low = _mm512_castps512_ps256(val);
0033                 high = _mm512_extractf32x8_ps(val, 1);
0034             }
0035             XSIMD_INLINE void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept
0036             {
0037                 low = _mm512_castpd512_pd256(val);
0038                 high = _mm512_extractf64x4_pd(val, 1);
0039             }
0040             XSIMD_INLINE void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept
0041             {
0042                 low = _mm512_castsi512_si256(val);
0043                 high = _mm512_extracti64x4_epi64(val, 1);
0044             }
0045             XSIMD_INLINE __m512i merge_avx(__m256i low, __m256i high) noexcept
0046             {
0047                 return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1);
0048             }
0049             XSIMD_INLINE __m512 merge_avx(__m256 low, __m256 high) noexcept
0050             {
0051                 return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castpd256_pd512(_mm256_castps_pd(low)), _mm256_castps_pd(high), 1));
0052             }
0053             XSIMD_INLINE __m512d merge_avx(__m256d low, __m256d high) noexcept
0054             {
0055                 return _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1);
0056             }
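                 // AVX512F has no 8/16-bit element operations (those require AVX512BW), so
                 // fwd_to_avx implements them by splitting the 512-bit operand(s) into two
                 // 256-bit halves, applying the given AVX2 kernel to each half and merging the
                 // results. Illustrative use (see abs() below):
                 //     fwd_to_avx([](__m256i s) { return abs(batch<int8_t, avx2>(s)); }, self);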
0057             template <class F>
0058             __m512i fwd_to_avx(F f, __m512i self)
0059             {
0060                 __m256i self_low, self_high;
0061                 split_avx512(self, self_low, self_high);
0062                 __m256i res_low = f(self_low);
0063                 __m256i res_high = f(self_high);
0064                 return merge_avx(res_low, res_high);
0065             }
0066             template <class F>
0067             __m512i fwd_to_avx(F f, __m512i self, __m512i other)
0068             {
0069                 __m256i self_low, self_high, other_low, other_high;
0070                 split_avx512(self, self_low, self_high);
0071                 split_avx512(other, other_low, other_high);
0072                 __m256i res_low = f(self_low, other_low);
0073                 __m256i res_high = f(self_high, other_high);
0074                 return merge_avx(res_low, res_high);
0075             }
0076             template <class F>
0077             __m512i fwd_to_avx(F f, __m512i self, int32_t other)
0078             {
0079                 __m256i self_low, self_high;
0080                 split_avx512(self, self_low, self_high);
0081                 __m256i res_low = f(self_low, other);
0082                 __m256i res_high = f(self_high, other);
0083                 return merge_avx(res_low, res_high);
0084             }
0085         }
0086         namespace detail
0087         {
0088 
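                 // morton interleaves the bits of x (even positions) and y (odd positions),
                 // using the classic 256-entry lookup-table technique (as in "Bit Twiddling
                 // Hacks"). It is used below to rebuild a full comparison mask from two half
                 // masks. Illustrative example: morton(0b01, 0b11) == 0b1011
                 // (bit 0 comes from x, bit 1 from y, and so on).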
0089             XSIMD_INLINE uint32_t morton(uint16_t x, uint16_t y) noexcept
0090             {
0091 
0092                 static const unsigned short MortonTable256[256] = {
0093                     0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015,
0094                     0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055,
0095                     0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115,
0096                     0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155,
0097                     0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415,
0098                     0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455,
0099                     0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515,
0100                     0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555,
0101                     0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015,
0102                     0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055,
0103                     0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115,
0104                     0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155,
0105                     0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415,
0106                     0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455,
0107                     0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515,
0108                     0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555,
0109                     0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015,
0110                     0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055,
0111                     0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115,
0112                     0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155,
0113                     0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415,
0114                     0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455,
0115                     0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515,
0116                     0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555,
0117                     0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015,
0118                     0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055,
0119                     0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115,
0120                     0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155,
0121                     0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415,
0122                     0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455,
0123                     0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515,
0124                     0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555
0125                 };
0126 
0127                 uint32_t z = MortonTable256[y >> 8] << 17 | MortonTable256[x >> 8] << 16 | MortonTable256[y & 0xFF] << 1 | MortonTable256[x & 0xFF];
0128                 return z;
0129             }
0130 
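                 // AVX512F only provides 32/64-bit integer compares (_mm512_cmp_epi32/epi64_mask).
                 // For 8/16-bit element types, each sub-lane is isolated inside a 32-bit lane
                 // (shifted up for signed types so its sign bit lands in bit 31), compared as a
                 // 32-bit lane, and the resulting 16-bit masks are interleaved back into one
                 // per-element mask: bit by bit for 8-bit types, via morton() for 16-bit types.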
0131             template <class A, class T, int Cmp>
0132             XSIMD_INLINE batch_bool<T, A> compare_int_avx512f(batch<T, A> const& self, batch<T, A> const& other) noexcept
0133             {
0134                 using register_type = typename batch_bool<T, A>::register_type;
0135                 if (std::is_signed<T>::value)
0136                 {
0137                     XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0138                     {
0139                         // shifting to take sign into account
0140                         uint64_t mask_low0 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x000000FF)) << 24,
0141                                                                    (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x000000FF)) << 24,
0142                                                                    Cmp);
0143                         uint64_t mask_low1 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FF00)) << 16,
0144                                                                    (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FF00)) << 16,
0145                                                                    Cmp);
0146                         uint64_t mask_high0 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x00FF0000)) << 8,
0147                                                                     (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x00FF0000)) << 8,
0148                                                                     Cmp);
0149                         uint64_t mask_high1 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFF000000)),
0150                                                                     (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFF000000)),
0151                                                                     Cmp);
0152                         uint64_t mask = 0;
0153                         for (unsigned i = 0; i < 16; ++i)
0154                         {
0155                             mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
0156                             mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
0157                             mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
0158                             mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
0159                         }
0160                         return (register_type)mask;
0161                     }
0162                     else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0163                     {
0164                         // shifting to take sign into account
0165                         uint16_t mask_low = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
0166                                                                   (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
0167                                                                   Cmp);
0168                         uint16_t mask_high = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFFFF0000)),
0169                                                                    (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFFFF0000)),
0170                                                                    Cmp);
0171                         return static_cast<register_type>(morton(mask_low, mask_high));
0172                     }
0173                     else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0174                     {
0175                         return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
0176                     }
0177                     else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0178                     {
0179                         return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
0180                     }
0181                 }
0182                 else
0183                 {
0184                     XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0185                     {
0186                         uint64_t mask_low0 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x000000FF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x000000FF)), Cmp);
0187                         uint64_t mask_low1 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FF00)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FF00)), Cmp);
0188                         uint64_t mask_high0 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x00FF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x00FF0000)), Cmp);
0189                         uint64_t mask_high1 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFF000000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFF000000)), Cmp);
0190                         uint64_t mask = 0;
0191                         for (unsigned i = 0; i < 16; ++i)
0192                         {
0193                             mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
0194                             mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
0195                             mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
0196                             mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
0197                         }
0198                         return (register_type)mask;
0199                     }
0200                     else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0201                     {
0202                         uint16_t mask_low = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FFFF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FFFF)), Cmp);
0203                         uint16_t mask_high = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFFFF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFFFF0000)), Cmp);
0204                         return static_cast<register_type>(morton(mask_low, mask_high));
0205                     }
0206                     else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0207                     {
0208                         return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
0209                     }
0210                     else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0211                     {
0212                         return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
0213                     }
0214                 }
0215             }
0216         }
0217 
0218         // abs
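             // Floating-point abs clears the sign bit: the register is reinterpreted as integers
             // and ANDed with 0x7FFFFFFF (0x7FFFFFFFFFFFFFFF for double). Integer abs forwards
             // 8/16-bit types to AVX2 and uses _mm512_abs_epi32/_mm512_abs_epi64 otherwise.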
0219         template <class A>
0220         XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<avx512f>) noexcept
0221         {
0222             __m512 self_asf = (__m512)self;
0223             __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asf);
0224             __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), self_asi);
0225             return *reinterpret_cast<__m512*>(&res_asi);
0226         }
0227         template <class A>
0228         XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<avx512f>) noexcept
0229         {
0230             __m512d self_asd = (__m512d)self;
0231             __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asd);
0232             __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
0233                                                self_asi);
0234             return *reinterpret_cast<__m512d*>(&res_asi);
0235         }
0236         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0237         XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512f>) noexcept
0238         {
0239             if (std::is_unsigned<T>::value)
0240             {
0241                 return self;
0242             }
0243 
0244             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0245             {
0246                 return detail::fwd_to_avx([](__m256i s) noexcept
0247                                           { return abs(batch<T, avx2>(s)); },
0248                                           self);
0249             }
0250             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0251             {
0252                 return detail::fwd_to_avx([](__m256i s) noexcept
0253                                           { return abs(batch<T, avx2>(s)); },
0254                                           self);
0255             }
0256             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0257             {
0258                 return _mm512_abs_epi32(self);
0259             }
0260             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0261             {
0262                 return _mm512_abs_epi64(self);
0263             }
0264             else
0265             {
0266                 assert(false && "unsupported arch/op combination");
0267                 return {};
0268             }
0269         }
0270 
0271         // add
0272         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0273         XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0274         {
0275             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0276             {
0277                 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
0278                                           { return add(batch<T, avx2>(s), batch<T, avx2>(o)); },
0279                                           self, other);
0280             }
0281             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0282             {
0283                 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
0284                                           { return add(batch<T, avx2>(s), batch<T, avx2>(o)); },
0285                                           self, other);
0286             }
0287             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0288             {
0289                 return _mm512_add_epi32(self, other);
0290             }
0291             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0292             {
0293                 return _mm512_add_epi64(self, other);
0294             }
0295             else
0296             {
0297                 assert(false && "unsupported arch/op combination");
0298                 return {};
0299             }
0300         }
0301         template <class A>
0302         XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0303         {
0304             return _mm512_add_ps(self, other);
0305         }
0306         template <class A>
0307         XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0308         {
0309             return _mm512_add_pd(self, other);
0310         }
0311 
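             // On AVX512, batch_bool is backed by an integer bit-mask (k-register) holding one
             // bit per lane, so the reductions and casts below reduce to plain integer
             // operations on that mask.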
0312         // all
0313         template <class A, class T>
0314         XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
0315         {
0316             using register_type = typename batch_bool<T, A>::register_type;
0317             return self.data == register_type(-1);
0318         }
0319 
0320         // any
0321         template <class A, class T>
0322         XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
0323         {
0324             using register_type = typename batch_bool<T, A>::register_type;
0325             return self.data != register_type(0);
0326         }
0327 
0328         // batch_bool_cast
0329         template <class A, class T_out, class T_in>
0330         XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx512f>) noexcept
0331         {
0332             return self.data;
0333         }
0334 
0335         // bitwise_and
0336         template <class A>
0337         XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0338         {
0339 #if defined(_MSC_VER)
0340             return _mm512_and_ps(self, other);
0341 #else
0342             return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
0343 #endif
0344         }
0345         template <class A>
0346         XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0347         {
0348             return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
0349         }
0350 
0351         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0352         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0353         {
0354             return _mm512_and_si512(self, other);
0355         }
0356 
0357         template <class A, class T>
0358         XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
0359         {
0360             using register_type = typename batch_bool<T, A>::register_type;
0361             return register_type(self.data & other.data);
0362         }
0363 
0364         // bitwise_andnot
0365         template <class A>
0366         XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0367         {
0368             return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(other), _mm512_castps_si512(self)));
0369         }
0370         template <class A>
0371         XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0372         {
0373             return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(other), _mm512_castpd_si512(self)));
0374         }
0375 
0376         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0377         XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0378         {
0379             return _mm512_andnot_si512(other, self);
0380         }
0381 
0382         template <class A, class T>
0383         XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
0384         {
0385             using register_type = typename batch_bool<T, A>::register_type;
0386             return register_type(self.data & ~other.data);
0387         }
0388 
0389         // bitwise_lshift
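             // There is no 8-bit shift in AVX512F: 8-bit lanes are shifted as 32-bit lanes and
             // the bits that crossed a byte boundary are masked off with set1_epi8(0xFF << count);
             // 16-bit lanes are forwarded to AVX2. When XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY is
             // defined, the variable-count _mm512_sllv_* forms are used instead of _mm512_slli_*,
             // presumably because the latter then only accepts immediate shift counts.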
0390         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0391         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
0392         {
0393             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0394             {
0395 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
0396                 __m512i tmp = _mm512_sllv_epi32(self, _mm512_set1_epi32(other));
0397 #else
0398                 __m512i tmp = _mm512_slli_epi32(self, other);
0399 #endif
0400                 return _mm512_and_si512(_mm512_set1_epi8(0xFF << other), tmp);
0401             }
0402             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0403             {
0404                 return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
0405                                           { return bitwise_lshift(batch<T, avx2>(s), o, avx2 {}); },
0406                                           self, other);
0407 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
0408             }
0409             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0410             {
0411                 return _mm512_sllv_epi32(self, _mm512_set1_epi32(other));
0412             }
0413             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0414             {
0415                 return _mm512_sllv_epi64(self, _mm512_set1_epi64(other));
0416 #else
0417             }
0418             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0419             {
0420                 return _mm512_slli_epi32(self, other);
0421             }
0422             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0423             {
0424                 return _mm512_slli_epi64(self, other);
0425 #endif
0426             }
0427             else
0428             {
0429                 assert(false && "unsupported arch/op combination");
0430                 return {};
0431             }
0432         }
0433 
0434         // bitwise_not
0435         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0436         XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx512f>) noexcept
0437         {
0438             return _mm512_xor_si512(self, _mm512_set1_epi32(-1));
0439         }
0440         template <class A, class T>
0441         XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
0442         {
0443             using register_type = typename batch_bool<T, A>::register_type;
0444             return register_type(~self.data);
0445         }
0446 
0447         template <class A>
0448         XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
0449         {
0450             return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_set1_epi32(-1)));
0451         }
0452         template <class A>
0453         XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
0454         {
0455             return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_set1_epi32(-1)));
0456         }
0457 
0458         // bitwise_or
0459         template <class A>
0460         XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0461         {
0462             return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
0463         }
0464         template <class A>
0465         XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0466         {
0467             return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
0468         }
0469 
0470         template <class A, class T>
0471         XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
0472         {
0473             using register_type = typename batch_bool<T, A>::register_type;
0474             return register_type(self.data | other.data);
0475         }
0476 
0477         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0478         XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0479         {
0480             return _mm512_or_si512(self, other);
0481         }
0482 
0483         // bitwise_rshift
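             // Same approach as bitwise_lshift: 32/64-bit lanes use the native arithmetic or
             // logical shifts, unsigned 8-bit lanes are shifted as 32-bit lanes and masked with
             // 0xFF >> count, and the remaining small-integer cases fall back to AVX2.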
0484         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0485         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
0486         {
0487             if (std::is_signed<T>::value)
0488             {
0489 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
0490                 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0491                 {
0492                     return _mm512_srav_epi32(self, _mm512_set1_epi32(other));
0493                 }
0494                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0495                 {
0496                     return _mm512_srav_epi64(self, _mm512_set1_epi64(other));
0497 #else
0498                 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0499                 {
0500                     return _mm512_srai_epi32(self, other);
0501                 }
0502                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0503                 {
0504                     return _mm512_srai_epi64(self, other);
0505 #endif
0506                 }
0507                 else
0508                 {
0509                     return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
0510                                               { return bitwise_rshift(batch<T, avx2>(s), o, avx2 {}); },
0511                                               self, other);
0512                 }
0513             }
0514             else
0515             {
0516                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0517                 {
0518 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
0519                     __m512i tmp = _mm512_srlv_epi32(self, _mm512_set1_epi32(other));
0520 #else
0521                     __m512i tmp = _mm512_srli_epi32(self, other);
0522 #endif
0523                     return _mm512_and_si512(_mm512_set1_epi8(0xFF >> other), tmp);
0524 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
0525                 }
0526                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0527                 {
0528                     return _mm512_srlv_epi32(self, _mm512_set1_epi32(other));
0529                 }
0530                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0531                 {
0532                     return _mm512_srlv_epi64(self, _mm512_set1_epi64(other));
0533 #else
0534                 }
0535                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0536                 {
0537                     return _mm512_srli_epi32(self, other);
0538                 }
0539                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0540                 {
0541                     return _mm512_srli_epi64(self, other);
0542 #endif
0543                 }
0544                 else
0545                 {
0546                     return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
0547                                               { return bitwise_rshift(batch<T, avx2>(s), o, avx2 {}); },
0548                                               self, other);
0549                 }
0550             }
0551         }
0552 
0553         // bitwise_xor
0554         template <class A>
0555         XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0556         {
0557             return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
0558         }
0559         template <class A>
0560         XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0561         {
0562             return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
0563         }
0564 
0565         template <class A, class T>
0566         XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
0567         {
0568             using register_type = typename batch_bool<T, A>::register_type;
0569             return register_type(self.data ^ other.data);
0570         }
0571 
0572         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0573         XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0574         {
0575             return _mm512_xor_si512(self, other);
0576         }
0577 
0578         // bitwise_cast
0579         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0580         XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
0581         {
0582             return _mm512_castsi512_ps(self);
0583         }
0584         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0585         XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
0586         {
0587             return _mm512_castsi512_pd(self);
0588         }
0589         template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
0590         XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx512f>) noexcept
0591         {
0592             return batch<Tp, A>(self.data);
0593         }
0594         template <class A>
0595         XSIMD_INLINE batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
0596         {
0597             return _mm512_castps_pd(self);
0598         }
0599         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0600         XSIMD_INLINE batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
0601         {
0602             return _mm512_castps_si512(self);
0603         }
0604         template <class A>
0605         XSIMD_INLINE batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
0606         {
0607             return _mm512_castpd_ps(self);
0608         }
0609         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0610         XSIMD_INLINE batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
0611         {
0612             return _mm512_castpd_si512(self);
0613         }
0614 
0615         // broadcast
0616         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0617         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx512f>) noexcept
0618         {
0619             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0620             {
0621                 return _mm512_set1_epi8(val);
0622             }
0623             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0624             {
0625                 return _mm512_set1_epi16(val);
0626             }
0627             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0628             {
0629                 return _mm512_set1_epi32(val);
0630             }
0631             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0632             {
0633                 return _mm512_set1_epi64(val);
0634             }
0635             else
0636             {
0637                 assert(false && "unsupported");
0638                 return {};
0639             }
0640         }
0641         template <class A>
0642         XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<avx512f>) noexcept
0643         {
0644             return _mm512_set1_ps(val);
0645         }
0646         template <class A>
0647         XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<avx512f>) noexcept
0648         {
0649             return _mm512_set1_pd(val);
0650         }
0651 
0652         // ceil
0653         template <class A>
0654         XSIMD_INLINE batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx512f>) noexcept
0655         {
0656             return _mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF);
0657         }
0658         template <class A>
0659         XSIMD_INLINE batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx512f>) noexcept
0660         {
0661             return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF);
0662         }
0663 
0664         // compress
0665         template <class A>
0666         XSIMD_INLINE batch<float, A> compress(batch<float, A> const& self, batch_bool<float, A> const& mask, requires_arch<avx512f>) noexcept
0667         {
0668             return _mm512_maskz_compress_ps(mask.mask(), self);
0669         }
0670         template <class A>
0671         XSIMD_INLINE batch<double, A> compress(batch<double, A> const& self, batch_bool<double, A> const& mask, requires_arch<avx512f>) noexcept
0672         {
0673             return _mm512_maskz_compress_pd(mask.mask(), self);
0674         }
0675         template <class A>
0676         XSIMD_INLINE batch<int32_t, A> compress(batch<int32_t, A> const& self, batch_bool<int32_t, A> const& mask, requires_arch<avx512f>) noexcept
0677         {
0678             return _mm512_maskz_compress_epi32(mask.mask(), self);
0679         }
0680         template <class A>
0681         XSIMD_INLINE batch<uint32_t, A> compress(batch<uint32_t, A> const& self, batch_bool<uint32_t, A> const& mask, requires_arch<avx512f>) noexcept
0682         {
0683             return _mm512_maskz_compress_epi32(mask.mask(), self);
0684         }
0685         template <class A>
0686         XSIMD_INLINE batch<int64_t, A> compress(batch<int64_t, A> const& self, batch_bool<int64_t, A> const& mask, requires_arch<avx512f>) noexcept
0687         {
0688             return _mm512_maskz_compress_epi64(mask.mask(), self);
0689         }
0690         template <class A>
0691         XSIMD_INLINE batch<uint64_t, A> compress(batch<uint64_t, A> const& self, batch_bool<uint64_t, A> const& mask, requires_arch<avx512f>) noexcept
0692         {
0693             return _mm512_maskz_compress_epi64(mask.mask(), self);
0694         }
0695 
0696         // convert
0697         namespace detail
0698         {
0699             template <class A>
0700             XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
0701             {
0702                 return _mm512_cvtepi32_ps(self);
0703             }
0704 
0705             template <class A>
0706             XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx512f>) noexcept
0707             {
0708                 return _mm512_cvttps_epi32(self);
0709             }
0710 
0711             template <class A>
0712             XSIMD_INLINE batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
0713             {
0714                 return _mm512_cvtepu32_ps(self);
0715             }
0716 
0717             template <class A>
0718             XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<avx512f>) noexcept
0719             {
0720                 return _mm512_cvttps_epu32(self);
0721             }
0722         }
0723 
0724         namespace detail
0725         {
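                 // complex_low / complex_high interleave the separately stored real and imaginary
                 // parts back into (re, im) pairs: _mm512_permutex2var_* selects from self.real()
                 // for indices below 16 (8 for double) and from self.imag() otherwise, producing
                 // the low and high halves of the interleaved layout.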
0726             // complex_low
0727             template <class A>
0728             XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
0729             {
0730                 __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
0731                 return _mm512_permutex2var_ps(self.real(), idx, self.imag());
0732             }
0733             template <class A>
0734             XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
0735             {
0736                 __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11);
0737                 return _mm512_permutex2var_pd(self.real(), idx, self.imag());
0738             }
0739 
0740             // complex_high
0741             template <class A>
0742             XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
0743             {
0744                 __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
0745                 return _mm512_permutex2var_ps(self.real(), idx, self.imag());
0746             }
0747             template <class A>
0748             XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
0749             {
0750                 __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15);
0751                 return _mm512_permutex2var_pd(self.real(), idx, self.imag());
0752             }
0753         }
0754 
0755         // div
0756         template <class A>
0757         XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0758         {
0759             return _mm512_div_ps(self, other);
0760         }
0761         template <class A>
0762         XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0763         {
0764             return _mm512_div_pd(self, other);
0765         }
0766 
0767         // eq
0768         template <class A>
0769         XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0770         {
0771             return _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ);
0772         }
0773         template <class A>
0774         XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0775         {
0776             return _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ);
0777         }
0778 
0779         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0780         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0781         {
0782             return detail::compare_int_avx512f<A, T, _MM_CMPINT_EQ>(self, other);
0783         }
0784         template <class A, class T>
0785         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
0786         {
0787             using register_type = typename batch_bool<T, A>::register_type;
0788             return register_type(~self.data ^ other.data);
0789         }
0790 
0791         // expand
0792         template <class A>
0793         XSIMD_INLINE batch<float, A> expand(batch<float, A> const& self, batch_bool<float, A> const& mask, requires_arch<avx512f>) noexcept
0794         {
0795             return _mm512_maskz_expand_ps(mask.mask(), self);
0796         }
0797         template <class A>
0798         XSIMD_INLINE batch<double, A> expand(batch<double, A> const& self, batch_bool<double, A> const& mask, requires_arch<avx512f>) noexcept
0799         {
0800             return _mm512_maskz_expand_pd(mask.mask(), self);
0801         }
0802         template <class A>
0803         XSIMD_INLINE batch<int32_t, A> expand(batch<int32_t, A> const& self, batch_bool<int32_t, A> const& mask, requires_arch<avx512f>) noexcept
0804         {
0805             return _mm512_maskz_expand_epi32(mask.mask(), self);
0806         }
0807         template <class A>
0808         XSIMD_INLINE batch<uint32_t, A> expand(batch<uint32_t, A> const& self, batch_bool<uint32_t, A> const& mask, requires_arch<avx512f>) noexcept
0809         {
0810             return _mm512_maskz_expand_epi32(mask.mask(), self);
0811         }
0812         template <class A>
0813         XSIMD_INLINE batch<int64_t, A> expand(batch<int64_t, A> const& self, batch_bool<int64_t, A> const& mask, requires_arch<avx512f>) noexcept
0814         {
0815             return _mm512_maskz_expand_epi64(mask.mask(), self);
0816         }
0817         template <class A>
0818         XSIMD_INLINE batch<uint64_t, A> expand(batch<uint64_t, A> const& self, batch_bool<uint64_t, A> const& mask, requires_arch<avx512f>) noexcept
0819         {
0820             return _mm512_maskz_expand_epi64(mask.mask(), self);
0821         }
0822 
0823         // floor
0824         template <class A>
0825         XSIMD_INLINE batch<float, A> floor(batch<float, A> const& self, requires_arch<avx512f>) noexcept
0826         {
0827             return _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF);
0828         }
0829         template <class A>
0830         XSIMD_INLINE batch<double, A> floor(batch<double, A> const& self, requires_arch<avx512f>) noexcept
0831         {
0832             return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF);
0833         }
0834 
0835         // fnma
0836         template <class A>
0837         XSIMD_INLINE batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
0838         {
0839             return _mm512_fnmadd_ps(x, y, z);
0840         }
0841 
0842         template <class A>
0843         XSIMD_INLINE batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
0844         {
0845             return _mm512_fnmadd_pd(x, y, z);
0846         }
0847 
0848         // fma
0849         template <class A>
0850         XSIMD_INLINE batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
0851         {
0852             return _mm512_fmadd_ps(x, y, z);
0853         }
0854 
0855         template <class A>
0856         XSIMD_INLINE batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
0857         {
0858             return _mm512_fmadd_pd(x, y, z);
0859         }
0860 
0861         // fms
0862         template <class A>
0863         XSIMD_INLINE batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
0864         {
0865             return _mm512_fmsub_ps(x, y, z);
0866         }
0867 
0868         template <class A>
0869         XSIMD_INLINE batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
0870         {
0871             return _mm512_fmsub_pd(x, y, z);
0872         }
0873 
0874         // from bool
0875         template <class A, class T>
0876         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
0877         {
0878             return select(self, batch<T, A>(1), batch<T, A>(0));
0879         }
0880 
0881         // from_mask
0882         template <class T, class A>
0883         XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx512f>) noexcept
0884         {
0885             return static_cast<typename batch_bool<T, A>::register_type>(mask);
0886         }
0887 
0888         // gather
0889         template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
0890         XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
0891                                         kernel::requires_arch<avx512f>) noexcept
0892         {
0893             return _mm512_i32gather_epi32(index, static_cast<const void*>(src), sizeof(T));
0894         }
0895 
0896         template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
0897         XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
0898                                         kernel::requires_arch<avx512f>) noexcept
0899         {
0900             return _mm512_i64gather_epi64(index, static_cast<const void*>(src), sizeof(T));
0901         }
0902 
0903         template <class A, class U, detail::enable_sized_integral_t<U, 4> = 0>
0904         XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, float const* src,
0905                                             batch<U, A> const& index,
0906                                             kernel::requires_arch<avx512f>) noexcept
0907         {
0908             return _mm512_i32gather_ps(index, src, sizeof(float));
0909         }
0910 
0911         template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
0912         XSIMD_INLINE batch<double, A>
0913         gather(batch<double, A> const&, double const* src, batch<U, A> const& index,
0914                kernel::requires_arch<avx512f>) noexcept
0915         {
0916             return _mm512_i64gather_pd(index, src, sizeof(double));
0917         }
0918 
0919         // gather: handmade conversions
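             // Gathering double data into a float/int32 batch is done by hand: the 16 32-bit
             // indices are split into two halves, each half gathers 8 doubles with
             // _mm512_i32gather_pd, and the two results are converted down and merged back into
             // a single 512-bit register.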
0920         template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
0921         XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, double const* src,
0922                                             batch<V, A> const& index,
0923                                             requires_arch<avx512f>) noexcept
0924         {
0925             const batch<double, A> low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double)));
0926             const batch<double, A> high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double)));
0927             return detail::merge_avx(_mm512_cvtpd_ps(low.data), _mm512_cvtpd_ps(high.data));
0928         }
0929 
0930         template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
0931         XSIMD_INLINE batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
0932                                               batch<V, A> const& index,
0933                                               requires_arch<avx512f>) noexcept
0934         {
0935             const batch<double, A> low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double)));
0936             const batch<double, A> high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double)));
0937             return detail::merge_avx(_mm512_cvtpd_epi32(low.data), _mm512_cvtpd_epi32(high.data));
0938         }
0939 
0940         // ge
0941         template <class A>
0942         XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0943         {
0944             return _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ);
0945         }
0946         template <class A>
0947         XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0948         {
0949             return _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ);
0950         }
0951         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0952         XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0953         {
0954             return detail::compare_int_avx512f<A, T, _MM_CMPINT_GE>(self, other);
0955         }
0956 
0957         // gt
0958         template <class A>
0959         XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0960         {
0961             return _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ);
0962         }
0963         template <class A>
0964         XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0965         {
0966             return _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ);
0967         }
0968         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0969         XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0970         {
0971             return detail::compare_int_avx512f<A, T, _MM_CMPINT_GT>(self, other);
0972         }
0973 
0974         // haddp
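             // haddp computes, for each i, the horizontal sum of row[i] and packs the 16 sums
             // into one result vector. The macro steps below repeatedly fold pairs of rows with
             // cross-lane shuffles and adds until one AVX register per 8 rows remains, which is
             // finished with _mm256_hadd_ps and re-assembled into a 512-bit register.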
0975         template <class A>
0976         XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512f>) noexcept
0977         {
0978             // The following folds over the vector once:
0979             // tmp1 = [a0..8, b0..8]
0980             // tmp2 = [a8..f, b8..f]
0981 #define XSIMD_AVX512_HADDP_STEP1(I, a, b)                                \
0982     batch<float, avx512f> res##I;                                        \
0983     {                                                                    \
0984         auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
0985         auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
0986         res##I = _mm512_add_ps(tmp1, tmp2);                              \
0987     }
0988 
0989             XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
0990             XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
0991             XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
0992             XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
0993             XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
0994             XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
0995             XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
0996             XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);
0997 
0998 #undef XSIMD_AVX512_HADDP_STEP1
0999 
1000             // The following folds the rows and shuffles them so that hadd_ps produces the correct result
1001             // tmp1 = [a0..4,  a8..12,  b0..4,  b8..12] (same for tmp3)
1002             // tmp2 = [a5..8, a12..16, b5..8, b12..16]  (same for tmp4)
1003             // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
1004 #define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d)                                                                                                         \
1005     batch<float, avx2> halfx##I;                                                                                                                        \
1006     {                                                                                                                                                   \
1007         auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0));                                                                                \
1008         auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1));                                                                                \
1009                                                                                                                                                         \
1010         auto resx1 = _mm512_add_ps(tmp1, tmp2);                                                                                                         \
1011                                                                                                                                                         \
1012         auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0));                                                                                \
1013         auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1));                                                                                \
1014                                                                                                                                                         \
1015         auto resx2 = _mm512_add_ps(tmp3, tmp4);                                                                                                         \
1016                                                                                                                                                         \
1017         auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0));                                                                           \
1018         auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1));                                                                           \
1019                                                                                                                                                         \
1020         auto resx3 = _mm512_add_ps(tmp5, tmp6);                                                                                                         \
1021                                                                                                                                                         \
1022         halfx##I = _mm256_hadd_ps(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 0)), _mm512_extractf32x4_ps(resx3, 1), 1),  \
1023                                   _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 2)), _mm512_extractf32x4_ps(resx3, 3), 1)); \
1024     }
1025 
1026             XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
1027             XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);
1028 
1029 #undef XSIMD_AVX512_HADDP_STEP2
1030 
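            // Merge the two 256-bit partial results: halfx0 becomes the lower half
            // of the 512-bit result and halfx1 the upper half.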
1031             auto concat = _mm512_castps256_ps512(halfx0);
1032             concat = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(concat), _mm256_castps_pd(halfx1), 1));
1033             return concat;
1034         }
1035 
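        // haddp for double: element i of the result must hold the sum of the eight
        // elements of row[i]. AVX512F has no 512-bit horizontal add, so the rows are
        // combined pairwise with 128-bit lane shuffles (shuffle_f64x2) and in-lane
        // shuffles (shuffle_pd), adding after each shuffle step.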
1036         template <class A>
1037         XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx512f>) noexcept
1038         {
1039 #define step1(I, a, b)                                                   \
1040     batch<double, avx512f> res##I;                                       \
1041     {                                                                    \
1042         auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
1043         auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
1044         res##I = _mm512_add_pd(tmp1, tmp2);                              \
1045     }
1046 
1047             step1(1, row[0], row[2]);
1048             step1(2, row[4], row[6]);
1049             step1(3, row[1], row[3]);
1050             step1(4, row[5], row[7]);
1051 
1052 #undef step1
1053 
1054             auto tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0));
1055             auto tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1));
1056 
1057             auto resx1 = _mm512_add_pd(tmp5, tmp6);
1058 
1059             auto tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0));
1060             auto tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1));
1061 
1062             auto resx2 = _mm512_add_pd(tmp7, tmp8);
1063 
1064             auto tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000);
1065             auto tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111);
1066 
1067             return _mm512_add_pd(tmpx, tmpy);
1068         }
1069 
1070         // isnan
1071         template <class A>
1072         XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx512f>) noexcept
1073         {
1074             return _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q);
1075         }
1076         template <class A>
1077         XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx512f>) noexcept
1078         {
1079             return _mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q);
1080         }
1081 
1082         // ldexp
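        // The scalef instructions compute self * 2^floor(exp) directly, so the
        // integer exponent only needs to be converted to the matching
        // floating-point type.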
1083         template <class A>
1084         XSIMD_INLINE batch<float, A> ldexp(const batch<float, A>& self, const batch<as_integer_t<float>, A>& other, requires_arch<avx512f>) noexcept
1085         {
1086             return _mm512_scalef_ps(self, _mm512_cvtepi32_ps(other));
1087         }
1088 
1089         template <class A>
1090         XSIMD_INLINE batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512f>) noexcept
1091         {
1092             // FIXME: potential data loss here when converting other elements to
1093             // int32 before converting them back to double.
1094             __m512d adjusted_index = _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(other));
1095             return _mm512_scalef_pd(self, adjusted_index);
1096         }
1097 
1098         // le
1099         template <class A>
1100         XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1101         {
1102             return _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ);
1103         }
1104         template <class A>
1105         XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1106         {
1107             return _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ);
1108         }
1109         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1110         XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1111         {
1112             return detail::compare_int_avx512f<A, T, _MM_CMPINT_LE>(self, other);
1113         }
1114 
1115         // load_aligned
1116         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1117         XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
1118         {
1119             return _mm512_load_si512((__m512i const*)mem);
1120         }
1121         template <class A>
1122         XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
1123         {
1124             return _mm512_load_ps(mem);
1125         }
1126         template <class A>
1127         XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
1128         {
1129             return _mm512_load_pd(mem);
1130         }
1131 
1132         // load_complex
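        // The two input registers hold interleaved (real, imag) pairs; permutex2var
        // gathers the even-indexed elements of the concatenated inputs as the real
        // parts and the odd-indexed elements as the imaginary parts.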
1133         namespace detail
1134         {
1135             template <class A>
1136             XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx512f>) noexcept
1137             {
1138                 __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
1139                 __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
1140                 auto real = _mm512_permutex2var_ps(hi, real_idx, lo);
1141                 auto imag = _mm512_permutex2var_ps(hi, imag_idx, lo);
1142                 return { real, imag };
1143             }
1144             template <class A>
1145             XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx512f>) noexcept
1146             {
1147                 __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14);
1148                 __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15);
1149                 auto real = _mm512_permutex2var_pd(hi, real_idx, lo);
1150                 auto imag = _mm512_permutex2var_pd(hi, imag_idx, lo);
1151                 return { real, imag };
1152             }
1153         }
1154 
1155         // load_unaligned
1156         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1157         XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
1158         {
1159             return _mm512_loadu_si512((__m512i const*)mem);
1160         }
1161         template <class A>
1162         XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
1163         {
1164             return _mm512_loadu_ps(mem);
1165         }
1166         template <class A>
1167         XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
1168         {
1169             return _mm512_loadu_pd(mem);
1170         }
1171 
1172         // lt
1173         template <class A>
1174         XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1175         {
1176             return _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ);
1177         }
1178         template <class A>
1179         XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1180         {
1181             return _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ);
1182         }
1183 
1184         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1185         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1186         {
1187             return detail::compare_int_avx512f<A, T, _MM_CMPINT_LT>(self, other);
1188         }
1189 
1190         // mask
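        // An AVX512 batch_bool is already backed by a k-mask register, so the
        // bitmask is just its raw value.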
1191         template <class A, class T>
1192         XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
1193         {
1194             return self.data;
1195         }
1196 
1197         // max
1198         template <class A>
1199         XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1200         {
1201             return _mm512_max_ps(self, other);
1202         }
1203         template <class A>
1204         XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1205         {
1206             return _mm512_max_pd(self, other);
1207         }
1208         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1209         XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1210         {
1211             if (std::is_signed<T>::value)
1212             {
1213                 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1214                 {
1215                     return _mm512_max_epi32(self, other);
1216                 }
1217                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1218                 {
1219                     return _mm512_max_epi64(self, other);
1220                 }
1221                 else
1222                 {
1223                     return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1224                                               { return max(batch<T, avx2>(s), batch<T, avx2>(o)); },
1225                                               self, other);
1226                 }
1227             }
1228             else
1229             {
1230                 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1231                 {
1232                     return _mm512_max_epu32(self, other);
1233                 }
1234                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1235                 {
1236                     return _mm512_max_epu64(self, other);
1237                 }
1238                 else
1239                 {
1240                     return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1241                                               { return max(batch<T, avx2>(s), batch<T, avx2>(o)); },
1242                                               self, other);
1243                 }
1244             }
1245         }
1246 
1247         // min
1248         template <class A>
1249         XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1250         {
1251             return _mm512_min_ps(self, other);
1252         }
1253         template <class A>
1254         XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1255         {
1256             return _mm512_min_pd(self, other);
1257         }
1258         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1259         XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1260         {
1261             if (std::is_signed<T>::value)
1262             {
1263                 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1264                 {
1265                     return _mm512_min_epi32(self, other);
1266                 }
1267                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1268                 {
1269                     return _mm512_min_epi64(self, other);
1270                 }
1271                 else
1272                 {
1273                     return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1274                                               { return min(batch<T, avx2>(s), batch<T, avx2>(o)); },
1275                                               self, other);
1276                 }
1277             }
1278             else
1279             {
1280                 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1281                 {
1282                     return _mm512_min_epu32(self, other);
1283                 }
1284                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1285                 {
1286                     return _mm512_min_epu64(self, other);
1287                 }
1288                 else
1289                 {
1290                     return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1291                                               { return min(batch<T, avx2>(s), batch<T, avx2>(o)); },
1292                                               self, other);
1293                 }
1294             }
1295         }
1296 
1297         // mul
1298         template <class A>
1299         XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1300         {
1301             return _mm512_mul_ps(self, other);
1302         }
1303         template <class A>
1304         XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1305         {
1306             return _mm512_mul_pd(self, other);
1307         }
1308         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1309         XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1310         {
1311             XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1312             {
1313                 return _mm512_mullo_epi32(self, other);
1314             }
1315             else
1316             {
1317                 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1318                                           { return mul(batch<T, avx2>(s), batch<T, avx2>(o)); },
1319                                           self, other);
1320             }
1321         }
1322 
1323         // nearbyint
1324         template <class A>
1325         XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx512f>) noexcept
1326         {
1327             return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
1328         }
1329         template <class A>
1330         XSIMD_INLINE batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx512f>) noexcept
1331         {
1332             return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
1333         }
1334 
1335         // nearbyint_as_int
1336         template <class A>
1337         XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
1338                                                         requires_arch<avx512f>) noexcept
1339         {
1340             return _mm512_cvtps_epi32(self);
1341         }
1342 
1343         // neg
1344         template <class A, class T>
1345         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1346         {
1347             return 0 - self;
1348         }
1349 
1350         // neq
1351         template <class A>
1352         XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1353         {
1354             return _mm512_cmp_ps_mask(self, other, _CMP_NEQ_UQ);
1355         }
1356         template <class A>
1357         XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1358         {
1359             return _mm512_cmp_pd_mask(self, other, _CMP_NEQ_UQ);
1360         }
1361         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1362         XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1363         {
1364             return ~(self == other);
1365         }
1366 
1367         template <class A, class T>
1368         XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
1369         {
1370             using register_type = typename batch_bool<T, A>::register_type;
1371             return register_type(self.data ^ other.data);
1372         }
1373 
1374         // reciprocal
1375         template <class A>
1376         XSIMD_INLINE batch<float, A>
1377         reciprocal(batch<float, A> const& self,
1378                    kernel::requires_arch<avx512f>) noexcept
1379         {
1380             return _mm512_rcp14_ps(self);
1381         }
1382 
1383         template <class A>
1384         XSIMD_INLINE batch<double, A>
1385         reciprocal(batch<double, A> const& self,
1386                    kernel::requires_arch<avx512f>) noexcept
1387         {
1388             return _mm512_rcp14_pd(self);
1389         }
1390 
1391         // reduce_add
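        // The 512-bit register is split into halves or quarters, the pieces are
        // added together, and the final reduction is delegated to the narrower
        // SSE/AVX2 kernels.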
1392         template <class A>
1393         XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
1394         {
1395             __m128 tmp1 = _mm512_extractf32x4_ps(rhs, 0);
1396             __m128 tmp2 = _mm512_extractf32x4_ps(rhs, 1);
1397             __m128 tmp3 = _mm512_extractf32x4_ps(rhs, 2);
1398             __m128 tmp4 = _mm512_extractf32x4_ps(rhs, 3);
1399             __m128 res1 = _mm_add_ps(tmp1, tmp2);
1400             __m128 res2 = _mm_add_ps(tmp3, tmp4);
1401             __m128 res3 = _mm_add_ps(res1, res2);
1402             return reduce_add(batch<float, sse4_2>(res3), sse4_2 {});
1403         }
1404         template <class A>
1405         XSIMD_INLINE double reduce_add(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
1406         {
1407             __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1);
1408             __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0);
1409             __m256d res1 = _mm256_add_pd(tmp1, tmp2);
1410             return reduce_add(batch<double, avx2>(res1), avx2 {});
1411         }
1412         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1413         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1414         {
1415             __m256i low, high;
1416             detail::split_avx512(self, low, high);
1417             batch<T, avx2> blow(low), bhigh(high);
1418             return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {});
1419         }
1420 
1421         // reduce_max
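        // For 8-bit elements, a 64-bit permute brings the upper 256-bit half
        // (qwords 4..7) down onto the lower half, the two halves are combined
        // element-wise, and the remaining 256-bit reduction is delegated to the
        // AVX2 kernel (reduce_min below uses the same scheme).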
1422         template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
1423         XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1424         {
1425             constexpr batch_constant<uint64_t, A, 4, 5, 6, 7, 0, 0, 0, 0> mask;
1426             batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self);
1427             batch<T, A> acc = max(self, step);
1428             __m256i low = _mm512_castsi512_si256(acc);
1429             return reduce_max(batch<T, avx2>(low));
1430         }
1431 
1432         // reduce_min
1433         template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
1434         XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1435         {
1436             constexpr batch_constant<uint64_t, A, 4, 5, 6, 7, 0, 0, 0, 0> mask;
1437             batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self);
1438             batch<T, A> acc = min(self, step);
1439             __m256i low = _mm512_castsi512_si256(acc);
1440             return reduce_min(batch<T, avx2>(low));
1441         }
1442 
1443         // rsqrt
1444         template <class A>
1445         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
1446         {
1447             return _mm512_rsqrt14_ps(val);
1448         }
1449         template <class A>
1450         XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
1451         {
1452             return _mm512_rsqrt14_pd(val);
1453         }
1454 
1455         // sadd
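        // Saturating add is emulated: for unsigned types the addend is clamped so
        // that self + other cannot exceed the maximum; for signed types the
        // clamping bound is chosen per element from the sign of `other`.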
1456         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1457         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1458         {
1459             if (std::is_signed<T>::value)
1460             {
1461                 auto mask = other < 0;
1462                 auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
1463                 auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
1464                 return other + select(mask, self_neg_branch, self_pos_branch);
1465             }
1466             else
1467             {
1468                 const auto diffmax = std::numeric_limits<T>::max() - self;
1469                 const auto mindiff = min(diffmax, other);
1470                 return self + mindiff;
1471             }
1472         }
1473 
1474         // scatter
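        // Maps directly onto the i32/i64 scatter instructions, using a byte scale
        // of sizeof(T).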
1475         template <class A, class T,
1476                   class = typename std::enable_if<std::is_same<uint32_t, T>::value || std::is_same<int32_t, T>::value, void>::type>
1477         XSIMD_INLINE void scatter(batch<T, A> const& src, T* dst,
1478                                   batch<int32_t, A> const& index,
1479                                   kernel::requires_arch<avx512f>) noexcept
1480         {
1481             _mm512_i32scatter_epi32(dst, index, src, sizeof(T));
1482         }
1483 
1484         template <class A, class T,
1485                   class = typename std::enable_if<std::is_same<uint64_t, T>::value || std::is_same<int64_t, T>::value, void>::type>
1486         XSIMD_INLINE void scatter(batch<T, A> const& src, T* dst,
1487                                   batch<int64_t, A> const& index,
1488                                   kernel::requires_arch<avx512f>) noexcept
1489         {
1490             _mm512_i64scatter_epi64(dst, index, src, sizeof(T));
1491         }
1492 
1493         template <class A>
1494         XSIMD_INLINE void scatter(batch<float, A> const& src, float* dst,
1495                                   batch<int32_t, A> const& index,
1496                                   kernel::requires_arch<avx512f>) noexcept
1497         {
1498             _mm512_i32scatter_ps(dst, index, src, sizeof(float));
1499         }
1500 
1501         template <class A>
1502         XSIMD_INLINE void scatter(batch<double, A> const& src, double* dst,
1503                                   batch<int64_t, A> const& index,
1504                                   kernel::requires_arch<avx512f>) noexcept
1505         {
1506             _mm512_i64scatter_pd(dst, index, src, sizeof(double));
1507         }
1508 
1509         // select
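        // 32- and 64-bit elements use the native mask blends. AVX512F has no 8- or
        // 16-bit mask blend, so those cases expand the k-mask into a vector mask
        // and dispatch the two 256-bit halves to the AVX2 kernel.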
1510         template <class A>
1511         XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx512f>) noexcept
1512         {
1513             return _mm512_mask_blend_ps(cond, false_br, true_br);
1514         }
1515         template <class A>
1516         XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx512f>) noexcept
1517         {
1518             return _mm512_mask_blend_pd(cond, false_br, true_br);
1519         }
1520 
1521         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1522         XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
1523         {
1524             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1525             {
1526                 alignas(avx2::alignment()) uint8_t buffer[64];
1527                 // FIXME: ultra inefficient
1528                 for (int i = 0; i < 64; ++i)
1529                     buffer[i] = cond.data & (1ull << i) ? 0xFF : 0;
1530                 __m256i cond_low = batch<uint8_t, avx2>::load_aligned(&buffer[0]);
1531                 __m256i cond_hi = batch<uint8_t, avx2>::load_aligned(&buffer[32]);
1532 
1533                 __m256i true_low, true_hi;
1534                 detail::split_avx512(true_br, true_low, true_hi);
1535 
1536                 __m256i false_low, false_hi;
1537                 detail::split_avx512(false_br, false_low, false_hi);
1538 
1539                 __m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
1540                 __m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
1541                 return detail::merge_avx(res_low, res_hi);
1542             }
1543             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1544             {
1545                 __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0));
1546                 __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0));
1547 
1548                 __m256i true_low, true_hi;
1549                 detail::split_avx512(true_br, true_low, true_hi);
1550 
1551                 __m256i false_low, false_hi;
1552                 detail::split_avx512(false_br, false_low, false_hi);
1553 
1554                 __m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
1555                 __m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
1556                 return detail::merge_avx(res_low, res_hi);
1557             }
1558             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1559             {
1560                 return _mm512_mask_blend_epi32(cond, false_br, true_br);
1561             }
1562             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1563             {
1564                 return _mm512_mask_blend_epi64(cond, false_br, true_br);
1565             }
1566             else
1567             {
1568                 assert(false && "unsupported arch/type combination");
1569                 return {};
1570             }
1571         }
1572 
1573         template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1574         XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
1575         {
1576             return select(batch_bool<T, A> { Values... }, true_br, false_br, avx512f {});
1577         }
1578 
1579         namespace detail
1580         {
1581             template <class T>
1582             using enable_signed_integer_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value,
1583                                                                     int>::type;
1584 
1585             template <class T>
1586             using enable_unsigned_integer_t = typename std::enable_if<std::is_integral<T>::value && std::is_unsigned<T>::value,
1587                                                                       int>::type;
1588         }
1589 
1590         // set
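        // 32- and 64-bit lanes use the native set/setr intrinsics. For 8- and
        // 16-bit lanes a compiler vector literal is used on GCC/Clang and the
        // _mm512_set_epi8/_mm512_set_epi16 intrinsics elsewhere; note that those
        // intrinsics take their arguments highest element first.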
1591         template <class A>
1592         XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<avx512f>, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) noexcept
1593         {
1594             return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
1595         }
1596 
1597         template <class A>
1598         XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<avx512f>, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) noexcept
1599         {
1600             return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7);
1601         }
1602         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1603         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
1604         {
1605             return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0);
1606         }
1607         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1608         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1609                                      T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
1610         {
1611             return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
1612         }
1613         template <class A, class T, detail::enable_signed_integer_t<T> = 0>
1614         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1615                                      T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1616                                      T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
1617                                      T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
1618         {
1619 #if defined(__clang__) || __GNUC__
1620             return __extension__(__m512i)(__v32hi) {
1621                 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
1622                 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
1623             };
1624 #else
1625             return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
1626                                     v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
1627 #endif
1628         }
1629 
1630         template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
1631         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1632                                      T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1633                                      T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
1634                                      T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
1635         {
1636 #if defined(__clang__) || __GNUC__
1637             return __extension__(__m512i)(__v32hu) {
1638                 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
1639                 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
1640             };
1641 #else
1642             return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
1643                                     v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
1644 #endif
1645         }
1646 
1647         template <class A, class T, detail::enable_signed_integer_t<T> = 0>
1648         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1649                                      T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1650                                      T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
1651                                      T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
1652                                      T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
1653                                      T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
1654                                      T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
1655                                      T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
1656         {
1657 
1658 #if defined(__clang__) || __GNUC__
1659             return __extension__(__m512i)(__v64qi) {
1660                 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
1661                 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
1662                 v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
1663                 v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
1664             };
1665 #else
1666             return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
1667                                    v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
1668                                    v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
1669                                    v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
1670 #endif
1671         }
1672         template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
1673         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1674                                      T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1675                                      T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
1676                                      T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
1677                                      T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
1678                                      T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
1679                                      T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
1680                                      T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
1681         {
1682 
1683 #if defined(__clang__) || __GNUC__
1684             return __extension__(__m512i)(__v64qu) {
1685                 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
1686                 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
1687                 v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
1688                 v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
1689             };
1690 #else
1691             return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
1692                                    v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
1693                                    v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
1694                                    v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
1695 #endif
1696         }
1697 
1698         template <class A, class T, class... Values>
1699         XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx512f>, Values... values) noexcept
1700         {
1701             static_assert(sizeof...(Values) == batch_bool<T, A>::size, "consistent init");
1702             using register_type = typename batch_bool<T, A>::register_type;
1703             register_type r = 0;
1704             unsigned shift = 0;
1705             (void)std::initializer_list<register_type> { (r |= register_type(values ? 1 : 0) << (shift++))... };
1706             return r;
1707         }
1708 
1709         // shuffle
1710         template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7, ITy I8, ITy I9, ITy I10, ITy I11, ITy I12, ITy I13, ITy I14, ITy I15>
1711         XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y,
1712                                              batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15> mask,
1713                                              requires_arch<avx512f>) noexcept
1714         {
1715             constexpr uint32_t smask = (I0 & 0x3) | ((I1 & 0x3) << 2) | ((I2 & 0x3) << 4) | ((I3 & 0x3) << 6);
1716 
1717             // shuffle within lane
1718             if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I0 < 4 && I1 < 4 && I2 >= 16 && I2 < 20 && I3 >= 16 && I3 < 20)
1719                 return _mm512_shuffle_ps(x, y, smask);
1720 
1721             // shuffle within opposite lane
1722             if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I2 < 4 && I3 < 4 && I0 >= 16 && I0 < 20 && I1 >= 16 && I1 < 20)
1723                 return _mm512_shuffle_ps(y, x, smask);
1724 
1725             return shuffle(x, y, mask, generic {});
1726         }
1727 
1728         template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
1729         XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx512f>) noexcept
1730         {
1731             constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3) | ((I4 & 0x1) << 4) | ((I5 & 0x1) << 5) | ((I6 & 0x1) << 6) | ((I7 & 0x1) << 7);
1732             // shuffle within lane
1733             if (I0 < 2 && I1 >= 8 && I1 < 10 && I2 >= 2 && I2 < 4 && I3 >= 10 && I3 < 12 && I4 >= 4 && I4 < 6 && I5 >= 12 && I5 < 14 && I6 >= 6 && I6 < 8 && I7 >= 14)
1734                 return _mm512_shuffle_pd(x, y, smask);
1735 
1736             // shuffle within opposite lane
1737             if (I1 < 2 && I0 >= 8 && I0 < 10 && I3 >= 2 && I3 < 4 && I2 >= 10 && I2 < 12 && I5 >= 4 && I5 < 6 && I4 >= 12 && I4 < 14 && I7 >= 6 && I7 < 8 && I6 >= 14)
1738                 return _mm512_shuffle_pd(y, x, smask);
1739 
1740             return shuffle(x, y, mask, generic {});
1741         }
1742 
1743         // slide_left
1744         template <size_t N, class A, class T>
1745         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const&, requires_arch<avx512f>) noexcept
1746         {
1747             static_assert(N == 0xDEAD, "not implemented yet");
1748             return {};
1749         }
1750 
1751         // slide_right
1752         template <size_t N, class A, class T>
1753         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const&, requires_arch<avx512f>) noexcept
1754         {
1755             static_assert(N == 0xDEAD, "not implemented yet");
1756             return {};
1757         }
1758 
1759         // sqrt
1760         template <class A>
1761         XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
1762         {
1763             return _mm512_sqrt_ps(val);
1764         }
1765         template <class A>
1766         XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
1767         {
1768             return _mm512_sqrt_pd(val);
1769         }
1770 
1771         // ssub
1772         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1773         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1774         {
1775             if (std::is_signed<T>::value)
1776             {
1777                 return sadd(self, -other);
1778             }
1779             else
1780             {
1781                 const auto diff = min(self, other);
1782                 return self - diff;
1783             }
1784         }
1785 
1786         // store
1787         template <class T, class A>
1788         XSIMD_INLINE void store(batch_bool<T, A> const& self, bool* mem, requires_arch<avx512f>) noexcept
1789         {
1790             using register_type = typename batch_bool<T, A>::register_type;
1791             constexpr auto size = batch_bool<T, A>::size;
1792             for (std::size_t i = 0; i < size; ++i)
1793                 mem[i] = self.data & (register_type(1) << i);
1794         }
1795 
1796         // store_aligned
1797         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1798         XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
1799         {
1800             return _mm512_store_si512((__m512i*)mem, self);
1801         }
1802         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1803         XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
1804         {
1805             return _mm512_store_si512((__m512i*)mem, self);
1806         }
1807         template <class A>
1808         XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
1809         {
1810             return _mm512_store_ps(mem, self);
1811         }
1812         template <class A>
1813         XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
1814         {
1815             return _mm512_store_pd(mem, self);
1816         }
1817 
1818         // store_unaligned
1819         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1820         XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
1821         {
1822             return _mm512_storeu_si512((__m512i*)mem, self);
1823         }
1824         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1825         XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
1826         {
1827             return _mm512_storeu_si512((__m512i*)mem, self);
1828         }
1829         template <class A>
1830         XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
1831         {
1832             return _mm512_storeu_ps(mem, self);
1833         }
1834         template <class A>
1835         XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
1836         {
1837             return _mm512_storeu_pd(mem, self);
1838         }
1839 
1840         // sub
1841         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1842         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1843         {
1844             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1845             {
1846                 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1847                                           { return sub(batch<T, avx2>(s), batch<T, avx2>(o)); },
1848                                           self, other);
1849             }
1850             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1851             {
1852                 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1853                                           { return sub(batch<T, avx2>(s), batch<T, avx2>(o)); },
1854                                           self, other);
1855             }
1856             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1857             {
1858                 return _mm512_sub_epi32(self, other);
1859             }
1860             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1861             {
1862                 return _mm512_sub_epi64(self, other);
1863             }
1864             else
1865             {
1866                 assert(false && "unsupported arch/op combination");
1867                 return {};
1868             }
1869         }
1870         template <class A>
1871         XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1872         {
1873             return _mm512_sub_ps(self, other);
1874         }
1875         template <class A>
1876         XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1877         {
1878             return _mm512_sub_pd(self, other);
1879         }
1880 
1881         // swizzle (dynamic version)
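        // permutexvar picks, for each output element i, the input element mask[i],
        // so an arbitrary 16x32-bit or 8x64-bit permutation is a single
        // instruction. A mask of {1, 0, 3, 2, ...}, for instance, swaps adjacent
        // elements.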
1882         template <class A>
1883         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
1884         {
1885             return _mm512_permutexvar_ps(mask, self);
1886         }
1887 
1888         template <class A>
1889         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
1890         {
1891             return _mm512_permutexvar_pd(mask, self);
1892         }
1893 
1894         template <class A>
1895         XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
1896         {
1897             return _mm512_permutexvar_epi64(mask, self);
1898         }
1899 
1900         template <class A>
1901         XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
1902         {
1903             return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx512f {}));
1904         }
1905 
1906         template <class A>
1907         XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
1908         {
1909             return _mm512_permutexvar_epi32(mask, self);
1910         }
1911 
1912         template <class A>
1913         XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
1914         {
1915             return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx512f {}));
1916         }
1917 
1918         // swizzle (constant version)
1919         template <class A, uint32_t... Vs>
1920         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
1921         {
1922             return swizzle(self, mask.as_batch(), avx512f {});
1923         }
1924 
1925         template <class A, uint64_t... Vs>
1926         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
1927         {
1928             return swizzle(self, mask.as_batch(), avx512f {});
1929         }
1930 
1931         template <class A, uint64_t... Vs>
1932         XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
1933         {
1934             return swizzle(self, mask.as_batch(), avx512f {});
1935         }
1936 
1937         template <class A, uint64_t... Vs>
1938         XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
1939         {
1940             return swizzle(self, mask.as_batch(), avx512f {});
1941         }
1942 
1943         template <class A, uint32_t... Vs>
1944         XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
1945         {
1946             return swizzle(self, mask.as_batch(), avx512f {});
1947         }
1948 
1949         template <class A, uint32_t... Vs>
1950         XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
1951         {
1952             return swizzle(self, mask.as_batch(), avx512f {});
1953         }
1954 
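        // AVX512F has no 16-bit element permute, so the constant-mask 16-bit
        // swizzle below is only provided when the index pattern moves adjacent
        // (even, odd) pairs together; such a pattern can be folded into a single
        // 32-bit permute, which is what is_pair_of_contiguous_indices and
        // fold_batch_constant set up.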
1955         namespace detail
1956         {
1957             template <class T, class A, T... Idx>
1958             struct is_pair_of_contiguous_indices;
1959 
1960             template <class T, class A>
1961             struct is_pair_of_contiguous_indices<T, A> : std::true_type
1962             {
1963             };
1964 
1965             template <class T, class A, T Idx0, T Idx1, T... Idx>
1966             struct is_pair_of_contiguous_indices<T, A, Idx0, Idx1, Idx...> : std::conditional<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices<T, A, Idx...>, std::false_type>::type
1967             {
1968             };
1969 
1970             template <class A, uint16_t I0, uint16_t I1, uint16_t I2, uint16_t I3, uint16_t I4, uint16_t I5, uint16_t I6, uint16_t I7,
1971                       uint16_t I8, uint16_t I9, uint16_t I10, uint16_t I11, uint16_t I12, uint16_t I13, uint16_t I14, uint16_t I15,
1972                       uint16_t I16, uint16_t I17, uint16_t I18, uint16_t I19, uint16_t I20, uint16_t I21, uint16_t I22, uint16_t I23,
1973                       uint16_t I24, uint16_t I25, uint16_t I26, uint16_t I27, uint16_t I28, uint16_t I29, uint16_t I30, uint16_t I31>
1974             struct fold_batch_constant
1975             {
1976                 using type = batch_constant<uint32_t, A, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2,
1977                                             I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
1978             };
1979 
1980         }
1981 
1982         template <class A, uint16_t... Idx, class _ = typename std::enable_if<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value, void>::type>
1983         XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...>, requires_arch<avx512f>) noexcept
1984         {
1985             constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
1986             return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
1987         }
1988 
1989         template <class A>
1990         XSIMD_INLINE batch<uint16_t, A>
1991         swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
1992         {
1993             // FIXME: this sequence is very inefficient, but it's here to catch
1994             // a pattern generated by detail::reduce from xsimd_generic_math.hpp.
1995             // The whole pattern is actually decently folded by GCC and Clang,
1996             // so bear with it.
1997             constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
1998             auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
1999 
2000             alignas(A::alignment()) uint16_t buffer[32];
2001             _mm512_store_si512((__m512i*)&buffer[0], tmp);
2002             buffer[0] = buffer[1];
2003             return _mm512_load_si512(&buffer[0]);
2004         }
2005 
2006         template <class A, uint16_t... Vs>
2007         XSIMD_INLINE batch<int16_t, A>
2008         swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
2009         {
2010             return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512f {}));
2011         }
2012 
2013         // trunc
2014         template <class A>
2015         XSIMD_INLINE batch<float, A>
2016         trunc(batch<float, A> const& self, requires_arch<avx512f>) noexcept
2017         {
2018             return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION);
2019         }
2020         template <class A>
2021         XSIMD_INLINE batch<double, A>
2022         trunc(batch<double, A> const& self, requires_arch<avx512f>) noexcept
2023         {
2024             return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION);
2025         }
2026 
2027         // zip_hi
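        // The unpack instructions interleave within each 128-bit lane only, so the
        // interleaved lanes are reassembled with 128-bit extract/insert to obtain
        // full-width zip semantics (zip_lo below follows the same pattern).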
2028         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
2029         XSIMD_INLINE batch<T, A>
2030         zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
2031         {
2032             __m512i lo, hi;
2033             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
2034             {
2035                 assert(false && "not implemented yet");
2036                 return {};
2037             }
2038             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
2039             {
2040                 assert(false && "not implemented yet");
2041                 return {};
2042             }
2043             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
2044             {
2045                 lo = _mm512_unpacklo_epi32(self, other);
2046                 hi = _mm512_unpackhi_epi32(self, other);
2047             }
2048             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
2049             {
2050                 lo = _mm512_unpacklo_epi64(self, other);
2051                 hi = _mm512_unpackhi_epi64(self, other);
2052             }
2053             else
2054             {
2055                 assert(false && "unsupported arch/op combination");
2056                 return {};
2057             }
2058             return _mm512_inserti32x4(
2059                 _mm512_inserti32x4(
2060                     _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0),
2061                     _mm512_extracti32x4_epi32(lo, 3),
2062                     2),
2063                 _mm512_extracti32x4_epi32(hi, 2),
2064                 1);
2065         }
2066         template <class A>
2067         XSIMD_INLINE batch<float, A>
2068         zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
2069         {
2070             auto lo = _mm512_unpacklo_ps(self, other);
2071             auto hi = _mm512_unpackhi_ps(self, other);
2072             return _mm512_insertf32x4(
2073                 _mm512_insertf32x4(
2074                     _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0),
2075                     _mm512_extractf32x4_ps(lo, 3),
2076                     2),
2077                 _mm512_extractf32x4_ps(hi, 2),
2078                 1);
2079         }
2080         template <class A>
2081         XSIMD_INLINE batch<double, A>
2082         zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
2083         {
2084             auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other));
2085             auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other));
2086             return _mm512_castps_pd(_mm512_insertf32x4(
2087                 _mm512_insertf32x4(
2088                     _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0),
2089                     _mm512_extractf32x4_ps(lo, 3),
2090                     2),
2091                 _mm512_extractf32x4_ps(hi, 2),
2092                 1));
2093         }
2094 
2095         // zip_lo
2096         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
2097         XSIMD_INLINE batch<T, A>
2098         zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
2099         {
2100             __m512i lo, hi;
2101             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
2102             {
2103                 assert(false && "not implemented yet");
2104                 return {};
2105             }
2106             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
2107             {
2108                 assert(false && "not implemented yet");
2109                 return {};
2110             }
2111             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
2112             {
2113                 lo = _mm512_unpacklo_epi32(self, other);
2114                 hi = _mm512_unpackhi_epi32(self, other);
2115             }
2116             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
2117             {
2118                 lo = _mm512_unpacklo_epi64(self, other);
2119                 hi = _mm512_unpackhi_epi64(self, other);
2120             }
2121             else
2122             {
2123                 assert(false && "unsupported arch/op combination");
2124                 return {};
2125             }
2126             return _mm512_inserti32x4(
2127                 _mm512_inserti32x4(
2128                     _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1),
2129                     _mm512_extracti32x4_epi32(hi, 1),
2130                     3),
2131                 _mm512_extracti32x4_epi32(lo, 1),
2132                 2);
2133         }
2134         template <class A>
2135         XSIMD_INLINE batch<float, A>
2136         zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
2137         {
2138             auto lo = _mm512_unpacklo_ps(self, other);
2139             auto hi = _mm512_unpackhi_ps(self, other);
2140             return _mm512_insertf32x4(
2141                 _mm512_insertf32x4(
2142                     _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1),
2143                     _mm512_extractf32x4_ps(hi, 1),
2144                     3),
2145                 _mm512_extractf32x4_ps(lo, 1),
2146                 2);
2147         }
2148         template <class A>
2149         XSIMD_INLINE batch<double, A>
2150         zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
2151         {
2152             auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other));
2153             auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other));
2154             return _mm512_castps_pd(_mm512_insertf32x4(
2155                 _mm512_insertf32x4(
2156                     _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1),
2157                     _mm512_extractf32x4_ps(hi, 1),
2158                     3),
2159                 _mm512_extractf32x4_ps(lo, 1),
2160                 2));
2161         }
2162 
2163     }
2164 
2165 }
2166 
2167 #endif