/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512BW_HPP
#define XSIMD_AVX512BW_HPP

#include <array>
#include <type_traits>

#include "../types/xsimd_avx512bw_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        namespace detail
        {
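            // Dispatch helper: picks the signed or unsigned compare-to-mask intrinsic
            // that matches sizeof(T) and returns the resulting k-mask register.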
            template <class A, class T, int Cmp>
            XSIMD_INLINE batch_bool<T, A> compare_int_avx512bw(batch<T, A> const& self, batch<T, A> const& other) noexcept
            {
                using register_type = typename batch_bool<T, A>::register_type;
                if (std::is_signed<T>::value)
                {
                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                    {
                        return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp);
                    }
                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                    {
                        return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp);
                    }
                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                    {
                        return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
                    }
                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                    {
                        return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
                    }
                }
                else
                {
                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                    {
                        return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp);
                    }
                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                    {
                        return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp);
                    }
                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                    {
                        return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
                    }
                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                    {
                        return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
                    }
                }
            }
        }

        // abs
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512bw>) noexcept
        {
            if (std::is_unsigned<T>::value)
            {
                return self;
            }

            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm512_abs_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_abs_epi16(self);
            }
            else
            {
                return abs(self, avx512dq {});
            }
        }

        // add
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm512_add_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_add_epi16(self, other);
            }
            else
            {
                return add(self, other, avx512dq {});
            }
        }

        // avgr
        template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm512_avg_epu8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_avg_epu16(self, other);
            }
            else
            {
                return avgr(self, other, generic {});
            }
        }

        // avg
        template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
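            // _mm512_avg_epu* rounds the halved sum up; subtracting the low bit of
            // (self ^ other), which is 1 exactly when self + other is odd, turns the
            // rounded average into the truncating one.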
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                auto adj = ((self ^ other) << 7) >> 7;
                return avgr(self, other, A {}) - adj;
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                auto adj = ((self ^ other) << 15) >> 15;
                return avgr(self, other, A {}) - adj;
            }
            else
            {
                return avg(self, other, generic {});
            }
        }

        // bitwise_lshift
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
        {
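            // When XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY is defined (compilers whose
            // shift intrinsics only accept a compile-time immediate count), the
            // variable-count intrinsic with a broadcast count is used instead.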
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_sllv_epi16(self, _mm512_set1_epi16(other));
#else
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_slli_epi16(self, other);
#endif
            }
            else
            {
                return bitwise_lshift(self, other, avx512dq {});
            }
        }

        // bitwise_rshift
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
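                    // There is no 8-bit arithmetic shift: shift the 16-bit lanes and
                    // then repair the low byte of each lane, whose high bits were
                    // pulled in from the neighbouring byte, using a sign mask applied
                    // only to bytes that compare negative.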
                    __m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF);
                    __m512i zeros = _mm512_setzero_si512();
                    __mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self);
                    __m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask);
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
                    __m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other));
#else
                    __m512i res = _mm512_srai_epi16(self, other);
#endif
                    return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res));
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_srav_epi16(self, _mm512_set1_epi16(other));
#else
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_srai_epi16(self, other);
#endif
                }
                else
                {
                    return bitwise_rshift(self, other, avx512dq {});
                }
            }
            else
            {
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_srlv_epi16(self, _mm512_set1_epi16(other));
#else
                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_srli_epi16(self, other);
#endif
                }
                else
                {
                    return bitwise_rshift(self, other, avx512dq {});
                }
            }
        }

        // eq
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_EQ>(self, other);
        }

        // ge
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GE>(self, other);
        }

        // gt
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GT>(self, other);
        }

        // le
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LE>(self, other);
        }

        // lt
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LT>(self, other);
        }

        // max
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_max_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_max_epi16(self, other);
                }
                else
                {
                    return max(self, other, avx512dq {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_max_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_max_epu16(self, other);
                }
                else
                {
                    return max(self, other, avx512dq {});
                }
            }
        }

        // min
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_min_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_min_epi16(self, other);
                }
                else
                {
                    return min(self, other, avx512dq {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_min_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_min_epu16(self, other);
                }
                else
                {
                    return min(self, other, avx512dq {});
                }
            }
        }

        // mul
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
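                // There is no 512-bit 8-bit multiply: multiply the 16-bit lanes and
                // recombine. The first term keeps the even-indexed byte products in
                // the low byte of each lane, the second shifts the odd-indexed byte
                // products back into the high byte.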
                __m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8));
                __m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8);
                return _mm512_or_si512(upper, lower);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_mullo_epi16(self, other);
            }
            else
            {
                return mul(self, other, avx512dq {});
            }
        }

        // neq
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_NE>(self, other);
        }

        // rotate_left
        template <size_t N, class A>
        XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<avx512bw>) noexcept
        {
            return _mm512_alignr_epi8(self, self, N);
        }
        template <size_t N, class A>
        XSIMD_INLINE batch<int16_t, A> rotate_left(batch<int16_t, A> const& self, requires_arch<avx512bw>) noexcept
        {
            return bitwise_cast<int16_t>(rotate_left<N, A>(bitwise_cast<uint16_t>(self), avx2 {}));
        }

        // sadd
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_adds_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_adds_epi16(self, other);
                }
                else
                {
                    return sadd(self, other, avx512dq {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_adds_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_adds_epu16(self, other);
                }
                else
                {
                    return sadd(self, other, avx512dq {});
                }
            }
        }

        // select
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512bw>) noexcept
        {
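            // cond is a k-mask here; mask_blend picks true_br in lanes whose mask bit
            // is set and false_br elsewhere.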
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data);
            }
            else
            {
                return select(cond, true_br, false_br, avx512dq {});
            }
        }

        // slide_left
        namespace detail
        {
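            // Compile-time tables for slide_left: a 64-bit lane permutation that moves
            // each lane up by one (lane 0 reads from the zero operand), plus the 16-bit
            // permute indices and keep/zero mask used for the even part of the slide.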
            template <size_t... Is>
            constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_hi(::xsimd::detail::index_sequence<Is...>)
            {
                return { (Is == 0 ? 8 : Is - 1)... };
            }

            template <size_t N, size_t... Is>
            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_pattern(::xsimd::detail::index_sequence<Is...>)
            {
                return { (Is >= N ? Is - N : 0)... };
            }
            template <size_t N, size_t... Is>
            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_mask(::xsimd::detail::index_sequence<Is...>)
            {
                return { (Is >= N ? 0xFFFF : 0x0000)... };
            }
        }

        template <size_t N, class A, class T>
        XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
        {
            constexpr unsigned BitCount = N * 8;
            if (BitCount == 0)
            {
                return x;
            }
            if (BitCount >= 512)
            {
                return batch<T, A>(T(0));
            }
            batch<T, A> xx;
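            // An odd byte offset is handled first by shifting the whole register one
            // byte, built from 64-bit shifts and a cross-lane permute; the remaining
            // even offset is then applied as a 16-bit-element permute with the vacated
            // lanes masked to zero.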
            if (N & 1)
            {
                alignas(A::alignment()) uint64_t buffer[8];
                _mm512_store_epi64(&buffer[0], x);
                for (int i = 7; i > 0; --i)
                    buffer[i] = (buffer[i] << 8) | (buffer[i - 1] >> 56);
                buffer[0] = buffer[0] << 8;
                xx = _mm512_load_epi64(&buffer[0]);

                alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>());
                __m512i xl = _mm512_slli_epi64(x, 8);
                __m512i xr = _mm512_srli_epi64(x, 56);
                xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
                xx = _mm512_or_si512(xr, xl);
                if (N == 1)
                    return xx;
            }
            else
            {
                xx = x;
            }
            alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
            alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
            return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
        }

        // slide_right
        namespace detail
        {
            template <size_t... Is>
            constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_low(::xsimd::detail::index_sequence<Is...>)
            {
                return { (Is + 1)... };
            }

            template <size_t N, size_t... Is>
            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_pattern(::xsimd::detail::index_sequence<Is...>)
            {
                return { (Is < (32 - N) ? Is + N : 0)... };
            }
            template <size_t N, size_t... Is>
            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_mask(::xsimd::detail::index_sequence<Is...>)
            {
                return { (Is < 32 - N ? 0xFFFF : 0x0000)... };
            }
        }
        template <size_t N, class A, class T>
        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
        {
            constexpr unsigned BitCount = N * 8;
            if (BitCount == 0)
            {
                return x;
            }
            if (BitCount >= 512)
            {
                return batch<T, A>(T(0));
            }
            batch<T, A> xx;
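            // Mirror of slide_left: an odd byte offset first shifts the register one
            // byte toward the low end, then the even remainder is applied with a
            // 16-bit permute and mask.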
            if (N & 1)
            {
                alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>());
                __m512i xr = _mm512_srli_epi64(x, 8);
                __m512i xl = _mm512_slli_epi64(x, 56);
                xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
                xx = _mm512_or_si512(xr, xl);
                if (N == 1)
                    return xx;
            }
            else
            {
                xx = x;
            }
            alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
            alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
            return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
        }

        // ssub
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_subs_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_subs_epi16(self, other);
                }
                else
                {
                    return ssub(self, other, avx512dq {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_subs_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_subs_epu16(self, other);
                }
                else
                {
                    return ssub(self, other, avx512dq {});
                }
            }
        }

        // sub
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm512_sub_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_sub_epi16(self, other);
            }
            else
            {
                return sub(self, other, avx512dq {});
            }
        }

        // swizzle (dynamic version)
        template <class A>
        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx512bw>) noexcept
        {
            return _mm512_permutexvar_epi16(mask, self);
        }

        template <class A>
        XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx512bw>) noexcept
        {
            return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512bw {}));
        }

        template <class A>
        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<avx512bw>) noexcept
        {
            return _mm512_shuffle_epi8(self, mask);
        }

        template <class A>
        XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<avx512bw>) noexcept
        {
            return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, avx512bw {}));
        }

        // swizzle (static version)
        template <class A, uint16_t... Vs>
        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return swizzle(self, mask.as_batch(), avx512bw {});
        }

        template <class A, uint16_t... Vs>
        XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return swizzle(self, mask.as_batch(), avx512bw {});
        }

        template <class A, uint8_t... Vs>
        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return swizzle(self, mask.as_batch(), avx512bw {});
        }

        template <class A, uint8_t... Vs>
        XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return swizzle(self, mask.as_batch(), avx512bw {});
        }

        // zip_hi
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
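            // unpacklo/unpackhi interleave within each 128-bit lane; the insert/extract
            // sequence below reorders those 128-bit blocks so the result is the
            // interleave of the upper halves of both inputs.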
            __m512i lo, hi;
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                lo = _mm512_unpacklo_epi8(self, other);
                hi = _mm512_unpackhi_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                lo = _mm512_unpacklo_epi16(self, other);
                hi = _mm512_unpackhi_epi16(self, other);
            }
            else
            {
                return zip_hi(self, other, avx512f {});
            }
            return _mm512_inserti32x4(
                _mm512_inserti32x4(
                    _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0),
                    _mm512_extracti32x4_epi32(lo, 3),
                    2),
                _mm512_extracti32x4_epi32(hi, 2),
                1);
        }

        // zip_lo
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
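            // Same 128-bit-lane fix-up as zip_hi, this time producing the interleave
            // of the lower halves of both inputs.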
            __m512i lo, hi;
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                lo = _mm512_unpacklo_epi8(self, other);
                hi = _mm512_unpackhi_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                lo = _mm512_unpacklo_epi16(self, other);
                hi = _mm512_unpackhi_epi16(self, other);
            }
            else
            {
                return zip_lo(self, other, avx512f {});
            }
            return _mm512_inserti32x4(
                _mm512_inserti32x4(
                    _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1),
                    _mm512_extracti32x4_epi32(hi, 1),
                    3),
                _mm512_extracti32x4_epi32(lo, 1),
                2);
        }
    }
}

#endif