#ifndef XSIMD_AVX512BW_HPP
#define XSIMD_AVX512BW_HPP

#include <array>
#include <type_traits>

#include "../types/xsimd_avx512bw_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        namespace detail
        {
            template <class A, class T, int Cmp>
            XSIMD_INLINE batch_bool<T, A> compare_int_avx512bw(batch<T, A> const& self, batch<T, A> const& other) noexcept
            {
                using register_type = typename batch_bool<T, A>::register_type;
                if (std::is_signed<T>::value)
                {
                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                    {
                        return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp);
                    }
                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                    {
                        return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp);
                    }
                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                    {
                        return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
                    }
                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                    {
                        return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
                    }
                }
                else
                {
                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                    {
                        return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp);
                    }
                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                    {
                        return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp);
                    }
                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                    {
                        return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
                    }
                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                    {
                        return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
                    }
                }
            }
        }

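        // abs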
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512bw>) noexcept
        {
            if (std::is_unsigned<T>::value)
            {
                return self;
            }

            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm512_abs_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_abs_epi16(self);
            }
            else
            {
                return abs(self, avx512dq {});
            }
        }

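        // add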
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm512_add_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_add_epi16(self, other);
            }
            else
            {
                return add(self, other, avx512dq {});
            }
        }

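        // avgr
        // _mm512_avg_epu8/_mm512_avg_epu16 compute the rounded average (a + b + 1) >> 1.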
        template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm512_avg_epu8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_avg_epu16(self, other);
            }
            else
            {
                return avgr(self, other, generic {});
            }
        }

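        // avg
        // Truncating average: avgr rounds up, so subtract the rounding bit, which is the
        // low bit of (self ^ other); the shift pair isolates that bit in each element.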
        template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                auto adj = ((self ^ other) << 7) >> 7;
                return avgr(self, other, A {}) - adj;
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                auto adj = ((self ^ other) << 15) >> 15;
                return avgr(self, other, A {}) - adj;
            }
            else
            {
                return avg(self, other, generic {});
            }
        }

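        // bitwise_lshift
        // Where XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY is defined, the immediate-count shift
        // intrinsics only accept compile-time constants, so the variable-count sllv form is used.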
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
        {
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_sllv_epi16(self, _mm512_set1_epi16(other));
#else
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_slli_epi16(self, other);
#endif
            }
            else
            {
                return bitwise_lshift(self, other, avx512dq {});
            }
        }

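        // bitwise_rshift
        // There is no 8-bit arithmetic right shift intrinsic; the signed 8-bit case is emulated
        // with a 16-bit shift plus a sign mask that restores the high bits of negative lanes.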
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    __m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF);
                    __m512i zeros = _mm512_setzero_si512();
                    __mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self);
                    __m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask);
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
                    __m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other));
#else
                    __m512i res = _mm512_srai_epi16(self, other);
#endif
                    return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res));
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_srav_epi16(self, _mm512_set1_epi16(other));
#else
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_srai_epi16(self, other);
#endif
                }
                else
                {
                    return bitwise_rshift(self, other, avx512dq {});
                }
            }
            else
            {
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_srlv_epi16(self, _mm512_set1_epi16(other));
#else
                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_srli_epi16(self, other);
#endif
                }
                else
                {
                    return bitwise_rshift(self, other, avx512dq {});
                }
            }
        }

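        // eq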
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_EQ>(self, other);
        }

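        // ge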
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GE>(self, other);
        }

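        // gt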
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GT>(self, other);
        }

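        // le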
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LE>(self, other);
        }

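        // lt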
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LT>(self, other);
        }

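        // max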
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_max_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_max_epi16(self, other);
                }
                else
                {
                    return max(self, other, avx512dq {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_max_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_max_epu16(self, other);
                }
                else
                {
                    return max(self, other, avx512dq {});
                }
            }
        }

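        // min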
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_min_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_min_epi16(self, other);
                }
                else
                {
                    return min(self, other, avx512dq {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_min_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_min_epu16(self, other);
                }
                else
                {
                    return min(self, other, avx512dq {});
                }
            }
        }

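        // mul
        // There is no 8-bit multiply intrinsic: even bytes are multiplied in place as 16-bit
        // lanes and masked to their low byte, odd bytes are multiplied after a right shift and
        // moved back to the high byte, then the two halves are recombined.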
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                __m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8));
                __m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8);
                return _mm512_or_si512(upper, lower);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_mullo_epi16(self, other);
            }
            else
            {
                return mul(self, other, avx512dq {});
            }
        }

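        // neq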
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            return detail::compare_int_avx512bw<A, T, _MM_CMPINT_NE>(self, other);
        }

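        // rotate_left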
        template <size_t N, class A>
        XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<avx512bw>) noexcept
        {
            return _mm512_alignr_epi8(self, self, N);
        }
        template <size_t N, class A>
        XSIMD_INLINE batch<int16_t, A> rotate_left(batch<int16_t, A> const& self, requires_arch<avx512bw>) noexcept
        {
            return bitwise_cast<int16_t>(rotate_left<N, A>(bitwise_cast<uint16_t>(self), avx2 {}));
        }

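        // sadd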
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_adds_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_adds_epi16(self, other);
                }
                else
                {
                    return sadd(self, other, avx512dq {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_adds_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_adds_epu16(self, other);
                }
                else
                {
                    return sadd(self, other, avx512dq {});
                }
            }
        }

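        // select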
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512bw>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data);
            }
            else
            {
                return select(cond, true_br, false_br, avx512dq {});
            }
        }

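        // slide_left
        // An odd byte count is handled first: bytes are shifted within each 64-bit lane and the
        // carried-out bytes are moved to the next lane with a cross-lane permute. The remaining
        // even byte count is applied as a 16-bit element permute, masked to zero the vacated positions.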
        namespace detail
        {
            template <size_t... Is>
            constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_hi(::xsimd::detail::index_sequence<Is...>)
            {
                return { (Is == 0 ? 8 : Is - 1)... };
            }

            template <size_t N, size_t... Is>
            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_pattern(::xsimd::detail::index_sequence<Is...>)
            {
                return { (Is >= N ? Is - N : 0)... };
            }
            template <size_t N, size_t... Is>
            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_mask(::xsimd::detail::index_sequence<Is...>)
            {
                return { (Is >= N ? 0xFFFF : 0x0000)... };
            }
        }

        template <size_t N, class A, class T>
        XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
        {
            constexpr unsigned BitCount = N * 8;
            if (BitCount == 0)
            {
                return x;
            }
            if (BitCount >= 512)
            {
                return batch<T, A>(T(0));
            }
            batch<T, A> xx;
            if (N & 1)
            {
                alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>());
                __m512i xl = _mm512_slli_epi64(x, 8);
                __m512i xr = _mm512_srli_epi64(x, 56);
                xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
                xx = _mm512_or_si512(xr, xl);
                if (N == 1)
                    return xx;
            }
            else
            {
                xx = x;
            }
            alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
            alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
            return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
        }

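        // slide_right
        // Mirror of slide_left: an odd byte count is shifted within 64-bit lanes with the carry
        // borrowed from the next lane, then the even remainder uses a 16-bit permute and mask.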
        namespace detail
        {
            template <size_t... Is>
            constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_low(::xsimd::detail::index_sequence<Is...>)
            {
                return { (Is + 1)... };
            }

            template <size_t N, size_t... Is>
            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_pattern(::xsimd::detail::index_sequence<Is...>)
            {
                return { (Is < (32 - N) ? Is + N : 0)... };
            }
            template <size_t N, size_t... Is>
            constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_mask(::xsimd::detail::index_sequence<Is...>)
            {
                return { (Is < 32 - N ? 0xFFFF : 0x0000)... };
            }
        }

        template <size_t N, class A, class T>
        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
        {
            constexpr unsigned BitCount = N * 8;
            if (BitCount == 0)
            {
                return x;
            }
            if (BitCount >= 512)
            {
                return batch<T, A>(T(0));
            }
            batch<T, A> xx;
            if (N & 1)
            {
                alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>());
                __m512i xr = _mm512_srli_epi64(x, 8);
                __m512i xl = _mm512_slli_epi64(x, 56);
                xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
                xx = _mm512_or_si512(xr, xl);
                if (N == 1)
                    return xx;
            }
            else
            {
                xx = x;
            }
            alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
            alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
            return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
        }

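        // ssub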
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_subs_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_subs_epi16(self, other);
                }
                else
                {
                    return ssub(self, other, avx512dq {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_subs_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_subs_epu16(self, other);
                }
                else
                {
                    return ssub(self, other, avx512dq {});
                }
            }
        }

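        // sub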
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm512_sub_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_sub_epi16(self, other);
            }
            else
            {
                return sub(self, other, avx512dq {});
            }
        }

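        // swizzle (dynamic mask)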
        template <class A>
        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx512bw>) noexcept
        {
            return _mm512_permutexvar_epi16(mask, self);
        }

        template <class A>
        XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch<uint16_t, A> mask, requires_arch<avx512bw>) noexcept
        {
            return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512bw {}));
        }

        template <class A>
        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<avx512bw>) noexcept
        {
            return _mm512_shuffle_epi8(self, mask);
        }

        template <class A>
        XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<avx512bw>) noexcept
        {
            return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, avx512bw {}));
        }

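        // swizzle (constant mask)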
        template <class A, uint16_t... Vs>
        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return swizzle(self, mask.as_batch(), avx512bw {});
        }

        template <class A, uint16_t... Vs>
        XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return swizzle(self, mask.as_batch(), avx512bw {});
        }

        template <class A, uint8_t... Vs>
        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return swizzle(self, mask.as_batch(), avx512bw {});
        }

        template <class A, uint8_t... Vs>
        XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return swizzle(self, mask.as_batch(), avx512bw {});
        }

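        // zip_hi
        // _mm512_unpackhi/_mm512_unpacklo interleave within each 128-bit lane, so the 128-bit
        // blocks are reshuffled afterwards to form the true upper half of the interleaving.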
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            __m512i lo, hi;
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                lo = _mm512_unpacklo_epi8(self, other);
                hi = _mm512_unpackhi_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                lo = _mm512_unpacklo_epi16(self, other);
                hi = _mm512_unpackhi_epi16(self, other);
            }
            else
            {
                return zip_hi(self, other, avx512f {});
            }
            return _mm512_inserti32x4(
                _mm512_inserti32x4(
                    _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0),
                    _mm512_extracti32x4_epi32(lo, 3),
                    2),
                _mm512_extracti32x4_epi32(hi, 2),
                1);
        }

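        // zip_lo
        // Same lane fix-up as zip_hi, selecting the lower 128-bit blocks instead.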
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            __m512i lo, hi;
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                lo = _mm512_unpacklo_epi8(self, other);
                hi = _mm512_unpackhi_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                lo = _mm512_unpacklo_epi16(self, other);
                hi = _mm512_unpackhi_epi16(self, other);
            }
            else
            {
                return zip_lo(self, other, avx512f {});
            }
            return _mm512_inserti32x4(
                _mm512_inserti32x4(
                    _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1),
                    _mm512_extracti32x4_epi32(hi, 1),
                    3),
                _mm512_extracti32x4_epi32(lo, 1),
                2);
        }
    }
}

#endif