#ifndef XSIMD_AVX2_HPP
#define XSIMD_AVX2_HPP

#include <cassert>
#include <complex>
#include <cstring>
#include <type_traits>

#include "../types/xsimd_avx2_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_abs_epi8(self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_abs_epi16(self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_abs_epi32(self);
                }
                else
                {
                    return abs(self, avx {});
                }
            }
            return self;
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_add_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_add_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_add_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_add_epi64(self, other);
            }
            else
            {
                return add(self, other, avx {});
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_avg_epu8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_avg_epu16(self, other);
            }
            else
            {
                return avgr(self, other, generic {});
            }
        }

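        // avg: truncating average. avgr rounds up ((a + b + 1) >> 1), so we subtract
        // the low bit of (a ^ b), which is exactly the rounding carry, to recover
        // (a + b) >> 1 without risking overflow in the addition.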
        template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                auto adj = ((self ^ other) << 7) >> 7;
                return avgr(self, other, A {}) - adj;
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                auto adj = ((self ^ other) << 15) >> 15;
                return avgr(self, other, A {}) - adj;
            }
            else
            {
                return avg(self, other, generic {});
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_and_si256(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_and_si256(self, other);
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_andnot_si256(other, self);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_andnot_si256(other, self);
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_slli_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_slli_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_slli_epi64(self, other);
            }
            else
            {
                return bitwise_lshift(self, other, avx {});
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_sllv_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_sllv_epi64(self, other);
            }
            else
            {
                return bitwise_lshift(self, other, avx {});
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_or_si256(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_or_si256(self, other);
        }

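        // bitwise_rshift: AVX2 has no 8-bit arithmetic shift, so the signed 8-bit
        // path shifts 16-bit lanes instead and then patches the sign-extension bits
        // of negative elements back in through a per-byte mask.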
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF);
                    __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self);
                    __m256i res = _mm256_srai_epi16(self, other);
                    return _mm256_or_si256(
                        detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
                                           { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
                                           sign_mask, cmp_is_negative),
                        _mm256_andnot_si256(sign_mask, res));
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_srai_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srai_epi32(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_srli_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srli_epi32(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm256_srli_epi64(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srav_epi32(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srlv_epi32(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm256_srlv_epi64(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, other);
        }

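        // complex_low / complex_high: interleave the separate real and imaginary
        // registers of a batch of std::complex<double> back into (re, im) pairs;
        // complex_low yields the first two complex values, complex_high the last two.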
        template <class A>
        XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
        {
            __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0));
            __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0));
            return _mm256_blend_pd(tmp0, tmp1, 10);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
        {
            __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2));
            __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0));
            return _mm256_blend_pd(tmp0, tmp1, 10);
        }

        namespace detail
        {

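            // fast_cast: 64-bit integer -> double without AVX-512. Each value is
            // split into 32-bit halves; every half is placed into the mantissa of a
            // double with a large fixed exponent and the corresponding bias is
            // subtracted afterwards, so adding the two halves reconstructs the exact
            // converted value.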
            template <class A>
            XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
            {
                __m256i xH = _mm256_srli_epi64(x, 32);
                xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
                __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
                                                 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
                __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
                __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
                return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
            }

            template <class A>
            XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
            {
                __m256i xH = _mm256_srai_epi32(x, 16);
                xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
                xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3 * 2^67
                __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
                                                 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
                __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
                __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3 * 2^67 + 2^52
                return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_cmpeq_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_cmpeq_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_cmpeq_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_cmpeq_epi64(self, other);
            }
            else
            {
                return eq(self, other, avx {});
            }
        }

        template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
        XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                                        kernel::requires_arch<avx2>) noexcept
        {
            return _mm256_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
        }

        template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
        XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                                        kernel::requires_arch<avx2>) noexcept
        {
            return _mm256_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
        }

        template <class A, class U,
                  detail::enable_sized_integral_t<U, 4> = 0>
        XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, float const* src,
                                            batch<U, A> const& index,
                                            kernel::requires_arch<avx2>) noexcept
        {
            return _mm256_i32gather_ps(src, index, sizeof(float));
        }

        template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
        XSIMD_INLINE batch<double, A> gather(batch<double, A> const&, double const* src,
                                             batch<U, A> const& index,
                                             requires_arch<avx2>) noexcept
        {
            return _mm256_i64gather_pd(src, index, sizeof(double));
        }

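        // gather of 32-bit results from a double array: no single AVX2 instruction
        // does this, so the eight indices are split into two 128-bit halves, each
        // half gathers four doubles, and the two gathered vectors are converted and
        // merged back into one 256-bit result.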
        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
        XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, double const* src,
                                            batch<V, A> const& index,
                                            requires_arch<avx2>) noexcept
        {
            const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
            const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
            return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data));
        }

        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
        XSIMD_INLINE batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
                                              batch<V, A> const& index,
                                              requires_arch<avx2>) noexcept
        {
            const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
            const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
            return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data));
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_cmpgt_epi8(other, self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_cmpgt_epi16(other, self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_cmpgt_epi32(other, self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm256_cmpgt_epi64(other, self);
                }
                else
                {
                    return lt(self, other, avx {});
                }
            }
            else
            {
                return lt(self, other, avx {});
            }
        }

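        // load_complex: de-interleave two registers holding (re, im) pairs into
        // separate real and imaginary batches; the extra permute4x64 fixes up the
        // cross-lane ordering that the in-lane shuffles/unpacks leave behind.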
        template <class A>
        XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
        {
            using batch_type = batch<float, A>;
            batch_type real = _mm256_castpd_ps(
                _mm256_permute4x64_pd(
                    _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))),
                    _MM_SHUFFLE(3, 1, 2, 0)));
            batch_type imag = _mm256_castpd_ps(
                _mm256_permute4x64_pd(
                    _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))),
                    _MM_SHUFFLE(3, 1, 2, 0)));
            return { real, imag };
        }
        template <class A>
        XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx2>) noexcept
        {
            using batch_type = batch<double, A>;
            batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
            batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
            return { real, imag };
        }

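        // mask: movemask_epi8 produces one bit per byte, so for 16-bit elements the
        // 32-bit byte mask is compressed to one bit per element via detail::mask_lut.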
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
                return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12);
            }
            else
            {
                return mask(self, avx {});
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_max_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_max_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_max_epi32(self, other);
                }
                else
                {
                    return max(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_max_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_max_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_max_epu32(self, other);
                }
                else
                {
                    return max(self, other, avx {});
                }
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_min_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_min_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_min_epi32(self, other);
                }
                else
                {
                    return min(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_min_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_min_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_min_epu32(self, other);
                }
                else
                {
                    return min(self, other, avx {});
                }
            }
        }

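        // mul: AVX2 has no 8-bit multiply, so the 8-bit path computes the even-byte
        // products with a plain 16-bit multiply and the odd-byte products with the
        // operands arranged so their product lands in the high byte of each 16-bit
        // lane, then blends the two partial results byte by byte.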
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                __m256i mask_hi = _mm256_set1_epi32(0xFF00FF00);
                __m256i res_lo = _mm256_mullo_epi16(self, other);
                __m256i other_hi = _mm256_srli_epi16(other, 8);
                __m256i self_hi = _mm256_and_si256(self, mask_hi);
                __m256i res_hi = _mm256_mullo_epi16(self_hi, other_hi);
                __m256i res = _mm256_blendv_epi8(res_lo, res_hi, mask_hi);
                return res;
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_mullo_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_mullo_epi32(self, other);
            }
            else
            {
                return mul(self, other, avx {});
            }
        }

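        // reduce_add: horizontal sum. The 32-bit path folds with hadd, the 64-bit
        // path swaps and adds the two halves of each lane; both then add the upper
        // 128-bit half onto the lower one and extract the scalar.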
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                __m256i tmp1 = _mm256_hadd_epi32(self, self);
                __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
                __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
                __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
                return _mm_cvtsi128_si32(tmp4);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
                __m256i tmp2 = _mm256_add_epi64(self, tmp1);
                __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
                __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
#if defined(__x86_64__)
                return _mm_cvtsi128_si64(res);
#else
                __m128i m;
                _mm_storel_epi64(&m, res);
                int64_t i;
                std::memcpy(&i, &m, sizeof(i));
                return i;
#endif
            }
            else
            {
                return reduce_add(self, avx {});
            }
        }

        template <size_t N, class A>
        XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<avx2>) noexcept
        {
            return _mm256_alignr_epi8(self, self, N);
        }
        template <size_t N, class A>
        XSIMD_INLINE batch<int16_t, A> rotate_left(batch<int16_t, A> const& self, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<int16_t>(rotate_left<N, A>(bitwise_cast<uint16_t>(self), avx2 {}));
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_adds_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_adds_epi16(self, other);
                }
                else
                {
                    return sadd(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_adds_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_adds_epu16(self, other);
                }
                else
                {
                    return sadd(self, other, avx {});
                }
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else
            {
                return select(cond, true_br, false_br, avx {});
            }
        }
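        // select with a compile-time mask: 32- and 64-bit elements can use the
        // immediate blend _mm256_blend_epi32 (the 64-bit mask is widened to one bit
        // per 32-bit lane); other sizes fall back to the runtime blend above.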
        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
        {
            constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_blend_epi32(false_br, true_br, mask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                constexpr int imask = detail::interleave(mask);
                return _mm256_blend_epi32(false_br, true_br, imask);
            }
            else
            {
                return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
            }
        }

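        // slide_left / slide_right: _mm256_bslli/bsrli_epi128 only shift within each
        // 128-bit lane, so the bytes that cross the lane boundary are recovered with
        // an opposite shift plus _mm256_permute2x128_si256 and OR-ed back in.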
        template <size_t N, class A, class T>
        XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
        {
            constexpr unsigned BitCount = N * 8;
            if (BitCount == 0)
            {
                return x;
            }
            if (BitCount >= 256)
            {
                return batch<T, A>(T(0));
            }
            if (BitCount > 128)
            {
                constexpr unsigned M = (BitCount - 128) / 8;
                auto y = _mm256_bslli_epi128(x, M);
                return _mm256_permute2x128_si256(y, y, 0x28);
            }
            if (BitCount == 128)
            {
                return _mm256_permute2x128_si256(x, x, 0x28);
            }

            constexpr unsigned M = BitCount / 8;
            auto y = _mm256_bslli_epi128(x, M);
            auto z = _mm256_bsrli_epi128(x, 16 - M);
            auto w = _mm256_permute2x128_si256(z, z, 0x28);
            return _mm256_or_si256(y, w);
        }

        template <size_t N, class A, class T>
        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
        {
            constexpr unsigned BitCount = N * 8;
            if (BitCount == 0)
            {
                return x;
            }
            if (BitCount >= 256)
            {
                return batch<T, A>(T(0));
            }
            if (BitCount > 128)
            {
                constexpr unsigned M = (BitCount - 128) / 8;
                auto y = _mm256_bsrli_epi128(x, M);
                return _mm256_permute2x128_si256(y, y, 0x81);
            }
            if (BitCount == 128)
            {
                return _mm256_permute2x128_si256(x, x, 0x81);
            }

            constexpr unsigned M = BitCount / 8;
            auto y = _mm256_bsrli_epi128(x, M);
            auto z = _mm256_bslli_epi128(x, 16 - M);
            auto w = _mm256_permute2x128_si256(z, z, 0x81);
            return _mm256_or_si256(y, w);
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_subs_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_subs_epi16(self, other);
                }
                else
                {
                    return ssub(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_subs_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_subs_epu16(self, other);
                }
                else
                {
                    return ssub(self, other, avx {});
                }
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_sub_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_sub_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_sub_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_sub_epi64(self, other);
            }
            else
            {
                return sub(self, other, avx {});
            }
        }

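        // swizzle with a runtime mask: the double/uint64 variants have no dedicated
        // variable 64-bit permute, so each 64-bit index is turned into a pair of
        // 32-bit indices (2*i, 2*i + 1) and routed through _mm256_permutevar8x32_ps.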
        template <class A>
        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
        {
            return _mm256_permutevar8x32_ps(self, mask);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
        {
            batch<uint32_t, A> broadcaster = { 0, 1, 0, 1, 0, 1, 0, 1 };
            constexpr uint64_t comb = 0x0000000100000001ul * 2;
            return bitwise_cast<double>(swizzle(bitwise_cast<float>(self), bitwise_cast<uint32_t>(mask * comb) + broadcaster, avx2 {}));
        }

        template <class A>
        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<uint64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
        }
        template <class A>
        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<int64_t>(swizzle(bitwise_cast<double>(self), mask, avx2 {}));
        }
        template <class A>
        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
        {
            return _mm256_permutevar8x32_epi32(self, mask);
        }
        template <class A>
        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
        }

        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
        {
            return _mm256_permutevar8x32_ps(self, mask.as_batch());
        }

        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
        {
            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
            return _mm256_permute4x64_pd(self, mask);
        }

        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
        {
            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
            return _mm256_permute4x64_epi64(self, mask);
        }
        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
        }
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
        {
            return _mm256_permutevar8x32_epi32(self, mask.as_batch());
        }
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
        }

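        // zip_hi / zip_lo: the unpack intrinsics interleave within each 128-bit lane,
        // so a final cross-lane permute/insert recombines the lane halves into the
        // true high/low interleaving of the full 256-bit registers.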
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                auto lo = _mm256_unpacklo_epi8(self, other);
                auto hi = _mm256_unpackhi_epi8(self, other);
                return _mm256_permute2f128_si256(lo, hi, 0x31);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                auto lo = _mm256_unpacklo_epi16(self, other);
                auto hi = _mm256_unpackhi_epi16(self, other);
                return _mm256_permute2f128_si256(lo, hi, 0x31);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                auto lo = _mm256_unpacklo_epi32(self, other);
                auto hi = _mm256_unpackhi_epi32(self, other);
                return _mm256_permute2f128_si256(lo, hi, 0x31);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                auto lo = _mm256_unpacklo_epi64(self, other);
                auto hi = _mm256_unpackhi_epi64(self, other);
                return _mm256_permute2f128_si256(lo, hi, 0x31);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                auto lo = _mm256_unpacklo_epi8(self, other);
                auto hi = _mm256_unpackhi_epi8(self, other);
                return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                auto lo = _mm256_unpacklo_epi16(self, other);
                auto hi = _mm256_unpackhi_epi16(self, other);
                return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                auto lo = _mm256_unpacklo_epi32(self, other);
                auto hi = _mm256_unpackhi_epi32(self, other);
                return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                auto lo = _mm256_unpacklo_epi64(self, other);
                auto hi = _mm256_unpackhi_epi64(self, other);
                return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
    }
}

#endif