0012 #ifndef XSIMD_AVX512F_HPP
0013 #define XSIMD_AVX512F_HPP
0014
0015 #include <complex>
0016 #include <limits>
0017 #include <type_traits>
0018
0019 #include "../types/xsimd_avx512f_register.hpp"
0020
0021 namespace xsimd
0022 {
0023
0024 namespace kernel
0025 {
0026 using namespace types;
0027
0028 namespace detail
0029 {
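// Helpers used throughout this file: AVX512F lacks native instructions for many
// 8/16-bit element operations, so split_avx512 breaks a 512-bit register into two
// 256-bit halves, fwd_to_avx applies the corresponding AVX2 kernel to each half,
// and merge_avx recombines the results. The float overloads go through __m512d
// because the 256-bit float extract/insert intrinsics (_mm512_extractf32x8_ps,
// _mm512_insertf32x8) require AVX512DQ rather than plain AVX512F.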
0030 XSIMD_INLINE void split_avx512(__m512 val, __m256& low, __m256& high) noexcept
0031 {
0032 low = _mm512_castps512_ps256(val);
0033 high = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(val), 1));
0034 }
0035 XSIMD_INLINE void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept
0036 {
0037 low = _mm512_castpd512_pd256(val);
0038 high = _mm512_extractf64x4_pd(val, 1);
0039 }
0040 XSIMD_INLINE void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept
0041 {
0042 low = _mm512_castsi512_si256(val);
0043 high = _mm512_extracti64x4_epi64(val, 1);
0044 }
0045 XSIMD_INLINE __m512i merge_avx(__m256i low, __m256i high) noexcept
0046 {
0047 return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1);
0048 }
0049 XSIMD_INLINE __m512 merge_avx(__m256 low, __m256 high) noexcept
0050 {
0051 return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castpd256_pd512(_mm256_castps_pd(low)), _mm256_castps_pd(high), 1));
0052 }
0053 XSIMD_INLINE __m512d merge_avx(__m256d low, __m256d high) noexcept
0054 {
0055 return _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1);
0056 }
0057 template <class F>
0058 __m512i fwd_to_avx(F f, __m512i self)
0059 {
0060 __m256i self_low, self_high;
0061 split_avx512(self, self_low, self_high);
0062 __m256i res_low = f(self_low);
0063 __m256i res_high = f(self_high);
0064 return merge_avx(res_low, res_high);
0065 }
0066 template <class F>
0067 __m512i fwd_to_avx(F f, __m512i self, __m512i other)
0068 {
0069 __m256i self_low, self_high, other_low, other_high;
0070 split_avx512(self, self_low, self_high);
0071 split_avx512(other, other_low, other_high);
0072 __m256i res_low = f(self_low, other_low);
0073 __m256i res_high = f(self_high, other_high);
0074 return merge_avx(res_low, res_high);
0075 }
0076 template <class F>
0077 __m512i fwd_to_avx(F f, __m512i self, int32_t other)
0078 {
0079 __m256i self_low, self_high;
0080 split_avx512(self, self_low, self_high);
0081 __m256i res_low = f(self_low, other);
0082 __m256i res_high = f(self_high, other);
0083 return merge_avx(res_low, res_high);
0084 }
0085 }
0086 namespace detail
0087 {
0088
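// Interleaves the bits of x and y into a 32-bit Morton code: bit i of x lands in
// bit 2*i of the result and bit i of y in bit 2*i+1. MortonTable256 spreads one
// byte over the even bit positions of a 16-bit value.
// Worked example: morton(x = 0b11, y = 0b01) -> x0=1, y0=1, x1=1, y1=0 -> 0b0111 = 7.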
0089 XSIMD_INLINE uint32_t morton(uint16_t x, uint16_t y) noexcept
0090 {
0091
0092 static const unsigned short MortonTable256[256] = {
0093 0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015,
0094 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055,
0095 0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115,
0096 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155,
0097 0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415,
0098 0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455,
0099 0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515,
0100 0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555,
0101 0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015,
0102 0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055,
0103 0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115,
0104 0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155,
0105 0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415,
0106 0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455,
0107 0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515,
0108 0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555,
0109 0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015,
0110 0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055,
0111 0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115,
0112 0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155,
0113 0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415,
0114 0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455,
0115 0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515,
0116 0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555,
0117 0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015,
0118 0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055,
0119 0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115,
0120 0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155,
0121 0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415,
0122 0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455,
0123 0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515,
0124 0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555
0125 };
0126
0127 uint32_t z = MortonTable256[y >> 8] << 17 | MortonTable256[x >> 8] << 16 | MortonTable256[y & 0xFF] << 1 | MortonTable256[x & 0xFF];
0128 return z;
0129 }
0130
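// AVX512F only provides native compares for 32- and 64-bit lanes. For 8- and
// 16-bit element types the comparison is emulated: each sub-lane is isolated with
// a mask (and, for signed compares, shifted to the top of its 32-bit lane so the
// sign is honoured), compared with _mm512_cmp_epi32_mask, and the resulting 16-bit
// masks are re-interleaved into the final element mask -- via morton() for 16-bit
// elements and the 4-way bit-interleaving loop for 8-bit elements.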
0131 template <class A, class T, int Cmp>
0132 XSIMD_INLINE batch_bool<T, A> compare_int_avx512f(batch<T, A> const& self, batch<T, A> const& other) noexcept
0133 {
0134 using register_type = typename batch_bool<T, A>::register_type;
0135 if (std::is_signed<T>::value)
0136 {
0137 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0138 {
0139
0140 uint64_t mask_low0 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x000000FF)) << 24,
0141 (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x000000FF)) << 24,
0142 Cmp);
0143 uint64_t mask_low1 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FF00)) << 16,
0144 (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FF00)) << 16,
0145 Cmp);
0146 uint64_t mask_high0 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x00FF0000)) << 8,
0147 (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x00FF0000)) << 8,
0148 Cmp);
0149 uint64_t mask_high1 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFF000000)),
0150 (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFF000000)),
0151 Cmp);
0152 uint64_t mask = 0;
0153 for (unsigned i = 0; i < 16; ++i)
0154 {
0155 mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
0156 mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
0157 mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
0158 mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
0159 }
0160 return (register_type)mask;
0161 }
0162 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0163 {
0164
0165 uint16_t mask_low = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
0166 (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
0167 Cmp);
0168 uint16_t mask_high = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFFFF0000)),
0169 (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFFFF0000)),
0170 Cmp);
0171 return static_cast<register_type>(morton(mask_low, mask_high));
0172 }
0173 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0174 {
0175 return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
0176 }
0177 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0178 {
0179 return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
0180 }
0181 }
0182 else
0183 {
0184 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0185 {
0186 uint64_t mask_low0 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x000000FF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x000000FF)), Cmp);
0187 uint64_t mask_low1 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FF00)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FF00)), Cmp);
0188 uint64_t mask_high0 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x00FF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x00FF0000)), Cmp);
0189 uint64_t mask_high1 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFF000000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFF000000)), Cmp);
0190 uint64_t mask = 0;
0191 for (unsigned i = 0; i < 16; ++i)
0192 {
0193 mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
0194 mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
0195 mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
0196 mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
0197 }
0198 return (register_type)mask;
0199 }
0200 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0201 {
0202 uint16_t mask_low = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FFFF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FFFF)), Cmp);
0203 uint16_t mask_high = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFFFF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFFFF0000)), Cmp);
0204 return static_cast<register_type>(morton(mask_low, mask_high));
0205 }
0206 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0207 {
0208 return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
0209 }
0210 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0211 {
0212 return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
0213 }
0214 }
0215 }
0216 }
0217
0218
0219 template <class A>
0220 XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<avx512f>) noexcept
0221 {
0222 __m512 self_asf = (__m512)self;
0223 __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asf);
0224 __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), self_asi);
0225 return *reinterpret_cast<__m512*>(&res_asi);
0226 }
0227 template <class A>
0228 XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<avx512f>) noexcept
0229 {
0230 __m512d self_asd = (__m512d)self;
0231 __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asd);
0232 __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
0233 self_asi);
0234 return *reinterpret_cast<__m512d*>(&res_asi);
0235 }
0236 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0237 XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512f>) noexcept
0238 {
0239 if (std::is_unsigned<T>::value)
0240 {
0241 return self;
0242 }
0243
0244 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0245 {
0246 return detail::fwd_to_avx([](__m256i s) noexcept
0247 { return abs(batch<T, avx2>(s)); },
0248 self);
0249 }
0250 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0251 {
0252 return detail::fwd_to_avx([](__m256i s) noexcept
0253 { return abs(batch<T, avx2>(s)); },
0254 self);
0255 }
0256 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0257 {
0258 return _mm512_abs_epi32(self);
0259 }
0260 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0261 {
0262 return _mm512_abs_epi64(self);
0263 }
0264 else
0265 {
0266 assert(false && "unsupported arch/op combination");
0267 return {};
0268 }
0269 }
0270
0271
0272 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0273 XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0274 {
0275 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0276 {
0277 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
0278 { return add(batch<T, avx2>(s), batch<T, avx2>(o)); },
0279 self, other);
0280 }
0281 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0282 {
0283 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
0284 { return add(batch<T, avx2>(s), batch<T, avx2>(o)); },
0285 self, other);
0286 }
0287 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0288 {
0289 return _mm512_add_epi32(self, other);
0290 }
0291 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0292 {
0293 return _mm512_add_epi64(self, other);
0294 }
0295 else
0296 {
0297 assert(false && "unsupported arch/op combination");
0298 return {};
0299 }
0300 }
0301 template <class A>
0302 XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0303 {
0304 return _mm512_add_ps(self, other);
0305 }
0306 template <class A>
0307 XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0308 {
0309 return _mm512_add_pd(self, other);
0310 }
0311
0312
0313 template <class A, class T>
0314 XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
0315 {
0316 using register_type = typename batch_bool<T, A>::register_type;
0317 return self.data == register_type(-1);
0318 }
0319
0320
0321 template <class A, class T>
0322 XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
0323 {
0324 using register_type = typename batch_bool<T, A>::register_type;
0325 return self.data != register_type(0);
0326 }
0327
0328
0329 template <class A, class T_out, class T_in>
0330 XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx512f>) noexcept
0331 {
0332 return self.data;
0333 }
0334
0335
0336 template <class A>
0337 XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0338 {
0339 #if defined(_MSC_VER)
0340 return _mm512_and_ps(self, other);
0341 #else
0342 return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
0343 #endif
0344 }
0345 template <class A>
0346 XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0347 {
0348 return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
0349 }
0350
0351 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0352 XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0353 {
0354 return _mm512_and_si512(self, other);
0355 }
0356
0357 template <class A, class T>
0358 XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
0359 {
0360 using register_type = typename batch_bool<T, A>::register_type;
0361 return register_type(self.data & other.data);
0362 }
0363
0364
0365 template <class A>
0366 XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0367 {
0368 return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(other), _mm512_castps_si512(self)));
0369 }
0370 template <class A>
0371 XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0372 {
0373 return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(other), _mm512_castpd_si512(self)));
0374 }
0375
0376 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0377 XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0378 {
0379 return _mm512_andnot_si512(other, self);
0380 }
0381
0382 template <class A, class T>
0383 XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
0384 {
0385 using register_type = typename batch_bool<T, A>::register_type;
0386 return register_type(self.data & ~other.data);
0387 }
0388
0389
0390 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0391 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
0392 {
0393 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0394 {
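// The 8-bit shift is emulated with a 32-bit shift; bits that crossed into the
// neighbouring byte are then cleared with the 0xFF << other mask below.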
0395 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
0396 __m512i tmp = _mm512_sllv_epi32(self, _mm512_set1_epi32(other));
0397 #else
0398 __m512i tmp = _mm512_slli_epi32(self, other);
0399 #endif
0400 return _mm512_and_si512(_mm512_set1_epi8(0xFF << other), tmp);
0401 }
0402 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0403 {
0404 return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
0405 { return bitwise_lshift(batch<T, avx2>(s), o, avx2 {}); },
0406 self, other);
0407 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
0408 }
0409 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0410 {
0411 return _mm512_sllv_epi32(self, _mm512_set1_epi32(other));
0412 }
0413 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0414 {
0415 return _mm512_sllv_epi64(self, _mm512_set1_epi64(other));
0416 #else
0417 }
0418 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0419 {
0420 return _mm512_slli_epi32(self, other);
0421 }
0422 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0423 {
0424 return _mm512_slli_epi64(self, other);
0425 #endif
0426 }
0427 else
0428 {
0429 assert(false && "unsupported arch/op combination");
0430 return {};
0431 }
0432 }
0433
0434
0435 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0436 XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx512f>) noexcept
0437 {
0438 return _mm512_xor_si512(self, _mm512_set1_epi32(-1));
0439 }
0440 template <class A, class T>
0441 XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
0442 {
0443 using register_type = typename batch_bool<T, A>::register_type;
0444 return register_type(~self.data);
0445 }
0446
0447 template <class A>
0448 XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
0449 {
0450 return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_set1_epi32(-1)));
0451 }
0452 template <class A>
0453 XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
0454 {
0455 return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_set1_epi32(-1)));
0456 }
0457
0458
0459 template <class A>
0460 XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0461 {
0462 return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
0463 }
0464 template <class A>
0465 XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0466 {
0467 return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
0468 }
0469
0470 template <class A, class T>
0471 XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
0472 {
0473 using register_type = typename batch_bool<T, A>::register_type;
0474 return register_type(self.data | other.data);
0475 }
0476
0477 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0478 XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0479 {
0480 return _mm512_or_si512(self, other);
0481 }
0482
0483
0484 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0485 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
0486 {
0487 if (std::is_signed<T>::value)
0488 {
0489 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
0490 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0491 {
0492 return _mm512_srav_epi32(self, _mm512_set1_epi32(other));
0493 }
0494 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0495 {
0496 return _mm512_srav_epi64(self, _mm512_set1_epi64(other));
0497 #else
0498 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0499 {
0500 return _mm512_srai_epi32(self, other);
0501 }
0502 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0503 {
0504 return _mm512_srai_epi64(self, other);
0505 #endif
0506 }
0507 else
0508 {
0509 return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
0510 { return bitwise_rshift(batch<T, avx2>(s), o, avx2 {}); },
0511 self, other);
0512 }
0513 }
0514 else
0515 {
0516 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0517 {
0518 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
0519 __m512i tmp = _mm512_srlv_epi32(self, _mm512_set1_epi32(other));
0520 #else
0521 __m512i tmp = _mm512_srli_epi32(self, other);
0522 #endif
0523 return _mm512_and_si512(_mm512_set1_epi8(0xFF >> other), tmp);
0524 #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
0525 }
0526 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0527 {
0528 return _mm512_srlv_epi32(self, _mm512_set1_epi32(other));
0529 }
0530 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0531 {
0532 return _mm512_srlv_epi64(self, _mm512_set1_epi64(other));
0533 #else
0534 }
0535 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0536 {
0537 return _mm512_srli_epi32(self, other);
0538 }
0539 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0540 {
0541 return _mm512_srli_epi64(self, other);
0542 #endif
0543 }
0544 else
0545 {
0546 return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
0547 { return bitwise_rshift(batch<T, avx2>(s), o, avx2 {}); },
0548 self, other);
0549 }
0550 }
0551 }
0552
0553
0554 template <class A>
0555 XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0556 {
0557 return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
0558 }
0559 template <class A>
0560 XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0561 {
0562 return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
0563 }
0564
0565 template <class A, class T>
0566 XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
0567 {
0568 using register_type = typename batch_bool<T, A>::register_type;
0569 return register_type(self.data ^ other.data);
0570 }
0571
0572 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0573 XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0574 {
0575 return _mm512_xor_si512(self, other);
0576 }
0577
0578
0579 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0580 XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
0581 {
0582 return _mm512_castsi512_ps(self);
0583 }
0584 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0585 XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
0586 {
0587 return _mm512_castsi512_pd(self);
0588 }
0589 template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
0590 XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx512f>) noexcept
0591 {
0592 return batch<Tp, A>(self.data);
0593 }
0594 template <class A>
0595 XSIMD_INLINE batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
0596 {
0597 return _mm512_castps_pd(self);
0598 }
0599 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0600 XSIMD_INLINE batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
0601 {
0602 return _mm512_castps_si512(self);
0603 }
0604 template <class A>
0605 XSIMD_INLINE batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
0606 {
0607 return _mm512_castpd_ps(self);
0608 }
0609 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0610 XSIMD_INLINE batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
0611 {
0612 return _mm512_castpd_si512(self);
0613 }
0614
0615
0616 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0617 XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx512f>) noexcept
0618 {
0619 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0620 {
0621 return _mm512_set1_epi8(val);
0622 }
0623 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0624 {
0625 return _mm512_set1_epi16(val);
0626 }
0627 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0628 {
0629 return _mm512_set1_epi32(val);
0630 }
0631 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0632 {
0633 return _mm512_set1_epi64(val);
0634 }
0635 else
0636 {
0637 assert(false && "unsupported");
0638 return {};
0639 }
0640 }
0641 template <class A>
0642 XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<avx512f>) noexcept
0643 {
0644 return _mm512_set1_ps(val);
0645 }
0646 template <class A>
0647 XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<avx512f>) noexcept
0648 {
0649 return _mm512_set1_pd(val);
0650 }
0651
0652
0653 template <class A>
0654 XSIMD_INLINE batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx512f>) noexcept
0655 {
0656 return _mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF);
0657 }
0658 template <class A>
0659 XSIMD_INLINE batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx512f>) noexcept
0660 {
0661 return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF);
0662 }
0663
0664
0665 template <class A>
0666 XSIMD_INLINE batch<float, A> compress(batch<float, A> const& self, batch_bool<float, A> const& mask, requires_arch<avx512f>) noexcept
0667 {
0668 return _mm512_maskz_compress_ps(mask.mask(), self);
0669 }
0670 template <class A>
0671 XSIMD_INLINE batch<double, A> compress(batch<double, A> const& self, batch_bool<double, A> const& mask, requires_arch<avx512f>) noexcept
0672 {
0673 return _mm512_maskz_compress_pd(mask.mask(), self);
0674 }
0675 template <class A>
0676 XSIMD_INLINE batch<int32_t, A> compress(batch<int32_t, A> const& self, batch_bool<int32_t, A> const& mask, requires_arch<avx512f>) noexcept
0677 {
0678 return _mm512_maskz_compress_epi32(mask.mask(), self);
0679 }
0680 template <class A>
0681 XSIMD_INLINE batch<uint32_t, A> compress(batch<uint32_t, A> const& self, batch_bool<uint32_t, A> const& mask, requires_arch<avx512f>) noexcept
0682 {
0683 return _mm512_maskz_compress_epi32(mask.mask(), self);
0684 }
0685 template <class A>
0686 XSIMD_INLINE batch<int64_t, A> compress(batch<int64_t, A> const& self, batch_bool<int64_t, A> const& mask, requires_arch<avx512f>) noexcept
0687 {
0688 return _mm512_maskz_compress_epi64(mask.mask(), self);
0689 }
0690 template <class A>
0691 XSIMD_INLINE batch<uint64_t, A> compress(batch<uint64_t, A> const& self, batch_bool<uint64_t, A> const& mask, requires_arch<avx512f>) noexcept
0692 {
0693 return _mm512_maskz_compress_epi64(mask.mask(), self);
0694 }
0695
0696
0697 namespace detail
0698 {
0699 template <class A>
0700 XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
0701 {
0702 return _mm512_cvtepi32_ps(self);
0703 }
0704
0705 template <class A>
0706 XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx512f>) noexcept
0707 {
0708 return _mm512_cvttps_epi32(self);
0709 }
0710
0711 template <class A>
0712 XSIMD_INLINE batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
0713 {
0714 return _mm512_cvtepu32_ps(self);
0715 }
0716
0717 template <class A>
0718 XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<avx512f>) noexcept
0719 {
0720 return _mm512_cvttps_epu32(self);
0721 }
0722 }
0723
0724 namespace detail
0725 {
0726
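// complex_low/complex_high interleave the real and imaginary registers back into
// the (re, im, re, im, ...) memory layout with a single two-source permute per half.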
0727 template <class A>
0728 XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
0729 {
0730 __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
0731 return _mm512_permutex2var_ps(self.real(), idx, self.imag());
0732 }
0733 template <class A>
0734 XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
0735 {
0736 __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11);
0737 return _mm512_permutex2var_pd(self.real(), idx, self.imag());
0738 }
0739
0740
0741 template <class A>
0742 XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
0743 {
0744 __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
0745 return _mm512_permutex2var_ps(self.real(), idx, self.imag());
0746 }
0747 template <class A>
0748 XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
0749 {
0750 __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15);
0751 return _mm512_permutex2var_pd(self.real(), idx, self.imag());
0752 }
0753 }
0754
0755
0756 template <class A>
0757 XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0758 {
0759 return _mm512_div_ps(self, other);
0760 }
0761 template <class A>
0762 XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0763 {
0764 return _mm512_div_pd(self, other);
0765 }
0766
0767
0768 template <class A>
0769 XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0770 {
0771 return _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ);
0772 }
0773 template <class A>
0774 XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0775 {
0776 return _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ);
0777 }
0778
0779 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0780 XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0781 {
0782 return detail::compare_int_avx512f<A, T, _MM_CMPINT_EQ>(self, other);
0783 }
0784 template <class A, class T>
0785 XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
0786 {
0787 using register_type = typename batch_bool<T, A>::register_type;
0788 return register_type(~self.data ^ other.data);
0789 }
0790
0791
0792 template <class A>
0793 XSIMD_INLINE batch<float, A> expand(batch<float, A> const& self, batch_bool<float, A> const& mask, requires_arch<avx512f>) noexcept
0794 {
0795 return _mm512_maskz_expand_ps(mask.mask(), self);
0796 }
0797 template <class A>
0798 XSIMD_INLINE batch<double, A> expand(batch<double, A> const& self, batch_bool<double, A> const& mask, requires_arch<avx512f>) noexcept
0799 {
0800 return _mm512_maskz_expand_pd(mask.mask(), self);
0801 }
0802 template <class A>
0803 XSIMD_INLINE batch<int32_t, A> expand(batch<int32_t, A> const& self, batch_bool<int32_t, A> const& mask, requires_arch<avx512f>) noexcept
0804 {
0805 return _mm512_maskz_expand_epi32(mask.mask(), self);
0806 }
0807 template <class A>
0808 XSIMD_INLINE batch<uint32_t, A> expand(batch<uint32_t, A> const& self, batch_bool<uint32_t, A> const& mask, requires_arch<avx512f>) noexcept
0809 {
0810 return _mm512_maskz_expand_epi32(mask.mask(), self);
0811 }
0812 template <class A>
0813 XSIMD_INLINE batch<int64_t, A> expand(batch<int64_t, A> const& self, batch_bool<int64_t, A> const& mask, requires_arch<avx512f>) noexcept
0814 {
0815 return _mm512_maskz_expand_epi64(mask.mask(), self);
0816 }
0817 template <class A>
0818 XSIMD_INLINE batch<uint64_t, A> expand(batch<uint64_t, A> const& self, batch_bool<uint64_t, A> const& mask, requires_arch<avx512f>) noexcept
0819 {
0820 return _mm512_maskz_expand_epi64(mask.mask(), self);
0821 }
0822
0823
0824 template <class A>
0825 XSIMD_INLINE batch<float, A> floor(batch<float, A> const& self, requires_arch<avx512f>) noexcept
0826 {
0827 return _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF);
0828 }
0829 template <class A>
0830 XSIMD_INLINE batch<double, A> floor(batch<double, A> const& self, requires_arch<avx512f>) noexcept
0831 {
0832 return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF);
0833 }
0834
0835
0836 template <class A>
0837 XSIMD_INLINE batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
0838 {
0839 return _mm512_fnmadd_ps(x, y, z);
0840 }
0841
0842 template <class A>
0843 XSIMD_INLINE batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
0844 {
0845 return _mm512_fnmadd_pd(x, y, z);
0846 }
0847
0848
0849 template <class A>
0850 XSIMD_INLINE batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
0851 {
0852 return _mm512_fmadd_ps(x, y, z);
0853 }
0854
0855 template <class A>
0856 XSIMD_INLINE batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
0857 {
0858 return _mm512_fmadd_pd(x, y, z);
0859 }
0860
0861
0862 template <class A>
0863 XSIMD_INLINE batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
0864 {
0865 return _mm512_fmsub_ps(x, y, z);
0866 }
0867
0868 template <class A>
0869 XSIMD_INLINE batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
0870 {
0871 return _mm512_fmsub_pd(x, y, z);
0872 }
0873
0874
0875 template <class A, class T>
0876 XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
0877 {
0878 return select(self, batch<T, A>(1), batch<T, A>(0));
0879 }
0880
0881
0882 template <class T, class A>
0883 XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx512f>) noexcept
0884 {
0885 return static_cast<typename batch_bool<T, A>::register_type>(mask);
0886 }
0887
0888
0889 template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
0890 XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
0891 kernel::requires_arch<avx512f>) noexcept
0892 {
0893 return _mm512_i32gather_epi32(index, static_cast<const void*>(src), sizeof(T));
0894 }
0895
0896 template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
0897 XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
0898 kernel::requires_arch<avx512f>) noexcept
0899 {
0900 return _mm512_i64gather_epi64(index, static_cast<const void*>(src), sizeof(T));
0901 }
0902
0903 template <class A, class U, detail::enable_sized_integral_t<U, 4> = 0>
0904 XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, float const* src,
0905 batch<U, A> const& index,
0906 kernel::requires_arch<avx512f>) noexcept
0907 {
0908 return _mm512_i32gather_ps(index, src, sizeof(float));
0909 }
0910
0911 template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
0912 XSIMD_INLINE batch<double, A>
0913 gather(batch<double, A> const&, double const* src, batch<U, A> const& index,
0914 kernel::requires_arch<avx512f>) noexcept
0915 {
0916 return _mm512_i64gather_pd(index, src, sizeof(double));
0917 }
0918
0919
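// Gathering 16 doubles through 32-bit indices needs two 8-wide gathers (one per
// half of the index vector); the results are narrowed with _mm512_cvtpd_ps /
// _mm512_cvtpd_epi32 and merged back into a single 512-bit register.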
0920 template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
0921 XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, double const* src,
0922 batch<V, A> const& index,
0923 requires_arch<avx512f>) noexcept
0924 {
0925 const batch<double, A> low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double)));
0926 const batch<double, A> high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double)));
0927 return detail::merge_avx(_mm512_cvtpd_ps(low.data), _mm512_cvtpd_ps(high.data));
0928 }
0929
0930 template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
0931 XSIMD_INLINE batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
0932 batch<V, A> const& index,
0933 requires_arch<avx512f>) noexcept
0934 {
0935 const batch<double, A> low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double)));
0936 const batch<double, A> high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double)));
0937 return detail::merge_avx(_mm512_cvtpd_epi32(low.data), _mm512_cvtpd_epi32(high.data));
0938 }
0939
0940
0941 template <class A>
0942 XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0943 {
0944 return _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ);
0945 }
0946 template <class A>
0947 XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0948 {
0949 return _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ);
0950 }
0951 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0952 XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0953 {
0954 return detail::compare_int_avx512f<A, T, _MM_CMPINT_GE>(self, other);
0955 }
0956
0957
0958 template <class A>
0959 XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
0960 {
0961 return _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ);
0962 }
0963 template <class A>
0964 XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
0965 {
0966 return _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ);
0967 }
0968 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0969 XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
0970 {
0971 return detail::compare_int_avx512f<A, T, _MM_CMPINT_GT>(self, other);
0972 }
0973
0974
0975 template <class A>
0976 XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512f>) noexcept
0977 {
0978
0979
0980
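// haddp: output lane i holds the horizontal sum of row[i]. STEP1 folds pairs of
// rows by adding their lower and upper 256-bit halves; STEP2 keeps folding across
// 128-bit lanes and element pairs and finishes with an AVX2 horizontal add,
// producing one 256-bit register per group of eight input rows.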
0981 #define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
0982 batch<float, avx512f> res##I; \
0983 { \
0984 auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
0985 auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
0986 res##I = _mm512_add_ps(tmp1, tmp2); \
0987 }
0988
0989 XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
0990 XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
0991 XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
0992 XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
0993 XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
0994 XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
0995 XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
0996 XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);
0997
0998 #undef XSIMD_AVX512_HADDP_STEP1
0999
1000
1001
1002
1003
1004 #define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \
1005 batch<float, avx2> halfx##I; \
1006 { \
1007 auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \
1008 auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \
1009 \
1010 auto resx1 = _mm512_add_ps(tmp1, tmp2); \
1011 \
1012 auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
1013 auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
1014 \
1015 auto resx2 = _mm512_add_ps(tmp3, tmp4); \
1016 \
1017 auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
1018 auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
1019 \
1020 auto resx3 = _mm512_add_ps(tmp5, tmp6); \
1021 \
1022 halfx##I = _mm256_hadd_ps(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 0)), _mm512_extractf32x4_ps(resx3, 1), 1), \
1023 _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 2)), _mm512_extractf32x4_ps(resx3, 3), 1)); \
1024 }
1025
1026 XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
1027 XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);
1028
1029 #undef XSIMD_AVX512_HADDP_STEP2
1030
1031 auto concat = _mm512_castps256_ps512(halfx0);
1032 concat = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(concat), _mm256_castps_pd(halfx1), 1));
1033 return concat;
1034 }
1035
1036 template <class A>
1037 XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx512f>) noexcept
1038 {
1039 #define step1(I, a, b) \
1040 batch<double, avx512f> res##I; \
1041 { \
1042 auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
1043 auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
1044 res##I = _mm512_add_pd(tmp1, tmp2); \
1045 }
1046
1047 step1(1, row[0], row[2]);
1048 step1(2, row[4], row[6]);
1049 step1(3, row[1], row[3]);
1050 step1(4, row[5], row[7]);
1051
1052 #undef step1
1053
1054 auto tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0));
1055 auto tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1));
1056
1057 auto resx1 = _mm512_add_pd(tmp5, tmp6);
1058
1059 auto tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0));
1060 auto tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1));
1061
1062 auto resx2 = _mm512_add_pd(tmp7, tmp8);
1063
1064 auto tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000);
1065 auto tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111);
1066
1067 return _mm512_add_pd(tmpx, tmpy);
1068 }
1069
1070
1071 template <class A>
1072 XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx512f>) noexcept
1073 {
1074 return _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q);
1075 }
1076 template <class A>
1077 XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx512f>) noexcept
1078 {
1079 return _mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q);
1080 }
1081
1082
1083 template <class A>
1084 XSIMD_INLINE batch<float, A> ldexp(const batch<float, A>& self, const batch<as_integer_t<float>, A>& other, requires_arch<avx512f>) noexcept
1085 {
1086 return _mm512_scalef_ps(self, _mm512_cvtepi32_ps(other));
1087 }
1088
1089 template <class A>
1090 XSIMD_INLINE batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512f>) noexcept
1091 {
1092
1093
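// AVX512F has no direct int64 -> double conversion (_mm512_cvtepi64_pd is
// AVX512DQ), so the 64-bit exponents are narrowed to 32 bits first; valid double
// exponents fit comfortably in 32 bits.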
1094 __m512d adjusted_index = _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(other));
1095 return _mm512_scalef_pd(self, adjusted_index);
1096 }
1097
1098
1099 template <class A>
1100 XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1101 {
1102 return _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ);
1103 }
1104 template <class A>
1105 XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1106 {
1107 return _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ);
1108 }
1109 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1110 XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1111 {
1112 return detail::compare_int_avx512f<A, T, _MM_CMPINT_LE>(self, other);
1113 }
1114
1115
1116 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1117 XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
1118 {
1119 return _mm512_load_si512((__m512i const*)mem);
1120 }
1121 template <class A>
1122 XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
1123 {
1124 return _mm512_load_ps(mem);
1125 }
1126 template <class A>
1127 XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
1128 {
1129 return _mm512_load_pd(mem);
1130 }
1131
1132
1133 namespace detail
1134 {
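// Deinterleave a (re, im, re, im, ...) sequence spread across two registers into
// separate real and imaginary registers with one two-source permute per component.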
1135 template <class A>
1136 XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx512f>) noexcept
1137 {
1138 __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
1139 __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
1140 auto real = _mm512_permutex2var_ps(hi, real_idx, lo);
1141 auto imag = _mm512_permutex2var_ps(hi, imag_idx, lo);
1142 return { real, imag };
1143 }
1144 template <class A>
1145 XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx512f>) noexcept
1146 {
1147 __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14);
1148 __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15);
1149 auto real = _mm512_permutex2var_pd(hi, real_idx, lo);
1150 auto imag = _mm512_permutex2var_pd(hi, imag_idx, lo);
1151 return { real, imag };
1152 }
1153 }
1154
1155
1156 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1157 XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
1158 {
1159 return _mm512_loadu_si512((__m512i const*)mem);
1160 }
1161 template <class A>
1162 XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
1163 {
1164 return _mm512_loadu_ps(mem);
1165 }
1166 template <class A>
1167 XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
1168 {
1169 return _mm512_loadu_pd(mem);
1170 }
1171
1172
1173 template <class A>
1174 XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1175 {
1176 return _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ);
1177 }
1178 template <class A>
1179 XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1180 {
1181 return _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ);
1182 }
1183
1184 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1185 XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1186 {
1187 return detail::compare_int_avx512f<A, T, _MM_CMPINT_LT>(self, other);
1188 }
1189
1190
1191 template <class A, class T>
1192 XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
1193 {
1194 return self.data;
1195 }
1196
1197
1198 template <class A>
1199 XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1200 {
1201 return _mm512_max_ps(self, other);
1202 }
1203 template <class A>
1204 XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1205 {
1206 return _mm512_max_pd(self, other);
1207 }
1208 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1209 XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1210 {
1211 if (std::is_signed<T>::value)
1212 {
1213 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1214 {
1215 return _mm512_max_epi32(self, other);
1216 }
1217 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1218 {
1219 return _mm512_max_epi64(self, other);
1220 }
1221 else
1222 {
1223 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1224 { return max(batch<T, avx2>(s), batch<T, avx2>(o)); },
1225 self, other);
1226 }
1227 }
1228 else
1229 {
1230 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1231 {
1232 return _mm512_max_epu32(self, other);
1233 }
1234 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1235 {
1236 return _mm512_max_epu64(self, other);
1237 }
1238 else
1239 {
1240 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1241 { return max(batch<T, avx2>(s), batch<T, avx2>(o)); },
1242 self, other);
1243 }
1244 }
1245 }
1246
1247
1248 template <class A>
1249 XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1250 {
1251 return _mm512_min_ps(self, other);
1252 }
1253 template <class A>
1254 XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1255 {
1256 return _mm512_min_pd(self, other);
1257 }
1258 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1259 XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1260 {
1261 if (std::is_signed<T>::value)
1262 {
1263 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1264 {
1265 return _mm512_min_epi32(self, other);
1266 }
1267 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1268 {
1269 return _mm512_min_epi64(self, other);
1270 }
1271 else
1272 {
1273 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1274 { return min(batch<T, avx2>(s), batch<T, avx2>(o)); },
1275 self, other);
1276 }
1277 }
1278 else
1279 {
1280 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1281 {
1282 return _mm512_min_epu32(self, other);
1283 }
1284 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1285 {
1286 return _mm512_min_epu64(self, other);
1287 }
1288 else
1289 {
1290 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1291 { return min(batch<T, avx2>(s), batch<T, avx2>(o)); },
1292 self, other);
1293 }
1294 }
1295 }
1296
1297
1298 template <class A>
1299 XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1300 {
1301 return _mm512_mul_ps(self, other);
1302 }
1303 template <class A>
1304 XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1305 {
1306 return _mm512_mul_pd(self, other);
1307 }
1308 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1309 XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1310 {
1311 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1312 {
1313 return _mm512_mullo_epi32(self, other);
1314 }
1315 else
1316 {
1317 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1318 { return mul(batch<T, avx2>(s), batch<T, avx2>(o)); },
1319 self, other);
1320 }
1321 }
1322
1323
1324 template <class A>
1325 XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx512f>) noexcept
1326 {
1327 return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
1328 }
1329 template <class A>
1330 XSIMD_INLINE batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx512f>) noexcept
1331 {
1332 return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
1333 }
1334
1335
1336 template <class A>
1337 XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
1338 requires_arch<avx512f>) noexcept
1339 {
1340 return _mm512_cvtps_epi32(self);
1341 }
1342
1343
1344 template <class A, class T>
1345 XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1346 {
1347 return 0 - self;
1348 }
1349
1350
1351 template <class A>
1352 XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1353 {
1354 return _mm512_cmp_ps_mask(self, other, _CMP_NEQ_UQ);
1355 }
1356 template <class A>
1357 XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1358 {
1359 return _mm512_cmp_pd_mask(self, other, _CMP_NEQ_UQ);
1360 }
1361 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1362 XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1363 {
1364 return ~(self == other);
1365 }
1366
1367 template <class A, class T>
1368 XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
1369 {
1370 using register_type = typename batch_bool<T, A>::register_type;
1371 return register_type(self.data ^ other.data);
1372 }
1373
1374
1375 template <class A>
1376 XSIMD_INLINE batch<float, A>
1377 reciprocal(batch<float, A> const& self,
1378 kernel::requires_arch<avx512f>) noexcept
1379 {
1380 return _mm512_rcp14_ps(self);
1381 }
1382
1383 template <class A>
1384 XSIMD_INLINE batch<double, A>
1385 reciprocal(batch<double, A> const& self,
1386 kernel::requires_arch<avx512f>) noexcept
1387 {
1388 return _mm512_rcp14_pd(self);
1389 }
1390
1391
1392 template <class A>
1393 XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
1394 {
1395 __m128 tmp1 = _mm512_extractf32x4_ps(rhs, 0);
1396 __m128 tmp2 = _mm512_extractf32x4_ps(rhs, 1);
1397 __m128 tmp3 = _mm512_extractf32x4_ps(rhs, 2);
1398 __m128 tmp4 = _mm512_extractf32x4_ps(rhs, 3);
1399 __m128 res1 = _mm_add_ps(tmp1, tmp2);
1400 __m128 res2 = _mm_add_ps(tmp3, tmp4);
1401 __m128 res3 = _mm_add_ps(res1, res2);
1402 return reduce_add(batch<float, sse4_2>(res3), sse4_2 {});
1403 }
1404 template <class A>
1405 XSIMD_INLINE double reduce_add(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
1406 {
1407 __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1);
1408 __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0);
1409 __m256d res1 = _mm256_add_pd(tmp1, tmp2);
1410 return reduce_add(batch<double, avx2>(res1), avx2 {});
1411 }
1412 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1413 XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1414 {
1415 __m256i low, high;
1416 detail::split_avx512(self, low, high);
1417 batch<T, avx2> blow(low), bhigh(high);
1418 return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {});
1419 }
1420
1421
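// Byte-wise reduction: permute the upper four 64-bit lanes down, take the
// element-wise max/min against the original register, then finish the reduction
// on the low 256 bits with the AVX2 kernel.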
1422 template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
1423 XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1424 {
1425 constexpr batch_constant<uint64_t, A, 4, 5, 6, 7, 0, 0, 0, 0> mask;
1426 batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self);
1427 batch<T, A> acc = max(self, step);
1428 __m256i low = _mm512_castsi512_si256(acc);
1429 return reduce_max(batch<T, avx2>(low));
1430 }
1431
1432
1433 template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
1434 XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<avx512f>) noexcept
1435 {
1436 constexpr batch_constant<uint64_t, A, 4, 5, 6, 7, 0, 0, 0, 0> mask;
1437 batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self);
1438 batch<T, A> acc = min(self, step);
1439 __m256i low = _mm512_castsi512_si256(acc);
1440 return reduce_min(batch<T, avx2>(low));
1441 }
1442
1443
1444 template <class A>
1445 XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
1446 {
1447 return _mm512_rsqrt14_ps(val);
1448 }
1449 template <class A>
1450 XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
1451 {
1452 return _mm512_rsqrt14_pd(val);
1453 }
1454
1455
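// Saturated add: for signed T, pick the branch that cannot overflow by clamping
// self against numeric_limits before adding; for unsigned T, add at most the
// headroom left below the maximum value.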
1456 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1457 XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1458 {
1459 if (std::is_signed<T>::value)
1460 {
1461 auto mask = other < 0;
1462 auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
1463 auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
1464 return other + select(mask, self_neg_branch, self_pos_branch);
1465 }
1466 else
1467 {
1468 const auto diffmax = std::numeric_limits<T>::max() - self;
1469 const auto mindiff = min(diffmax, other);
1470 return self + mindiff;
1471 }
1472 }
1473
1474
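// scatter
// The i32/i64 scatter intrinsics address memory as base + index * scale, so
// passing sizeof(T) as the scale lets `index` hold element indices rather than
// byte offsets. Hedged usage sketch (the front-end call is an assumption):
//   alignas(64) float out[256] = {};
//   auto idx = xsimd::batch<int32_t, xsimd::avx512f> { /* 16 distinct indices */ };
//   values.scatter(out, idx);   // values: batch<float, avx512f>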
1475 template <class A, class T,
1476 class = typename std::enable_if<std::is_same<uint32_t, T>::value || std::is_same<int32_t, T>::value, void>::type>
1477 XSIMD_INLINE void scatter(batch<T, A> const& src, T* dst,
1478 batch<int32_t, A> const& index,
1479 kernel::requires_arch<avx512f>) noexcept
1480 {
1481 _mm512_i32scatter_epi32(dst, index, src, sizeof(T));
1482 }
1483
1484 template <class A, class T,
1485 class = typename std::enable_if<std::is_same<uint64_t, T>::value || std::is_same<int64_t, T>::value, void>::type>
1486 XSIMD_INLINE void scatter(batch<T, A> const& src, T* dst,
1487 batch<int64_t, A> const& index,
1488 kernel::requires_arch<avx512f>) noexcept
1489 {
1490 _mm512_i64scatter_epi64(dst, index, src, sizeof(T));
1491 }
1492
1493 template <class A>
1494 XSIMD_INLINE void scatter(batch<float, A> const& src, float* dst,
1495 batch<int32_t, A> const& index,
1496 kernel::requires_arch<avx512f>) noexcept
1497 {
1498 _mm512_i32scatter_ps(dst, index, src, sizeof(float));
1499 }
1500
1501 template <class A>
1502 XSIMD_INLINE void scatter(batch<double, A> const& src, double* dst,
1503 batch<int64_t, A> const& index,
1504 kernel::requires_arch<avx512f>) noexcept
1505 {
1506 _mm512_i64scatter_pd(dst, index, src, sizeof(double));
1507 }
1508
1509
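// select
// AVX512F only provides lane-wise blends for 32- and 64-bit elements
// (_mm512_mask_blend_epi32/_epi64 and the ps/pd forms); byte and word blends
// require AVX512BW. For sizeof(T) of 1 or 2 the k-mask is therefore expanded
// into two 256-bit vector masks and each half is blended by the AVX2 kernel
// before being merged back into a single __m512i.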
1510 template <class A>
1511 XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx512f>) noexcept
1512 {
1513 return _mm512_mask_blend_ps(cond, false_br, true_br);
1514 }
1515 template <class A>
1516 XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx512f>) noexcept
1517 {
1518 return _mm512_mask_blend_pd(cond, false_br, true_br);
1519 }
1520
1521 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1522 XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
1523 {
1524 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1525 {
1526 alignas(avx2::alignment()) uint8_t buffer[64];
1527
1528 for (int i = 0; i < 64; ++i)
1529 buffer[i] = cond.data & (1ull << i) ? 0xFF : 0;
1530 __m256i cond_low = batch<uint8_t, avx2>::load_aligned(&buffer[0]);
1531 __m256i cond_hi = batch<uint8_t, avx2>::load_aligned(&buffer[32]);
1532
1533 __m256i true_low, true_hi;
1534 detail::split_avx512(true_br, true_low, true_hi);
1535
1536 __m256i false_low, false_hi;
1537 detail::split_avx512(false_br, false_low, false_hi);
1538
1539 __m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
1540 __m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
1541 return detail::merge_avx(res_low, res_hi);
1542 }
1543 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1544 {
1545 __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0));
1546 __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0));
1547
1548 __m256i true_low, true_hi;
1549 detail::split_avx512(true_br, true_low, true_hi);
1550
1551 __m256i false_low, false_hi;
1552 detail::split_avx512(false_br, false_low, false_hi);
1553
1554 __m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
1555 __m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
1556 return detail::merge_avx(res_low, res_hi);
1557 }
1558 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1559 {
1560 return _mm512_mask_blend_epi32(cond, false_br, true_br);
1561 }
1562 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1563 {
1564 return _mm512_mask_blend_epi64(cond, false_br, true_br);
1565 }
1566 else
1567 {
1568 assert(false && "unsupported arch/type combination");
1569 return {};
1570 }
1571 }
1572
1573 template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1574 XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
1575 {
1576 return select(batch_bool<T, A> { Values... }, true_br, false_br, avx512f {});
1577 }
1578
1579 namespace detail
1580 {
1581 template <class T>
1582 using enable_signed_integer_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value,
1583 int>::type;
1584
1585 template <class T>
1586 using enable_unsigned_integer_t = typename std::enable_if<std::is_integral<T>::value && std::is_unsigned<T>::value,
1587 int>::type;
1588 }
1589
1590
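// set
// Argument order matters here: the _mm512_set_* intrinsics take the highest
// lane first, while _mm512_setr_* and the GNU vector literals used in the
// __clang__ / __GNUC__ branches take lane 0 first. Every overload below is
// expected to place v0 in lane 0, so e.g.
//   batch<int16_t, A> b(v0, v1, ..., v31);   // b.get(0) == v0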
1591 template <class A>
1592 XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<avx512f>, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) noexcept
1593 {
1594 return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
1595 }
1596
1597 template <class A>
1598 XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<avx512f>, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) noexcept
1599 {
1600 return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7);
1601 }
1602 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1603 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
1604 {
1605 return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0);
1606 }
1607 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1608 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1609 T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
1610 {
1611 return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
1612 }
1613 template <class A, class T, detail::enable_signed_integer_t<T> = 0>
1614 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1615 T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1616 T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
1617 T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
1618 {
1619 #if defined(__clang__) || __GNUC__
1620 return __extension__(__m512i)(__v32hi) {
1621 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
1622 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
1623 };
1624 #else
1625 return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
1626 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
1627 #endif
1628 }
1629
1630 template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
1631 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1632 T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1633 T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
1634 T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
1635 {
1636 #if defined(__clang__) || __GNUC__
1637 return __extension__(__m512i)(__v32hu) {
1638 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
1639 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
1640 };
1641 #else
1642 return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
1643 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
1644 #endif
1645 }
1646
1647 template <class A, class T, detail::enable_signed_integer_t<T> = 0>
1648 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1649 T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1650 T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
1651 T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
1652 T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
1653 T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
1654 T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
1655 T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
1656 {
1657
1658 #if defined(__clang__) || __GNUC__
1659 return __extension__(__m512i)(__v64qi) {
1660 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
1661 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
1662 v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
1663 v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
1664 };
1665 #else
1666 return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
1667 v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
1668 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
1669 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
1670 #endif
1671 }
1672 template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
1673 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
1674 T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1675 T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
1676 T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
1677 T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
1678 T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
1679 T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
1680 T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
1681 {
1682
1683 #if defined(__clang__) || __GNUC__
1684 return __extension__(__m512i)(__v64qu) {
1685 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
1686 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
1687 v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
1688 v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
1689 };
1690 #else
1691 return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48,
1692 v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32,
1693 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16,
1694 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0);
1695 #endif
1696 }
1697
1698 template <class A, class T, class... Values>
1699 XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx512f>, Values... values) noexcept
1700 {
1701 static_assert(sizeof...(Values) == batch_bool<T, A>::size, "consistent init");
1702 using register_type = typename batch_bool<T, A>::register_type;
1703 register_type r = 0;
1704 unsigned shift = 0;
1705 (void)std::initializer_list<register_type> { (r |= register_type(values ? 1 : 0) << (shift++))... };
1706 return r;
1707 }
1708
1709
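// shuffle
// _mm512_shuffle_ps/_pd can only mix x and y within each 128-bit lane, so the
// constant index pack is pattern-matched first: when every lane requests the
// same (x low pair, y high pair) arrangement, or the mirrored one, a single
// shuffle instruction is emitted; any other pattern falls back to the generic
// kernel.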
1710 template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7, ITy I8, ITy I9, ITy I10, ITy I11, ITy I12, ITy I13, ITy I14, ITy I15>
1711 XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y,
1712 batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15> mask,
1713 requires_arch<avx512f>) noexcept
1714 {
1715 constexpr uint32_t smask = (I0 & 0x3) | ((I1 & 0x3) << 2) | ((I2 & 0x3) << 4) | ((I3 & 0x3) << 6);
1716
1717 // every 128-bit lane takes its low pair from x and its high pair from y
1718 if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I0 < 4 && I1 < 4 && I2 >= 16 && I2 < 20 && I3 >= 16 && I3 < 20)
1719 return _mm512_shuffle_ps(x, y, smask);
1720
1721 // same pattern with the roles of x and y swapped
1722 if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I2 < 4 && I3 < 4 && I0 >= 16 && I0 < 20 && I1 >= 16 && I1 < 20)
1723 return _mm512_shuffle_ps(y, x, smask);
1724
1725 return shuffle(x, y, mask, generic {});
1726 }
1727
1728 template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
1729 XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx512f>) noexcept
1730 {
1731 constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3) | ((I4 & 0x1) << 4) | ((I5 & 0x1) << 5) | ((I6 & 0x1) << 6) | ((I7 & 0x1) << 7);
1732 // within each 128-bit lane, even result elements come from x, odd ones from y
1733 if (I0 < 2 && I1 >= 8 && I1 < 10 && I2 >= 2 && I2 < 4 && I3 >= 10 && I3 < 12 && I4 >= 4 && I4 < 6 && I5 >= 12 && I5 < 14 && I6 >= 6 && I6 < 8 && I7 >= 14)
1734 return _mm512_shuffle_pd(x, y, smask);
1735
1736 // same pattern with the roles of x and y swapped
1737 if (I1 < 2 && I0 >= 8 && I0 < 10 && I3 >= 2 && I3 < 4 && I2 >= 10 && I2 < 12 && I5 >= 4 && I5 < 6 && I4 >= 12 && I4 < 14 && I7 >= 6 && I7 < 8 && I6 >= 14)
1738 return _mm512_shuffle_pd(y, x, smask);
1739
1740 return shuffle(x, y, mask, generic {});
1741 }
1742
1743
1744 template <size_t N, class A, class T>
1745 XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const&, requires_arch<avx512f>) noexcept
1746 {
1747 static_assert(N == 0xDEAD, "not implemented yet");
1748 return {};
1749 }
1750
1751
1752 template <size_t N, class A, class T>
1753 XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const&, requires_arch<avx512f>) noexcept
1754 {
1755 static_assert(N == 0xDEAD, "not implemented yet");
1756 return {};
1757 }
1758
1759
1760 template <class A>
1761 XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
1762 {
1763 return _mm512_sqrt_ps(val);
1764 }
1765 template <class A>
1766 XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
1767 {
1768 return _mm512_sqrt_pd(val);
1769 }
1770
1771
1772 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1773 XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1774 {
1775 if (std::is_signed<T>::value)
1776 {
1777 return sadd(self, -other);
1778 }
1779 else
1780 {
1781 const auto diff = min(self, other);
1782 return self - diff;
1783 }
1784 }
1785
1786
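// store (batch_bool)
// An AVX512 batch_bool is a k-mask holding one bit per element, so the bool
// array is filled by testing each bit of self.data in turn. Hedged round-trip
// sketch (assuming the batch_bool store front-end forwards here):
//   bool flags[xsimd::batch_bool<float, xsimd::avx512f>::size];
//   mask.store_unaligned(flags);   // bit i of the k-mask becomes flags[i]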
1787 template <class T, class A>
1788 XSIMD_INLINE void store(batch_bool<T, A> const& self, bool* mem, requires_arch<avx512f>) noexcept
1789 {
1790 using register_type = typename batch_bool<T, A>::register_type;
1791 constexpr auto size = batch_bool<T, A>::size;
1792 for (std::size_t i = 0; i < size; ++i)
1793 mem[i] = self.data & (register_type(1) << i);
1794 }
1795
1796
1797 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1798 XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
1799 {
1800 return _mm512_store_si512((__m512i*)mem, self);
1801 }
1802 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1803 XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
1804 {
1805 return _mm512_store_si512((__m512i*)mem, self);
1806 }
1807 template <class A>
1808 XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
1809 {
1810 return _mm512_store_ps(mem, self);
1811 }
1812 template <class A>
1813 XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
1814 {
1815 return _mm512_store_pd(mem, self);
1816 }
1817
1818
1819 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1820 XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
1821 {
1822 return _mm512_storeu_si512((__m512i*)mem, self);
1823 }
1824 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1825 XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
1826 {
1827 return _mm512_storeu_si512((__m512i*)mem, self);
1828 }
1829 template <class A>
1830 XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
1831 {
1832 return _mm512_storeu_ps(mem, self);
1833 }
1834 template <class A>
1835 XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
1836 {
1837 return _mm512_storeu_pd(mem, self);
1838 }
1839
1840
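// sub
// AVX512F has no 8- or 16-bit arithmetic (that arrives with AVX512BW), so for
// sizeof(T) < 4 the operands are split into two __m256i halves via
// detail::split_avx512, the AVX2 kernel handles each half, and the results are
// merged back with detail::merge_avx. 32- and 64-bit lanes map directly onto
// _mm512_sub_epi32/_epi64.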
1841 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1842 XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
1843 {
1844 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1845 {
1846 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1847 { return sub(batch<T, avx2>(s), batch<T, avx2>(o)); },
1848 self, other);
1849 }
1850 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1851 {
1852 return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
1853 { return sub(batch<T, avx2>(s), batch<T, avx2>(o)); },
1854 self, other);
1855 }
1856 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1857 {
1858 return _mm512_sub_epi32(self, other);
1859 }
1860 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1861 {
1862 return _mm512_sub_epi64(self, other);
1863 }
1864 else
1865 {
1866 assert(false && "unsupported arch/op combination");
1867 return {};
1868 }
1869 }
1870 template <class A>
1871 XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
1872 {
1873 return _mm512_sub_ps(self, other);
1874 }
1875 template <class A>
1876 XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
1877 {
1878 return _mm512_sub_pd(self, other);
1879 }
1880
1881
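// swizzle (dynamic mask)
// _mm512_permutexvar_* performs a full cross-lane permutation:
// result[i] = self[mask[i] % size]. The signed-integer overloads reuse the
// unsigned implementation through bitwise_cast, since the permutation itself
// does not depend on signedness.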
1882 template <class A>
1883 XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
1884 {
1885 return _mm512_permutexvar_ps(mask, self);
1886 }
1887
1888 template <class A>
1889 XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
1890 {
1891 return _mm512_permutexvar_pd(mask, self);
1892 }
1893
1894 template <class A>
1895 XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
1896 {
1897 return _mm512_permutexvar_epi64(mask, self);
1898 }
1899
1900 template <class A>
1901 XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512f>) noexcept
1902 {
1903 return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx512f {}));
1904 }
1905
1906 template <class A>
1907 XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
1908 {
1909 return _mm512_permutexvar_epi32(mask, self);
1910 }
1911
1912 template <class A>
1913 XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512f>) noexcept
1914 {
1915 return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx512f {}));
1916 }
1917
1918
1919 template <class A, uint32_t... Vs>
1920 XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
1921 {
1922 return swizzle(self, mask.as_batch(), avx512f {});
1923 }
1924
1925 template <class A, uint64_t... Vs>
1926 XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
1927 {
1928 return swizzle(self, mask.as_batch(), avx512f {});
1929 }
1930
1931 template <class A, uint64_t... Vs>
1932 XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
1933 {
1934 return swizzle(self, mask.as_batch(), avx512f {});
1935 }
1936
1937 template <class A, uint64_t... Vs>
1938 XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
1939 {
1940 return swizzle(self, mask.as_batch(), avx512f {});
1941 }
1942
1943 template <class A, uint32_t... Vs>
1944 XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
1945 {
1946 return swizzle(self, mask.as_batch(), avx512f {});
1947 }
1948
1949 template <class A, uint32_t... Vs>
1950 XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
1951 {
1952 return swizzle(self, mask.as_batch(), avx512f {});
1953 }
1954
1955 namespace detail
1956 {
1957 template <class T, class A, T... Idx>
1958 struct is_pair_of_contiguous_indices;
1959
1960 template <class T, class A>
1961 struct is_pair_of_contiguous_indices<T, A> : std::true_type
1962 {
1963 };
1964
1965 template <class T, class A, T Idx0, T Idx1, T... Idx>
1966 struct is_pair_of_contiguous_indices<T, A, Idx0, Idx1, Idx...> : std::conditional<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices<T, A, Idx...>, std::false_type>::type
1967 {
1968 };
1969
1970 template <class A, uint16_t I0, uint16_t I1, uint16_t I2, uint16_t I3, uint16_t I4, uint16_t I5, uint16_t I6, uint16_t I7,
1971 uint16_t I8, uint16_t I9, uint16_t I10, uint16_t I11, uint16_t I12, uint16_t I13, uint16_t I14, uint16_t I15,
1972 uint16_t I16, uint16_t I17, uint16_t I18, uint16_t I19, uint16_t I20, uint16_t I21, uint16_t I22, uint16_t I23,
1973 uint16_t I24, uint16_t I25, uint16_t I26, uint16_t I27, uint16_t I28, uint16_t I29, uint16_t I30, uint16_t I31>
1974 struct fold_batch_constant
1975 {
1976 using type = batch_constant<uint32_t, A, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2,
1977 I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
1978 };
1979
1980 }
1981
1982 template <class A, uint16_t... Idx, class _ = typename std::enable_if<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value, void>::type>
1983 XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...>, requires_arch<avx512f>) noexcept
1984 {
1985 constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
1986 return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
1987 }
1988
1989 template <class A>
1990 XSIMD_INLINE batch<uint16_t, A>
1991 swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
1992 {
1993 // Special case for this exact constant mask (1, 1, 0, 1, 0, 1, ...):
1994 // broadcasting dword 0 replicates the 16-bit pair (self[0], self[1]) across
1995 // the whole register, and the store/load round-trip below then patches
1996 // element 0 so that it equals element 1. Slow, but only hit for this mask.
1997 constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
1998 auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
1999
2000 alignas(A::alignment()) uint16_t buffer[32];
2001 _mm512_store_si512((__m512i*)&buffer[0], tmp);
2002 buffer[0] = buffer[1];
2003 return _mm512_load_si512(&buffer[0]);
2004 }
2005
2006 template <class A, uint16_t... Vs>
2007 XSIMD_INLINE batch<int16_t, A>
2008 swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
2009 {
2010 return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512f {}));
2011 }
2012
2013
2014 template <class A>
2015 XSIMD_INLINE batch<float, A>
2016 trunc(batch<float, A> const& self, requires_arch<avx512f>) noexcept
2017 {
2018 return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION);
2019 }
2020 template <class A>
2021 XSIMD_INLINE batch<double, A>
2022 trunc(batch<double, A> const& self, requires_arch<avx512f>) noexcept
2023 {
2024 return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION);
2025 }
2026
2027
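// zip_hi / zip_lo
// _mm512_unpackhi/_unpacklo interleave within each 128-bit lane, so a bare
// unpack does not produce the expected "upper half of self zipped with upper
// half of other" layout. The insert/extract sequences below reorder the four
// 128-bit blocks of the two unpack results accordingly. 8- and 16-bit support
// would need AVX512BW unpacks and is left unimplemented here.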
2028 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
2029 XSIMD_INLINE batch<T, A>
2030 zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
2031 {
2032 __m512i lo, hi;
2033 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
2034 {
2035 assert(false && "not implemented yet");
2036 return {};
2037 }
2038 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
2039 {
2040 assert(false && "not implemented yet");
2041 return {};
2042 }
2043 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
2044 {
2045 lo = _mm512_unpacklo_epi32(self, other);
2046 hi = _mm512_unpackhi_epi32(self, other);
2047 }
2048 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
2049 {
2050 lo = _mm512_unpacklo_epi64(self, other);
2051 hi = _mm512_unpackhi_epi64(self, other);
2052 }
2053 else
2054 {
2055 assert(false && "unsupported arch/op combination");
2056 return {};
2057 }
2058 return _mm512_inserti32x4(
2059 _mm512_inserti32x4(
2060 _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0),
2061 _mm512_extracti32x4_epi32(lo, 3),
2062 2),
2063 _mm512_extracti32x4_epi32(hi, 2),
2064 1);
2065 }
2066 template <class A>
2067 XSIMD_INLINE batch<float, A>
2068 zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
2069 {
2070 auto lo = _mm512_unpacklo_ps(self, other);
2071 auto hi = _mm512_unpackhi_ps(self, other);
2072 return _mm512_insertf32x4(
2073 _mm512_insertf32x4(
2074 _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0),
2075 _mm512_extractf32x4_ps(lo, 3),
2076 2),
2077 _mm512_extractf32x4_ps(hi, 2),
2078 1);
2079 }
2080 template <class A>
2081 XSIMD_INLINE batch<double, A>
2082 zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
2083 {
2084 auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other));
2085 auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other));
2086 return _mm512_castps_pd(_mm512_insertf32x4(
2087 _mm512_insertf32x4(
2088 _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0),
2089 _mm512_extractf32x4_ps(lo, 3),
2090 2),
2091 _mm512_extractf32x4_ps(hi, 2),
2092 1));
2093 }
2094
2095
2096 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
2097 XSIMD_INLINE batch<T, A>
2098 zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
2099 {
2100 __m512i lo, hi;
2101 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
2102 {
2103 assert(false && "not implemented yet");
2104 return {};
2105 }
2106 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
2107 {
2108 assert(false && "not implemented yet");
2109 return {};
2110 }
2111 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
2112 {
2113 lo = _mm512_unpacklo_epi32(self, other);
2114 hi = _mm512_unpackhi_epi32(self, other);
2115 }
2116 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
2117 {
2118 lo = _mm512_unpacklo_epi64(self, other);
2119 hi = _mm512_unpackhi_epi64(self, other);
2120 }
2121 else
2122 {
2123 assert(false && "unsupported arch/op combination");
2124 return {};
2125 }
2126 return _mm512_inserti32x4(
2127 _mm512_inserti32x4(
2128 _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1),
2129 _mm512_extracti32x4_epi32(hi, 1),
2130 3),
2131 _mm512_extracti32x4_epi32(lo, 1),
2132 2);
2133 }
2134 template <class A>
2135 XSIMD_INLINE batch<float, A>
2136 zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
2137 {
2138 auto lo = _mm512_unpacklo_ps(self, other);
2139 auto hi = _mm512_unpackhi_ps(self, other);
2140 return _mm512_insertf32x4(
2141 _mm512_insertf32x4(
2142 _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1),
2143 _mm512_extractf32x4_ps(hi, 1),
2144 3),
2145 _mm512_extractf32x4_ps(lo, 1),
2146 2);
2147 }
2148 template <class A>
2149 XSIMD_INLINE batch<double, A>
2150 zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
2151 {
2152 auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other));
2153 auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other));
2154 return _mm512_castps_pd(_mm512_insertf32x4(
2155 _mm512_insertf32x4(
2156 _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1),
2157 _mm512_extractf32x4_ps(hi, 1),
2158 3),
2159 _mm512_extractf32x4_ps(lo, 1),
2160 2));
2161 }
2162
2163 }
2164
2165 }
2166
2167 #endif