0012 #ifndef XSIMD_AVX_HPP
0013 #define XSIMD_AVX_HPP
0014
0015 #include <complex>
0016 #include <limits>
0017 #include <type_traits>
0018
0019 #include "../types/xsimd_avx_register.hpp"
0020
0021 namespace xsimd
0022 {
0023
0024 namespace kernel
0025 {
0026 using namespace types;
0027
0028
0029 template <class A, class T, size_t I>
0030 XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
0031
0032 namespace detail
0033 {
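// AVX (as opposed to AVX2) offers very few 256-bit integer instructions.
// The helpers below split a 256-bit register into its two 128-bit halves,
// run the SSE4.2 kernel on each half, and merge the results back; this is
// how the integer operations in this file are implemented.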
0034 XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
0035 {
0036 low = _mm256_castsi256_si128(val);
0037 high = _mm256_extractf128_si256(val, 1);
0038 }
0039 XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept
0040 {
0041 low = _mm256_castps256_ps128(val);
0042 high = _mm256_extractf128_ps(val, 1);
0043 }
0044 XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
0045 {
0046 low = _mm256_castpd256_pd128(val);
0047 high = _mm256_extractf128_pd(val, 1);
0048 }
0049 XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept
0050 {
0051 return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1);
0052 }
0053 XSIMD_INLINE __m256 merge_sse(__m128 low, __m128 high) noexcept
0054 {
0055 return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1);
0056 }
0057 XSIMD_INLINE __m256d merge_sse(__m128d low, __m128d high) noexcept
0058 {
0059 return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1);
0060 }
0061 template <class F>
0062 XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept
0063 {
0064 __m128i self_low, self_high;
0065 split_avx(self, self_low, self_high);
0066 __m128i res_low = f(self_low);
0067 __m128i res_high = f(self_high);
0068 return merge_sse(res_low, res_high);
0069 }
0070 template <class F>
0071 XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept
0072 {
0073 __m128i self_low, self_high, other_low, other_high;
0074 split_avx(self, self_low, self_high);
0075 split_avx(other, other_low, other_high);
0076 __m128i res_low = f(self_low, other_low);
0077 __m128i res_high = f(self_high, other_high);
0078 return merge_sse(res_low, res_high);
0079 }
0080 template <class F>
0081 XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept
0082 {
0083 __m128i self_low, self_high;
0084 split_avx(self, self_low, self_high);
0085 __m128i res_low = f(self_low, other);
0086 __m128i res_high = f(self_high, other);
0087 return merge_sse(res_low, res_high);
0088 }
0089 }
0090
0091
0092 template <class A>
0093 XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<avx>) noexcept
0094 {
0095 __m256 sign_mask = _mm256_set1_ps(-0.f);
0096 return _mm256_andnot_ps(sign_mask, self);
0097 }
0098 template <class A>
0099 XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<avx>) noexcept
0100 {
__m256d sign_mask = _mm256_set1_pd(-0.); // -0. has only the sign bit set
0102 return _mm256_andnot_pd(sign_mask, self);
0103 }
0104
0105
0106 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0107 XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0108 {
0109 return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0110 { return add(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0111 self, other);
0112 }
0113 template <class A>
0114 XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0115 {
0116 return _mm256_add_ps(self, other);
0117 }
0118 template <class A>
0119 XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0120 {
0121 return _mm256_add_pd(self, other);
0122 }
0123
0124
0125 template <class A>
0126 XSIMD_INLINE bool all(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
0127 {
0128 return _mm256_testc_ps(self, batch_bool<float, A>(true)) != 0;
0129 }
0130 template <class A>
0131 XSIMD_INLINE bool all(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
0132 {
0133 return _mm256_testc_pd(self, batch_bool<double, A>(true)) != 0;
0134 }
0135 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0136 XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
0137 {
0138 return _mm256_testc_si256(self, batch_bool<T, A>(true)) != 0;
0139 }
0140
0141
0142 template <class A>
0143 XSIMD_INLINE bool any(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
0144 {
0145 return !_mm256_testz_ps(self, self);
0146 }
0147 template <class A>
0148 XSIMD_INLINE bool any(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
0149 {
0150 return !_mm256_testz_pd(self, self);
0151 }
0152 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0153 XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
0154 {
0155 return !_mm256_testz_si256(self, self);
0156 }
0157
0158
0159 template <class A, class T_out, class T_in>
0160 XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx>) noexcept
0161 {
0162 return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
0163 }
0164
0165
0166 template <class A>
0167 XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0168 {
0169 return _mm256_and_ps(self, other);
0170 }
0171 template <class A>
0172 XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0173 {
0174 return _mm256_and_pd(self, other);
0175 }
0176
0177 template <class A>
0178 XSIMD_INLINE batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
0179 {
0180 return _mm256_and_ps(self, other);
0181 }
0182 template <class A>
0183 XSIMD_INLINE batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
0184 {
0185 return _mm256_and_pd(self, other);
0186 }
0187
0188 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0189 XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0190 {
0191 return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0192 { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0193 self, other);
0194 }
0195 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0196 XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
0197 {
0198 return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0199 { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0200 self, other);
0201 }
0202
0203
0204 template <class A>
0205 XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0206 {
0207 return _mm256_andnot_ps(other, self);
0208 }
0209 template <class A>
0210 XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0211 {
0212 return _mm256_andnot_pd(other, self);
0213 }
0214
0215 template <class A>
0216 XSIMD_INLINE batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
0217 {
0218 return _mm256_andnot_ps(other, self);
0219 }
0220 template <class A>
0221 XSIMD_INLINE batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
0222 {
0223 return _mm256_andnot_pd(other, self);
0224 }
0225
0226 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0227 XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0228 {
0229 return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0230 { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0231 self, other);
0232 }
0233 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0234 XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
0235 {
0236 return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0237 { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0238 self, other);
0239 }
0240
0241
0242 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0243 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
0244 {
0245 return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
0246 { return bitwise_lshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
0247 self, other);
0248 }
0249
0250
0251 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0252 XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx>) noexcept
0253 {
0254 return detail::fwd_to_sse([](__m128i s) noexcept
0255 { return bitwise_not(batch<T, sse4_2>(s), sse4_2 {}); },
0256 self);
0257 }
0258 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0259 XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
0260 {
0261 return detail::fwd_to_sse([](__m128i s) noexcept
0262 { return bitwise_not(batch_bool<T, sse4_2>(s), sse4_2 {}); },
0263 self);
0264 }
0265
0266
0267 template <class A>
0268 XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0269 {
0270 return _mm256_or_ps(self, other);
0271 }
0272 template <class A>
0273 XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0274 {
0275 return _mm256_or_pd(self, other);
0276 }
0277 template <class A>
0278 XSIMD_INLINE batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
0279 {
0280 return _mm256_or_ps(self, other);
0281 }
0282 template <class A>
0283 XSIMD_INLINE batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
0284 {
0285 return _mm256_or_pd(self, other);
0286 }
0287 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0288 XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0289 {
0290 return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0291 { return bitwise_or(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0292 self, other);
0293 }
0294 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0295 XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
0296 {
0297 return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0298 { return bitwise_or(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o)); },
0299 self, other);
0300 }
0301
0302
0303 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0304 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
0305 {
0306 return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
0307 { return bitwise_rshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
0308 self, other);
0309 }
0310
0311
0312 template <class A>
0313 XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0314 {
0315 return _mm256_xor_ps(self, other);
0316 }
0317 template <class A>
0318 XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0319 {
0320 return _mm256_xor_pd(self, other);
0321 }
0322 template <class A>
0323 XSIMD_INLINE batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
0324 {
0325 return _mm256_xor_ps(self, other);
0326 }
0327 template <class A>
0328 XSIMD_INLINE batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
0329 {
0330 return _mm256_xor_pd(self, other);
0331 }
0332 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0333 XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0334 {
0335 return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0336 { return bitwise_xor(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
0337 self, other);
0338 }
0339 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
0341 {
0342 return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0343 { return bitwise_xor(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o), sse4_2 {}); },
0344 self, other);
0345 }
0346
0347
0348 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0349 XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
0350 {
0351 return _mm256_castsi256_ps(self);
0352 }
0353 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0354 XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
0355 {
0356 return _mm256_castsi256_pd(self);
0357 }
0358 template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
0359 XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx>) noexcept
0360 {
0361 return batch<Tp, A>(self.data);
0362 }
0363 template <class A>
0364 XSIMD_INLINE batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
0365 {
0366 return _mm256_castps_pd(self);
0367 }
0368 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0369 XSIMD_INLINE batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
0370 {
0371 return _mm256_castps_si256(self);
0372 }
0373 template <class A>
0374 XSIMD_INLINE batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
0375 {
0376 return _mm256_castpd_ps(self);
0377 }
0378 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0379 XSIMD_INLINE batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
0380 {
0381 return _mm256_castpd_si256(self);
0382 }
0383
0384
0385 template <class A>
0386 XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx>) noexcept
0387 {
0388 return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
0389 }
0390 template <class A>
0391 XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx>) noexcept
0392 {
0393 return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
0394 }
0395 template <class A>
0396 XSIMD_INLINE batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
0397 {
0398 return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
0399 }
0400 template <class A>
0401 XSIMD_INLINE batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
0402 {
0403 return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
0404 }
0405
0406
0407 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0408 XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx>) noexcept
0409 {
0410 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0411 {
0412 return _mm256_set1_epi8(val);
0413 }
0414 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0415 {
0416 return _mm256_set1_epi16(val);
0417 }
0418 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0419 {
0420 return _mm256_set1_epi32(val);
0421 }
0422 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0423 {
0424 return _mm256_set1_epi64x(val);
0425 }
0426 else
0427 {
0428 assert(false && "unsupported");
0429 return {};
0430 }
0431 }
0432 template <class A>
0433 XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<avx>) noexcept
0434 {
0435 return _mm256_set1_ps(val);
0436 }
0437 template <class A>
0438 XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<avx>) noexcept
0439 {
0440 return _mm256_set1_pd(val);
0441 }
0442
0443
0444 template <class A>
0445 XSIMD_INLINE batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx>) noexcept
0446 {
0447 return _mm256_ceil_ps(self);
0448 }
0449 template <class A>
0450 XSIMD_INLINE batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx>) noexcept
0451 {
0452 return _mm256_ceil_pd(self);
0453 }
0454
0455 namespace detail
0456 {
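// get_half_complex_f/_d interleave the 128-bit lane selected by `index` of the
// real and imaginary registers into (real, imag) pairs, i.e. one half of the
// packed complex layout.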
0457
0458
0459 template <int index, class B>
0460 XSIMD_INLINE B get_half_complex_f(const B& real, const B& imag) noexcept
0461 {
0462 __m128 tmp0 = _mm256_extractf128_ps(real, index);
0463 __m128 tmp1 = _mm256_extractf128_ps(imag, index);
0464 __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1);
0465 tmp0 = _mm_unpacklo_ps(tmp0, tmp1);
0466 __m256 res = real;
0467 res = _mm256_insertf128_ps(res, tmp0, 0);
0468 res = _mm256_insertf128_ps(res, tmp2, 1);
0469 return res;
0470 }
0471 template <int index, class B>
0472 XSIMD_INLINE B get_half_complex_d(const B& real, const B& imag) noexcept
0473 {
0474 __m128d tmp0 = _mm256_extractf128_pd(real, index);
0475 __m128d tmp1 = _mm256_extractf128_pd(imag, index);
0476 __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1);
0477 tmp0 = _mm_unpacklo_pd(tmp0, tmp1);
0478 __m256d res = real;
0479 res = _mm256_insertf128_pd(res, tmp0, 0);
0480 res = _mm256_insertf128_pd(res, tmp2, 1);
0481 return res;
0482 }
0483
0484
0485 template <class A>
0486 XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
0487 {
0488 return get_half_complex_f<0>(self.real(), self.imag());
0489 }
0490 template <class A>
0491 XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
0492 {
0493 return get_half_complex_d<0>(self.real(), self.imag());
0494 }
0495
0496
0497 template <class A>
0498 XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
0499 {
0500 return get_half_complex_f<1>(self.real(), self.imag());
0501 }
0502 template <class A>
0503 XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
0504 {
0505 return get_half_complex_d<1>(self.real(), self.imag());
0506 }
0507 }
0508
0509
0510 namespace detail
0511 {
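// Conversions with direct AVX support: int32 <-> float
// (_mm256_cvtepi32_ps / _mm256_cvttps_epi32).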
0512 template <class A>
0513 XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
0514 {
0515 return _mm256_cvtepi32_ps(self);
0516 }
0517
0518 template <class A>
0519 XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx>) noexcept
0520 {
0521 return _mm256_cvttps_epi32(self);
0522 }
0523 }
0524
0525
0526 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0527 XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
0528 {
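// A true lane of the mask is all ones, i.e. -1 as an integer, so adding the
// mask decrements exactly the selected lanes.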
0529 return self + batch<T, A>(mask.data);
0530 }
0531
0532
0533 template <class A>
0534 XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0535 {
0536 return _mm256_div_ps(self, other);
0537 }
0538 template <class A>
0539 XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0540 {
0541 return _mm256_div_pd(self, other);
0542 }
0543
0544
0545 template <class A>
0546 XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0547 {
0548 return _mm256_cmp_ps(self, other, _CMP_EQ_OQ);
0549 }
0550 template <class A>
0551 XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0552 {
0553 return _mm256_cmp_pd(self, other, _CMP_EQ_OQ);
0554 }
0555 template <class A>
0556 XSIMD_INLINE batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
0557 {
0558 return ~(self != other);
0559 }
0560 template <class A>
0561 XSIMD_INLINE batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
0562 {
0563 return ~(self != other);
0564 }
0565 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0566 XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0567 {
0568 return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0569 { return eq(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
0570 self, other);
0571 }
0572
0573 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0574 XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
0575 {
0576 return ~(self != other);
0577 }
0578
0579
0580 template <class A>
0581 XSIMD_INLINE batch<float, A> floor(batch<float, A> const& self, requires_arch<avx>) noexcept
0582 {
0583 return _mm256_floor_ps(self);
0584 }
0585 template <class A>
0586 XSIMD_INLINE batch<double, A> floor(batch<double, A> const& self, requires_arch<avx>) noexcept
0587 {
0588 return _mm256_floor_pd(self);
0589 }
0590
0591
0592 template <class A>
0593 XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<avx>) noexcept
0594 {
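// Each pair of mask bits indexes a 64-bit pattern covering two 32-bit lanes.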
0595 alignas(A::alignment()) static const uint64_t lut32[] = {
0596 0x0000000000000000ul,
0597 0x00000000FFFFFFFFul,
0598 0xFFFFFFFF00000000ul,
0599 0xFFFFFFFFFFFFFFFFul,
0600 };
0601 assert(!(mask & ~0xFFul) && "inbound mask");
0602 return _mm256_castsi256_ps(_mm256_setr_epi64x(lut32[mask & 0x3], lut32[(mask >> 2) & 0x3], lut32[(mask >> 4) & 0x3], lut32[mask >> 6]));
0603 }
0604 template <class A>
0605 XSIMD_INLINE batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<avx>) noexcept
0606 {
0607 alignas(A::alignment()) static const uint64_t lut64[][4] = {
0608 { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
0609 { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
0610 { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
0611 { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
0612 { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
0613 { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
0614 { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
0615 { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
0616 { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
0617 { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
0618 { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
0619 { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
0620 { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
0621 { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
0622 { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
0623 { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
0624 };
0625 assert(!(mask & ~0xFul) && "inbound mask");
0626 return _mm256_castsi256_pd(_mm256_load_si256((const __m256i*)lut64[mask]));
0627 }
0628 template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0629 XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx>) noexcept
0630 {
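// Each 4-bit group of the mask expands to four 8-bit lanes (lut32) or four
// 16-bit lanes (lut64) of all ones or all zeros.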
0631 alignas(A::alignment()) static const uint32_t lut32[] = {
0632 0x00000000,
0633 0x000000FF,
0634 0x0000FF00,
0635 0x0000FFFF,
0636 0x00FF0000,
0637 0x00FF00FF,
0638 0x00FFFF00,
0639 0x00FFFFFF,
0640 0xFF000000,
0641 0xFF0000FF,
0642 0xFF00FF00,
0643 0xFF00FFFF,
0644 0xFFFF0000,
0645 0xFFFF00FF,
0646 0xFFFFFF00,
0647 0xFFFFFFFF,
0648 };
0649 alignas(A::alignment()) static const uint64_t lut64[] = {
0650 0x0000000000000000ul,
0651 0x000000000000FFFFul,
0652 0x00000000FFFF0000ul,
0653 0x00000000FFFFFFFFul,
0654 0x0000FFFF00000000ul,
0655 0x0000FFFF0000FFFFul,
0656 0x0000FFFFFFFF0000ul,
0657 0x0000FFFFFFFFFFFFul,
0658 0xFFFF000000000000ul,
0659 0xFFFF00000000FFFFul,
0660 0xFFFF0000FFFF0000ul,
0661 0xFFFF0000FFFFFFFFul,
0662 0xFFFFFFFF00000000ul,
0663 0xFFFFFFFF0000FFFFul,
0664 0xFFFFFFFFFFFF0000ul,
0665 0xFFFFFFFFFFFFFFFFul,
0666 };
0667 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0668 {
0669 assert(!(mask & ~0xFFFFFFFFul) && "inbound mask");
0670 return _mm256_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF],
0671 lut32[(mask >> 8) & 0xF], lut32[(mask >> 12) & 0xF],
0672 lut32[(mask >> 16) & 0xF], lut32[(mask >> 20) & 0xF],
0673 lut32[(mask >> 24) & 0xF], lut32[mask >> 28]);
0674 }
0675 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0676 {
0677 assert(!(mask & ~0xFFFFul) && "inbound mask");
0678 return _mm256_setr_epi64x(lut64[mask & 0xF], lut64[(mask >> 4) & 0xF], lut64[(mask >> 8) & 0xF], lut64[(mask >> 12) & 0xF]);
0679 }
0680 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0681 {
0682 return _mm256_castps_si256(from_mask(batch_bool<float, A> {}, mask, avx {}));
0683 }
0684 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0685 {
0686 return _mm256_castpd_si256(from_mask(batch_bool<double, A> {}, mask, avx {}));
0687 }
0688 }
0689
0690
0691 template <class A>
0692 XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx>) noexcept
0693 {
// tmp0 = (r0[0]+r0[1], r0[2]+r0[3], r1[0]+r1[1], r1[2]+r1[3],
//         r0[4]+r0[5], r0[6]+r0[7], r1[4]+r1[5], r1[6]+r1[7])
__m256 tmp0 = _mm256_hadd_ps(row[0], row[1]);
// tmp1 = same pattern for row[2] and row[3]
__m256 tmp1 = _mm256_hadd_ps(row[2], row[3]);
// tmp1 = (sum(r0[0:4]), sum(r1[0:4]), sum(r2[0:4]), sum(r3[0:4]),
//         sum(r0[4:8]), sum(r1[4:8]), sum(r2[4:8]), sum(r3[4:8]))
tmp1 = _mm256_hadd_ps(tmp0, tmp1);
// same reduction for rows 4 to 7
tmp0 = _mm256_hadd_ps(row[4], row[5]);
__m256 tmp2 = _mm256_hadd_ps(row[6], row[7]);
tmp2 = _mm256_hadd_ps(tmp0, tmp2);
// keep the low-lane partial sums of rows 0-3 and the high-lane partial sums of rows 4-7
tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000);
// gather the complementary partial sums from the other lane
tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21);
return _mm256_add_ps(tmp0, tmp1);
0716 }
0717 template <class A>
0718 XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx>) noexcept
0719 {
// tmp0 = (r0[0]+r0[1], r1[0]+r1[1], r0[2]+r0[3], r1[2]+r1[3])
__m256d tmp0 = _mm256_hadd_pd(row[0], row[1]);
// tmp1 = (r2[0]+r2[1], r3[0]+r3[1], r2[2]+r2[3], r3[2]+r3[3])
__m256d tmp1 = _mm256_hadd_pd(row[2], row[3]);
// tmp2 = (r0[0]+r0[1], r1[0]+r1[1], r2[2]+r2[3], r3[2]+r3[3])
__m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100);
// tmp1 = (r0[2]+r0[3], r1[2]+r1[3], r2[0]+r2[1], r3[0]+r3[1])
tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21);
return _mm256_add_pd(tmp1, tmp2);
0730 }
0731
0732
0733 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0734 XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
0735 {
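// Subtracting the mask (-1 in the selected lanes) increments those lanes.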
0736 return self - batch<T, A>(mask.data);
0737 }
0738
0739
0740 template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0741 XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
0742 {
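// The intrinsic path is disabled for MSVC 2015 and older (presumably missing
// these intrinsics there); those compilers fall through to the generic
// implementation below.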
0743 #if !defined(_MSC_VER) || _MSC_VER > 1900
0744 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0745 {
0746 return _mm256_insert_epi8(self, val, I);
0747 }
0748 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0749 {
0750 return _mm256_insert_epi16(self, val, I);
0751 }
0752 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0753 {
0754 return _mm256_insert_epi32(self, val, I);
0755 }
0756 else
0757 {
0758 return insert(self, val, pos, generic {});
0759 }
0760 #endif
0761 return insert(self, val, pos, generic {});
0762 }
0763
0764
0765 template <class A>
0766 XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx>) noexcept
0767 {
0768 return _mm256_cmp_ps(self, self, _CMP_UNORD_Q);
0769 }
0770 template <class A>
0771 XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx>) noexcept
0772 {
0773 return _mm256_cmp_pd(self, self, _CMP_UNORD_Q);
0774 }
0775
0776
0777 template <class A>
0778 XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0779 {
0780 return _mm256_cmp_ps(self, other, _CMP_LE_OQ);
0781 }
0782 template <class A>
0783 XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0784 {
0785 return _mm256_cmp_pd(self, other, _CMP_LE_OQ);
0786 }
0787
0788
0789 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0790 XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
0791 {
0792 return _mm256_load_si256((__m256i const*)mem);
0793 }
0794 template <class A>
0795 XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
0796 {
0797 return _mm256_load_ps(mem);
0798 }
0799 template <class A>
0800 XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
0801 {
0802 return _mm256_load_pd(mem);
0803 }
0804
0805 namespace detail
0806 {
0807
0808 template <class A>
0809 XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx>) noexcept
0810 {
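// hi holds the first four (re, im) pairs and lo the last four; each 128-bit
// lane is deinterleaved with shuffles into separate real and imaginary batches.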
0811 using batch_type = batch<float, A>;
0812 __m128 tmp0 = _mm256_extractf128_ps(hi, 0);
0813 __m128 tmp1 = _mm256_extractf128_ps(hi, 1);
0814 __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0));
0815 __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
0816 batch_type real = _mm256_castps128_ps256(tmp_real);
0817 batch_type imag = _mm256_castps128_ps256(tmp_imag);
0818
0819 tmp0 = _mm256_extractf128_ps(lo, 0);
0820 tmp1 = _mm256_extractf128_ps(lo, 1);
0821 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0));
0822 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
0823 real = _mm256_insertf128_ps(real, tmp_real, 1);
0824 imag = _mm256_insertf128_ps(imag, tmp_imag, 1);
0825 return { real, imag };
0826 }
0827 template <class A>
0828 XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx>) noexcept
0829 {
0830 using batch_type = batch<double, A>;
0831 __m128d tmp0 = _mm256_extractf128_pd(hi, 0);
0832 __m128d tmp1 = _mm256_extractf128_pd(hi, 1);
0833 batch_type real = _mm256_castpd128_pd256(_mm_unpacklo_pd(tmp0, tmp1));
0834 batch_type imag = _mm256_castpd128_pd256(_mm_unpackhi_pd(tmp0, tmp1));
0835
0836 tmp0 = _mm256_extractf128_pd(lo, 0);
0837 tmp1 = _mm256_extractf128_pd(lo, 1);
0838 __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1);
0839 __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1);
0840 real = _mm256_blend_pd(real, re_tmp1, 12);
0841 imag = _mm256_blend_pd(imag, im_tmp1, 12);
0842 return { real, imag };
0843 }
0844 }
0845
0846
0847 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0848 XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
0849 {
0850 return _mm256_loadu_si256((__m256i const*)mem);
0851 }
0852 template <class A>
0853 XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
0854 {
0855 return _mm256_loadu_ps(mem);
0856 }
0857 template <class A>
0858 XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
0859 {
0860 return _mm256_loadu_pd(mem);
0861 }
0862
0863
0864 template <class A>
0865 XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0866 {
0867 return _mm256_cmp_ps(self, other, _CMP_LT_OQ);
0868 }
0869 template <class A>
0870 XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0871 {
0872 return _mm256_cmp_pd(self, other, _CMP_LT_OQ);
0873 }
0874
0875 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0876 XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0877 {
0878 return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0879 { return lt(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0880 self, other);
0881 }
0882
0883
0884 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0885 XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
0886 {
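// 8- and 16-bit element masks are assembled from the two SSE halves; 32- and
// 64-bit element masks reuse the float/double movemask instructions.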
0887 XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
0888 {
0889 __m128i self_low, self_high;
0890 detail::split_avx(self, self_low, self_high);
0891 return mask(batch_bool<T, sse4_2>(self_low), sse4_2 {}) | (mask(batch_bool<T, sse4_2>(self_high), sse4_2 {}) << (128 / (8 * sizeof(T))));
0892 }
0893 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0894 {
0895 return _mm256_movemask_ps(_mm256_castsi256_ps(self));
0896 }
0897 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0898 {
0899 return _mm256_movemask_pd(_mm256_castsi256_pd(self));
0900 }
0901 else
0902 {
0903 assert(false && "unsupported arch/op combination");
0904 return {};
0905 }
0906 }
0907 template <class A>
0908 XSIMD_INLINE uint64_t mask(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
0909 {
0910 return _mm256_movemask_ps(self);
0911 }
0912
0913 template <class A>
0914 XSIMD_INLINE uint64_t mask(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
0915 {
0916 return _mm256_movemask_pd(self);
0917 }
0918
0919
0920 template <class A>
0921 XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0922 {
0923 return _mm256_max_ps(self, other);
0924 }
0925 template <class A>
0926 XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0927 {
0928 return _mm256_max_pd(self, other);
0929 }
0930 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0931 XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0932 {
0933 return select(self > other, self, other);
0934 }
0935
0936
0937 template <class A>
0938 XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0939 {
0940 return _mm256_min_ps(self, other);
0941 }
0942 template <class A>
0943 XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0944 {
0945 return _mm256_min_pd(self, other);
0946 }
0947 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0948 XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0949 {
0950 return select(self <= other, self, other);
0951 }
0952
0953
0954 template <class A>
0955 XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0956 {
0957 return _mm256_mul_ps(self, other);
0958 }
0959 template <class A>
0960 XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0961 {
0962 return _mm256_mul_pd(self, other);
0963 }
0964
0965
0966 template <class A>
0967 XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx>) noexcept
0968 {
0969 return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
0970 }
0971 template <class A>
0972 XSIMD_INLINE batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx>) noexcept
0973 {
0974 return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
0975 }
0976
0977
0978 template <class A>
0979 XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
0980 requires_arch<avx>) noexcept
0981 {
0982 return _mm256_cvtps_epi32(self);
0983 }
0984
0985
0986 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0987 XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<avx>) noexcept
0988 {
0989 return 0 - self;
0990 }
0991 template <class A>
XSIMD_INLINE batch<float, A> neg(batch<float, A> const& self, requires_arch<avx>) noexcept
0993 {
0994 return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
0995 }
0996 template <class A>
0997 XSIMD_INLINE batch<double, A> neg(batch<double, A> const& self, requires_arch<avx>) noexcept
0998 {
0999 return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000)));
1000 }
1001
1002
1003 template <class A>
1004 XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
1005 {
1006 return _mm256_cmp_ps(self, other, _CMP_NEQ_UQ);
1007 }
1008 template <class A>
1009 XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
1010 {
1011 return _mm256_cmp_pd(self, other, _CMP_NEQ_UQ);
1012 }
1013 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1014 XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
1015 {
1016 return ~(self == other);
1017 }
1018
1019 template <class A>
1020 XSIMD_INLINE batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
1021 {
1022 return _mm256_xor_ps(self, other);
1023 }
1024 template <class A>
1025 XSIMD_INLINE batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
1026 {
1027 return _mm256_xor_pd(self, other);
1028 }
1029 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1030 XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
1031 {
1032 return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(self.data), _mm256_castsi256_ps(other.data)));
1033 }
1034
1035
1036 template <class A>
1037 XSIMD_INLINE batch<float, A> reciprocal(batch<float, A> const& self,
1038 kernel::requires_arch<avx>) noexcept
1039 {
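// Fast approximate reciprocal (relative error at most 1.5 * 2^-12).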
1040 return _mm256_rcp_ps(self);
1041 }
1042
1043
1044 template <class A>
1045 XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx>) noexcept
1046 {
// Swap the two 128-bit lanes and add them, then collapse the remaining four
// partial sums with two horizontal adds.
// tmp = (x4, x5, x6, x7, x0, x1, x2, x3)
__m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1);
// tmp = (x0+x4, x1+x5, x2+x6, x3+x7, ...)
tmp = _mm256_add_ps(rhs, tmp);
// tmp = (x0+x4 + x1+x5, x2+x6 + x3+x7, ...)
tmp = _mm256_hadd_ps(tmp, tmp);
// tmp = (x0+x1+x2+x3+x4+x5+x6+x7, ...)
tmp = _mm256_hadd_ps(tmp, tmp);
return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
1061 }
1062 template <class A>
1063 XSIMD_INLINE double reduce_add(batch<double, A> const& rhs, requires_arch<avx>) noexcept
1064 {
// Swap the two 128-bit lanes and add them, then a single horizontal add
// finishes the sum.
__m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
tmp = _mm256_add_pd(rhs, tmp);
tmp = _mm256_hadd_pd(tmp, tmp);
return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
1073 }
1074 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1075 XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
1076 {
1077 __m128i low, high;
1078 detail::split_avx(self, low, high);
1079 batch<T, sse4_2> blow(low), bhigh(high);
1080 return reduce_add(blow) + reduce_add(bhigh);
1081 }
1082
1083
1084 template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
1085 XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<avx>) noexcept
1086 {
1087 constexpr auto mask = detail::shuffle(1, 0);
1088 batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
1089 batch<T, A> acc = max(self, step);
1090 __m128i low = _mm256_castsi256_si128(acc);
1091 return reduce_max(batch<T, sse4_2>(low));
1092 }
1093
1094
1095 template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
1096 XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<avx>) noexcept
1097 {
1098 constexpr auto mask = detail::shuffle(1, 0);
1099 batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
1100 batch<T, A> acc = min(self, step);
1101 __m128i low = _mm256_castsi256_si128(acc);
1102 return reduce_min(batch<T, sse4_2>(low));
1103 }
1104
1105
1106 template <class A>
1107 XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
1108 {
1109 return _mm256_rsqrt_ps(val);
1110 }
1111 template <class A>
1112 XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
1113 {
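// There is no packed double-precision rsqrt instruction in AVX: approximate
// through the single-precision one.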
1114 return _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(val)));
1115 }
1116
1117
1118 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1119 XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
1120 {
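// Saturated addition: for signed T, clamp self against the distance to the
// type limits and pick the clamp matching the sign of other; for unsigned T,
// add at most the headroom left below the maximum value.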
1121 if (std::is_signed<T>::value)
1122 {
1123 auto mask = (other >> (8 * sizeof(T) - 1));
1124 auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
1125 auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
1126 return other + select(batch_bool<T, A>(mask.data), self_neg_branch, self_pos_branch);
1127 }
1128 else
1129 {
1130 const auto diffmax = std::numeric_limits<T>::max() - self;
1131 const auto mindiff = min(diffmax, other);
1132 return self + mindiff;
1133 }
1134 }
1135
1136
1137 template <class A>
1138 XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
1139 {
1140 return _mm256_blendv_ps(false_br, true_br, cond);
1141 }
1142 template <class A>
1143 XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
1144 {
1145 return _mm256_blendv_pd(false_br, true_br, cond);
1146 }
1147 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1148 XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
1149 {
1150 __m128i cond_low, cond_hi;
1151 detail::split_avx(cond, cond_low, cond_hi);
1152
1153 __m128i true_low, true_hi;
1154 detail::split_avx(true_br, true_low, true_hi);
1155
1156 __m128i false_low, false_hi;
1157 detail::split_avx(false_br, false_low, false_hi);
1158
1159 __m128i res_low = select(batch_bool<T, sse4_2>(cond_low), batch<T, sse4_2>(true_low), batch<T, sse4_2>(false_low), sse4_2 {});
1160 __m128i res_hi = select(batch_bool<T, sse4_2>(cond_hi), batch<T, sse4_2>(true_hi), batch<T, sse4_2>(false_hi), sse4_2 {});
1161 return detail::merge_sse(res_low, res_hi);
1162 }
1163 template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1164 XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
1165 {
1166 return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
1167 }
1168
1169 template <class A, bool... Values>
1170 XSIMD_INLINE batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
1171 {
1172 constexpr auto mask = batch_bool_constant<float, A, Values...>::mask();
1173 return _mm256_blend_ps(false_br, true_br, mask);
1174 }
1175
1176 template <class A, bool... Values>
1177 XSIMD_INLINE batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
1178 {
1179 constexpr auto mask = batch_bool_constant<double, A, Values...>::mask();
1180 return _mm256_blend_pd(false_br, true_br, mask);
1181 }
1182
1183
1184 template <class A, class... Values>
1185 XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<avx>, Values... values) noexcept
1186 {
1187 static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
1188 return _mm256_setr_ps(values...);
1189 }
1190
1191 template <class A, class... Values>
1192 XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<avx>, Values... values) noexcept
1193 {
1194 static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
1195 return _mm256_setr_pd(values...);
1196 }
1197 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1198 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3) noexcept
1199 {
1200 return _mm256_set_epi64x(v3, v2, v1, v0);
1201 }
1202 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1203 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
1204 {
1205 return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
1206 }
1207 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1208 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
1209 {
1210 return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
1211 }
1212 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1213 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1214 T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
1215 {
1216 return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
1217 }
1218
1219 template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1220 XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx>, Values... values) noexcept
1221 {
1222 return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
1223 }
1224
1225 template <class A, class... Values>
1226 XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<avx>, Values... values) noexcept
1227 {
1228 static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
1229 return _mm256_castsi256_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data);
1230 }
1231
1232 template <class A, class... Values>
1233 XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<avx>, Values... values) noexcept
1234 {
1235 static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
1236 return _mm256_castsi256_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data);
1237 }
1238
1239
1240 template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
1241 XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
1242 {
1243 constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
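// _mm256_shuffle_ps applies the same 4-index pattern to both 128-bit lanes,
// taking its first two selections from one operand and the last two from the
// other; the two tests below detect which operand order matches the request.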
1244
1245 if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I0 < 4 && I1 < 4 && I2 >= 8 && I2 < 12 && I3 >= 8 && I3 < 12)
1246 return _mm256_shuffle_ps(x, y, smask);
1247
1248
1249 if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I2 < 4 && I3 < 4 && I0 >= 8 && I0 < 12 && I1 >= 8 && I1 < 12)
1250 return _mm256_shuffle_ps(y, x, smask);
1251
1252 return shuffle(x, y, mask, generic {});
1253 }
1254
1255 template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
1256 XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
1257 {
1258 constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3);
1259
1260 if (I0 < 2 && I1 >= 4 && I1 < 6 && I2 >= 2 && I2 < 4 && I3 >= 6)
1261 return _mm256_shuffle_pd(x, y, smask);
1262
1263
1264 if (I1 < 2 && I0 >= 4 && I0 < 6 && I3 >= 2 && I3 < 4 && I2 >= 6)
1265 return _mm256_shuffle_pd(y, x, smask);
1266
1267 return shuffle(x, y, mask, generic {});
1268 }
1269
1270
1271 template <size_t N, class A, class T>
1272 XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx>) noexcept
1273 {
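// Byte-wise left shift of the whole 256-bit register. AVX has no 256-bit
// byte shift, so the result is assembled from 128-bit shifts of the two
// halves plus the bytes that cross the lane boundary.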
1274 constexpr unsigned BitCount = N * 8;
1275 if (BitCount == 0)
1276 {
1277 return x;
1278 }
1279 if (BitCount >= 256)
1280 {
1281 return batch<T, A>(T(0));
1282 }
1283 if (BitCount > 128)
1284 {
1285 constexpr unsigned M = (BitCount - 128) / 8;
1286 __m128i low = _mm256_castsi256_si128(x);
1287 auto y = _mm_slli_si128(low, M);
1288 __m256i zero = _mm256_setzero_si256();
1289 return _mm256_insertf128_si256(zero, y, 1);
1290 }
1291 if (BitCount == 128)
1292 {
1293 __m128i low = _mm256_castsi256_si128(x);
1294 __m256i zero = _mm256_setzero_si256();
1295 return _mm256_insertf128_si256(zero, low, 1);
1296 }
1297
1298 constexpr unsigned M = BitCount / 8;
1299
1300 __m128i low = _mm256_castsi256_si128(x);
1301 auto ylow = _mm_slli_si128(low, M);
1302 auto zlow = _mm_srli_si128(low, 16 - M);
1303
1304 __m128i high = _mm256_extractf128_si256(x, 1);
1305 auto yhigh = _mm_slli_si128(high, M);
1306
1307 __m256i res = _mm256_castsi128_si256(ylow);
1308 return _mm256_insertf128_si256(res, _mm_or_si128(yhigh, zlow), 1);
1309 }
1310
1311
1312 template <size_t N, class A, class T>
1313 XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx>) noexcept
1314 {
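// Byte-wise right shift of the whole 256-bit register, mirroring slide_left:
// shift both 128-bit halves and carry the bytes that cross the lane boundary.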
1315 constexpr unsigned BitCount = N * 8;
1316 if (BitCount == 0)
1317 {
1318 return x;
1319 }
1320 if (BitCount >= 256)
1321 {
1322 return batch<T, A>(T(0));
1323 }
1324 if (BitCount > 128)
1325 {
1326 constexpr unsigned M = (BitCount - 128) / 8;
1327 __m128i high = _mm256_extractf128_si256(x, 1);
1328 __m128i y = _mm_srli_si128(high, M);
1329 __m256i zero = _mm256_setzero_si256();
1330 return _mm256_insertf128_si256(zero, y, 0);
1331 }
1332 if (BitCount == 128)
1333 {
1334 __m128i high = _mm256_extractf128_si256(x, 1);
1335 return _mm256_castsi128_si256(high);
1336 }
1337
1338 constexpr unsigned M = BitCount / 8;
1339
1340 __m128i low = _mm256_castsi256_si128(x);
1341 auto ylow = _mm_srli_si128(low, M);
1342
1343 __m128i high = _mm256_extractf128_si256(x, 1);
1344 auto yhigh = _mm_srli_si128(high, M);
1345 auto zhigh = _mm_slli_si128(high, 16 - M);
1346
1347 __m256i res = _mm256_castsi128_si256(_mm_or_si128(ylow, zhigh));
1348 return _mm256_insertf128_si256(res, yhigh, 1);
1349 }
1350
1351
1352 template <class A>
1353 XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
1354 {
1355 return _mm256_sqrt_ps(val);
1356 }
1357 template <class A>
1358 XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
1359 {
1360 return _mm256_sqrt_pd(val);
1361 }
1362
1363
1364 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1365 XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
1366 {
1367 if (std::is_signed<T>::value)
1368 {
1369 return sadd(self, -other);
1370 }
1371 else
1372 {
1373 const auto diff = min(self, other);
1374 return self - diff;
1375 }
1376 }
1377
1378
1379 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1380 XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
1381 {
1382 return _mm256_store_si256((__m256i*)mem, self);
1383 }
1384 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1385 XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
1386 {
1387 return _mm256_store_si256((__m256i*)mem, self);
1388 }
1389 template <class A>
1390 XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
1391 {
1392 return _mm256_store_ps(mem, self);
1393 }
1394 template <class A>
1395 XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
1396 {
1397 return _mm256_store_pd(mem, self);
1398 }
1399
1400
1401 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1402 XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
1403 {
1404 return _mm256_storeu_si256((__m256i*)mem, self);
1405 }
1406 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1407 XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
1408 {
1409 return _mm256_storeu_si256((__m256i*)mem, self);
1410 }
1411 template <class A>
1412 XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
1413 {
1414 return _mm256_storeu_ps(mem, self);
1415 }
1416 template <class A>
1417 XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
1418 {
1419 return _mm256_storeu_pd(mem, self);
1420 }
1421
1422
1423 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1424 XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
1425 {
1426 return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
1427 { return sub(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
1428 self, other);
1429 }
1430 template <class A>
1431 XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
1432 {
1433 return _mm256_sub_ps(self, other);
1434 }
1435 template <class A>
1436 XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
1437 {
1438 return _mm256_sub_pd(self, other);
1439 }
1440
1441
1442 template <class A>
1443 XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
1444 {
// Broadcast the high 128-bit lane of self to both lanes (hi_hi) and the low
// lane to both lanes (low_low).
__m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
__m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);

__m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
__m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);

// Reduce each index to a position within a 128-bit lane.
batch<uint32_t, A> half_mask = mask % 4;

// Permute within each lane of both copies.
__m256 r0 = _mm256_permutevar_ps(low_low, half_mask);
__m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask);

// Per element, pick the copy that holds the requested source lane.
batch_bool<uint32_t, A> blend_mask = mask >= 4;

return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
1464 }
1465
1466 template <class A>
1467 XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
1468 {
1469
1470 __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
1471 __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
1472
1473 __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
1474 __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
1475
1476
1477 batch<uint64_t, A> half_mask = -(mask & 1);
1478
1479
1480 __m256d r0 = _mm256_permutevar_pd(low_low, half_mask);
1481 __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask);
1482
1483
1484 batch_bool<uint64_t, A> blend_mask = mask >= 2;
1485
1486
1487 return _mm256_blendv_pd(r0, r1, batch_bool_cast<double>(blend_mask));
1488 }
1489
1490 template <class A, typename T, detail::enable_sized_integral_t<T, 4> = 0>
1491 XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint32_t, A> const& mask, requires_arch<avx>) noexcept
1492 {
1493 return bitwise_cast<T>(
1494 swizzle(bitwise_cast<float>(self), mask));
1495 }
1496
1497 template <class A, typename T, detail::enable_sized_integral_t<T, 8> = 0>
1498 XSIMD_INLINE batch<T, A>
1499 swizzle(batch<T, A> const& self, batch<uint64_t, A> const& mask, requires_arch<avx>) noexcept
1500 {
1501 return bitwise_cast<T>(
1502 swizzle(bitwise_cast<double>(self), mask));
1503 }

        // swizzle (constant mask)
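        // Same lane-duplication trick as above, but the mask is known at compile
        // time, so the final selection can use the immediate-operand _mm256_blend_*
        // instead of blendv. Semantically, result[i] = self[Vi]; for example a
        // constant mask of 7, 6, 5, 4, 3, 2, 1, 0 reverses a float batch.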
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
        {
            // duplicate the low and high lanes of the input
            __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
            __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);

            __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
            __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);

            // reduce the mask to lane-local indices
            batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;

            // permute within each lane
            __m256 r0 = _mm256_permutevar_ps(low_low, half_mask.as_batch());
            __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask.as_batch());

            // mask selecting which lane each element comes from
            batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;

            // blend the two permutations with an immediate mask
            constexpr auto mask = blend_mask.mask();
            return _mm256_blend_ps(r0, r1, mask);
        }

        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx>) noexcept
        {
            // duplicate the low and high lanes of the input
            __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
            __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);

            __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
            __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);

            // _mm256_permutevar_pd selects on bit 1, so expand (Vi % 2) to all-ones / all-zeros
            batch_constant<uint64_t, A, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;

            // permute within each lane
            __m256d r0 = _mm256_permutevar_pd(low_low, half_mask.as_batch());
            __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask.as_batch());

            // mask selecting which lane each element comes from
            batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;

            // blend the two permutations with an immediate mask
            constexpr auto mask = blend_mask.mask();
            return _mm256_blend_pd(r0, r1, mask);
        }
        template <class A, typename T,
                  uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3,
                  uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7,
                  detail::enable_sized_integral_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self,
                                         batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> const& mask,
                                         requires_arch<avx>) noexcept
        {
            return bitwise_cast<T>(
                swizzle(bitwise_cast<float>(self), mask));
        }

        template <class A, typename T,
                  uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3,
                  detail::enable_sized_integral_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A>
        swizzle(batch<T, A> const& self,
                batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask,
                requires_arch<avx>) noexcept
        {
            return bitwise_cast<T>(
                swizzle(bitwise_cast<double>(self), mask));
        }

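        // transpose
        // In-register transpose: rows are interleaved pairwise with unpacklo/unpackhi,
        // recombined with shuffles (float case only), and the 128-bit lanes are then
        // exchanged with _mm256_permute2f128_* to move each column into its row.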
        template <class A>
        XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<avx>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
            (void)matrix_end;

            // 8x8 float transpose: interleave, shuffle, then swap 128-bit lanes
            auto r0 = matrix_begin[0], r1 = matrix_begin[1],
                 r2 = matrix_begin[2], r3 = matrix_begin[3],
                 r4 = matrix_begin[4], r5 = matrix_begin[5],
                 r6 = matrix_begin[6], r7 = matrix_begin[7];

            auto t0 = _mm256_unpacklo_ps(r0, r1);
            auto t1 = _mm256_unpackhi_ps(r0, r1);
            auto t2 = _mm256_unpacklo_ps(r2, r3);
            auto t3 = _mm256_unpackhi_ps(r2, r3);
            auto t4 = _mm256_unpacklo_ps(r4, r5);
            auto t5 = _mm256_unpackhi_ps(r4, r5);
            auto t6 = _mm256_unpacklo_ps(r6, r7);
            auto t7 = _mm256_unpackhi_ps(r6, r7);

            r0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0));
            r1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2));
            r2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));
            r3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));
            r4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0));
            r5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2));
            r6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));
            r7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));

            matrix_begin[0] = _mm256_permute2f128_ps(r0, r4, 0x20);
            matrix_begin[1] = _mm256_permute2f128_ps(r1, r5, 0x20);
            matrix_begin[2] = _mm256_permute2f128_ps(r2, r6, 0x20);
            matrix_begin[3] = _mm256_permute2f128_ps(r3, r7, 0x20);
            matrix_begin[4] = _mm256_permute2f128_ps(r0, r4, 0x31);
            matrix_begin[5] = _mm256_permute2f128_ps(r1, r5, 0x31);
            matrix_begin[6] = _mm256_permute2f128_ps(r2, r6, 0x31);
            matrix_begin[7] = _mm256_permute2f128_ps(r3, r7, 0x31);
        }

        template <class A>
        XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<avx>) noexcept
        {
            return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
        }
        template <class A>
        XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<avx>) noexcept
        {
            return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
        }

        template <class A>
        XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<avx>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            // 4x4 double transpose: interleave pairs, then swap 128-bit lanes
            auto r0 = matrix_begin[0], r1 = matrix_begin[1],
                 r2 = matrix_begin[2], r3 = matrix_begin[3];

            auto t0 = _mm256_unpacklo_pd(r0, r1);
            auto t1 = _mm256_unpackhi_pd(r0, r1);
            auto t2 = _mm256_unpacklo_pd(r2, r3);
            auto t3 = _mm256_unpackhi_pd(r2, r3);

            matrix_begin[0] = _mm256_permute2f128_pd(t0, t2, 0x20);
            matrix_begin[1] = _mm256_permute2f128_pd(t1, t3, 0x20);
            matrix_begin[2] = _mm256_permute2f128_pd(t0, t2, 0x31);
            matrix_begin[3] = _mm256_permute2f128_pd(t1, t3, 0x31);
        }

        template <class A>
        XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<avx>) noexcept
        {
            return transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
        }
        template <class A>
        XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<avx>) noexcept
        {
            return transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
        }

        // trunc
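        // _MM_FROUND_TO_ZERO rounds toward zero, i.e. truncates the fractional part.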
        template <class A>
        XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept
        {
            return _mm256_round_ps(self, _MM_FROUND_TO_ZERO);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> trunc(batch<double, A> const& self, requires_arch<avx>) noexcept
        {
            return _mm256_round_pd(self, _MM_FROUND_TO_ZERO);
        }

        // zip_hi
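        // zip_hi interleaves the upper halves of the two inputs element by element.
        // For 8- and 16-bit integers this is done on the extracted high 128-bit
        // halves with SSE unpack instructions; for 32- and 64-bit elements a
        // 256-bit unpack followed by a lane permute is enough.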
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
            {
                // extract the upper 128-bit half of each input
                __m128i self_hi = _mm256_extractf128_si256(self, 1);
                __m128i other_hi = _mm256_extractf128_si256(other, 1);

                // interleave them with the SSE unpack instructions
                __m128i res_lo, res_hi;
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    res_lo = _mm_unpacklo_epi8(self_hi, other_hi);
                    res_hi = _mm_unpackhi_epi8(self_hi, other_hi);
                }
                else
                {
                    res_lo = _mm_unpacklo_epi16(self_hi, other_hi);
                    res_hi = _mm_unpackhi_epi16(self_hi, other_hi);
                }

                // merge the two 128-bit results into a 256-bit register
                return _mm256_castps_si256(
                    _mm256_insertf128_ps(
                        _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)),
                        _mm_castsi128_ps(res_hi),
                        1));
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
                auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
                return _mm256_castps_si256(_mm256_permute2f128_ps(lo, hi, 0x31));
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
                auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
                return _mm256_castpd_si256(_mm256_permute2f128_pd(lo, hi, 0x31));
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
        template <class A>
        XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
        {
            auto lo = _mm256_unpacklo_ps(self, other);
            auto hi = _mm256_unpackhi_ps(self, other);
            return _mm256_permute2f128_ps(lo, hi, 0x31);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
        {
            auto lo = _mm256_unpacklo_pd(self, other);
            auto hi = _mm256_unpackhi_pd(self, other);
            return _mm256_permute2f128_pd(lo, hi, 0x31);
        }

        // zip_lo
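        // zip_lo is the mirror of zip_hi: it interleaves the lower halves of the
        // two inputs, using the low 128-bit halves for 8- and 16-bit elements and
        // an insertf128 of the unpacked lanes for 32- and 64-bit elements.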
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
            {
                // extract the lower 128-bit half of each input
                __m128i self_lo = _mm256_extractf128_si256(self, 0);
                __m128i other_lo = _mm256_extractf128_si256(other, 0);

                // interleave them with the SSE unpack instructions
                __m128i res_lo, res_hi;
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    res_lo = _mm_unpacklo_epi8(self_lo, other_lo);
                    res_hi = _mm_unpackhi_epi8(self_lo, other_lo);
                }
                else
                {
                    res_lo = _mm_unpacklo_epi16(self_lo, other_lo);
                    res_hi = _mm_unpackhi_epi16(self_lo, other_lo);
                }

                // merge the two 128-bit results into a 256-bit register
                return _mm256_castps_si256(
                    _mm256_insertf128_ps(
                        _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)),
                        _mm_castsi128_ps(res_hi),
                        1));
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
                auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
                return _mm256_castps_si256(_mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1));
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
                auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
                return _mm256_castpd_si256(_mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1));
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

        template <class A>
        XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
        {
            auto lo = _mm256_unpacklo_ps(self, other);
            auto hi = _mm256_unpackhi_ps(self, other);
            return _mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
        {
            auto lo = _mm256_unpacklo_pd(self, other);
            auto hi = _mm256_unpackhi_pd(self, other);
            return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1);
        }
    }
}

#endif