EIC code displayed by LXR

0001 /***************************************************************************
0002  * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
0003  * Martin Renou                                                             *
0004  * Copyright (c) QuantStack                                                 *
0005  * Copyright (c) Serge Guelton                                              *
0006  *                                                                          *
0007  * Distributed under the terms of the BSD 3-Clause License.                 *
0008  *                                                                          *
0009  * The full license is in the file LICENSE, distributed with this software. *
0010  ****************************************************************************/
0011 
0012 #ifndef XSIMD_AVX_HPP
0013 #define XSIMD_AVX_HPP
0014 
0015 #include <complex>
0016 #include <limits>
0017 #include <type_traits>
0018 
0019 #include "../types/xsimd_avx_register.hpp"
0020 
0021 namespace xsimd
0022 {
0023 
0024     namespace kernel
0025     {
0026         using namespace types;
0027 
0028         // fwd
0029         template <class A, class T, size_t I>
0030         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
0031 
0032         namespace detail
0033         {
0034             XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
0035             {
0036                 low = _mm256_castsi256_si128(val);
0037                 high = _mm256_extractf128_si256(val, 1);
0038             }
0039             XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept
0040             {
0041                 low = _mm256_castps256_ps128(val);
0042                 high = _mm256_extractf128_ps(val, 1);
0043             }
0044             XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
0045             {
0046                 low = _mm256_castpd256_pd128(val);
0047                 high = _mm256_extractf128_pd(val, 1);
0048             }
0049             XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept
0050             {
0051                 return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1);
0052             }
0053             XSIMD_INLINE __m256 merge_sse(__m128 low, __m128 high) noexcept
0054             {
0055                 return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1);
0056             }
0057             XSIMD_INLINE __m256d merge_sse(__m128d low, __m128d high) noexcept
0058             {
0059                 return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1);
0060             }
0061             template <class F>
0062             XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept
0063             {
0064                 __m128i self_low, self_high;
0065                 split_avx(self, self_low, self_high);
0066                 __m128i res_low = f(self_low);
0067                 __m128i res_high = f(self_high);
0068                 return merge_sse(res_low, res_high);
0069             }
0070             template <class F>
0071             XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept
0072             {
0073                 __m128i self_low, self_high, other_low, other_high;
0074                 split_avx(self, self_low, self_high);
0075                 split_avx(other, other_low, other_high);
0076                 __m128i res_low = f(self_low, other_low);
0077                 __m128i res_high = f(self_high, other_high);
0078                 return merge_sse(res_low, res_high);
0079             }
0080             template <class F>
0081             XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept
0082             {
0083                 __m128i self_low, self_high;
0084                 split_avx(self, self_low, self_high);
0085                 __m128i res_low = f(self_low, other);
0086                 __m128i res_high = f(self_high, other);
0087                 return merge_sse(res_low, res_high);
0088             }
0089         }
0090 
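        // Example (illustrative sketch): the helpers above emulate 256-bit integer
        // operations by splitting the register into two SSE4.2 halves and merging
        // the results back. A hypothetical unary integer kernel (my_op is not a
        // real xsimd function) would follow the same pattern as bitwise_not below:
        //
        //     template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        //     XSIMD_INLINE batch<T, A> my_op(batch<T, A> const& self, requires_arch<avx>) noexcept
        //     {
        //         return detail::fwd_to_sse([](__m128i s) noexcept
        //                                   { return my_op(batch<T, sse4_2>(s), sse4_2 {}); },
        //                                   self);
        //     }
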
0091         // abs
0092         template <class A>
0093         XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<avx>) noexcept
0094         {
0095             __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31
0096             return _mm256_andnot_ps(sign_mask, self);
0097         }
0098         template <class A>
0099         XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<avx>) noexcept
0100         {
0101             __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
0102             return _mm256_andnot_pd(sign_mask, self);
0103         }
0104 
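        // Example (illustrative, assuming the usual public xsimd batch API):
        // abs clears the sign bit lane-wise.
        //
        //     xsimd::batch<float, xsimd::avx> x(-1.5f);
        //     auto y = xsimd::abs(x); // every lane becomes 1.5f
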
0105         // add
0106         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0107         XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0108         {
0109             return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0110                                       { return add(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0111                                       self, other);
0112         }
0113         template <class A>
0114         XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0115         {
0116             return _mm256_add_ps(self, other);
0117         }
0118         template <class A>
0119         XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0120         {
0121             return _mm256_add_pd(self, other);
0122         }
0123 
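        // Example (illustrative): operator+ on batches dispatches to the add
        // kernels above; integer types go through detail::fwd_to_sse, while float
        // and double map directly to _mm256_add_ps / _mm256_add_pd.
        //
        //     xsimd::batch<int32_t, xsimd::avx> a(1), b(2);
        //     auto c = a + b;                        // eight lanes of 3
        //     xsimd::batch<double, xsimd::avx> u(1.0), v(2.0);
        //     auto w = u + v;                        // four lanes of 3.0
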
0124         // all
0125         template <class A>
0126         XSIMD_INLINE bool all(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
0127         {
0128             return _mm256_testc_ps(self, batch_bool<float, A>(true)) != 0;
0129         }
0130         template <class A>
0131         XSIMD_INLINE bool all(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
0132         {
0133             return _mm256_testc_pd(self, batch_bool<double, A>(true)) != 0;
0134         }
0135         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0136         XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
0137         {
0138             return _mm256_testc_si256(self, batch_bool<T, A>(true)) != 0;
0139         }
0140 
0141         // any
0142         template <class A>
0143         XSIMD_INLINE bool any(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
0144         {
0145             return !_mm256_testz_ps(self, self);
0146         }
0147         template <class A>
0148         XSIMD_INLINE bool any(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
0149         {
0150             return !_mm256_testz_pd(self, self);
0151         }
0152         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0153         XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
0154         {
0155             return !_mm256_testz_si256(self, self);
0156         }
0157 
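        // Example (illustrative): all/any reduce a batch_bool, typically the
        // result of a comparison, to a single flag.
        //
        //     xsimd::batch<float, xsimd::avx> x(1.f), y(2.f);
        //     bool every_lane = xsimd::all(x < y); // true
        //     bool some_lane  = xsimd::any(x > y); // false
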
0158         // batch_bool_cast
0159         template <class A, class T_out, class T_in>
0160         XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx>) noexcept
0161         {
0162             return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
0163         }
0164 
0165         // bitwise_and
0166         template <class A>
0167         XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0168         {
0169             return _mm256_and_ps(self, other);
0170         }
0171         template <class A>
0172         XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0173         {
0174             return _mm256_and_pd(self, other);
0175         }
0176 
0177         template <class A>
0178         XSIMD_INLINE batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
0179         {
0180             return _mm256_and_ps(self, other);
0181         }
0182         template <class A>
0183         XSIMD_INLINE batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
0184         {
0185             return _mm256_and_pd(self, other);
0186         }
0187 
0188         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0189         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0190         {
0191             return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0192                                       { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0193                                       self, other);
0194         }
0195         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0196         XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
0197         {
0198             return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0199                                       { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0200                                       self, other);
0201         }
0202 
0203         // bitwise_andnot
0204         template <class A>
0205         XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0206         {
0207             return _mm256_andnot_ps(other, self);
0208         }
0209         template <class A>
0210         XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0211         {
0212             return _mm256_andnot_pd(other, self);
0213         }
0214 
0215         template <class A>
0216         XSIMD_INLINE batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
0217         {
0218             return _mm256_andnot_ps(other, self);
0219         }
0220         template <class A>
0221         XSIMD_INLINE batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
0222         {
0223             return _mm256_andnot_pd(other, self);
0224         }
0225 
0226         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0227         XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0228         {
0229             return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0230                                       { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0231                                       self, other);
0232         }
0233         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0234         XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
0235         {
0236             return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0237                                       { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0238                                       self, other);
0239         }
0240 
0241         // bitwise_lshift
0242         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0243         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
0244         {
0245             return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
0246                                       { return bitwise_lshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
0247                                       self, other);
0248         }
0249 
0250         // bitwise_not
0251         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0252         XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx>) noexcept
0253         {
0254             return detail::fwd_to_sse([](__m128i s) noexcept
0255                                       { return bitwise_not(batch<T, sse4_2>(s), sse4_2 {}); },
0256                                       self);
0257         }
0258         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0259         XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
0260         {
0261             return detail::fwd_to_sse([](__m128i s) noexcept
0262                                       { return bitwise_not(batch_bool<T, sse4_2>(s), sse4_2 {}); },
0263                                       self);
0264         }
0265 
0266         // bitwise_or
0267         template <class A>
0268         XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0269         {
0270             return _mm256_or_ps(self, other);
0271         }
0272         template <class A>
0273         XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0274         {
0275             return _mm256_or_pd(self, other);
0276         }
0277         template <class A>
0278         XSIMD_INLINE batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
0279         {
0280             return _mm256_or_ps(self, other);
0281         }
0282         template <class A>
0283         XSIMD_INLINE batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
0284         {
0285             return _mm256_or_pd(self, other);
0286         }
0287         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0288         XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0289         {
0290             return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0291                                       { return bitwise_or(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0292                                       self, other);
0293         }
0294         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0295         XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
0296         {
0297             return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0298                                       { return bitwise_or(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o)); },
0299                                       self, other);
0300         }
0301 
0302         // bitwise_rshift
0303         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0304         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
0305         {
0306             return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
0307                                       { return bitwise_rshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
0308                                       self, other);
0309         }
0310 
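        // Example (illustrative): the shift operators on integer batches forward
        // to the kernels above, one SSE half at a time.
        //
        //     xsimd::batch<uint32_t, xsimd::avx> v(1u);
        //     auto l = v << 3; // every lane 8
        //     auto r = l >> 2; // every lane 2
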
0311         // bitwise_xor
0312         template <class A>
0313         XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0314         {
0315             return _mm256_xor_ps(self, other);
0316         }
0317         template <class A>
0318         XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0319         {
0320             return _mm256_xor_pd(self, other);
0321         }
0322         template <class A>
0323         XSIMD_INLINE batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
0324         {
0325             return _mm256_xor_ps(self, other);
0326         }
0327         template <class A>
0328         XSIMD_INLINE batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
0329         {
0330             return _mm256_xor_pd(self, other);
0331         }
0332         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0333         XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0334         {
0335             return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0336                                       { return bitwise_xor(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
0337                                       self, other);
0338         }
0339         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0340         XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
0341         {
0342             return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0343                                       { return bitwise_xor(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o), sse4_2 {}); },
0344                                       self, other);
0345         }
0346 
0347         // bitwise_cast
0348         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0349         XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
0350         {
0351             return _mm256_castsi256_ps(self);
0352         }
0353         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0354         XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
0355         {
0356             return _mm256_castsi256_pd(self);
0357         }
0358         template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
0359         XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx>) noexcept
0360         {
0361             return batch<Tp, A>(self.data);
0362         }
0363         template <class A>
0364         XSIMD_INLINE batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
0365         {
0366             return _mm256_castps_pd(self);
0367         }
0368         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0369         XSIMD_INLINE batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
0370         {
0371             return _mm256_castps_si256(self);
0372         }
0373         template <class A>
0374         XSIMD_INLINE batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
0375         {
0376             return _mm256_castpd_ps(self);
0377         }
0378         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0379         XSIMD_INLINE batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
0380         {
0381             return _mm256_castpd_si256(self);
0382         }
0383 
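        // Example (illustrative, assuming the public bitwise_cast helper):
        // reinterpreting lanes without changing the underlying bits.
        //
        //     xsimd::batch<float, xsimd::avx> f(1.0f);
        //     auto bits = xsimd::bitwise_cast<uint32_t>(f); // each lane 0x3F800000
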
0384         // bitwise_not
0385         template <class A>
0386         XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx>) noexcept
0387         {
0388             return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
0389         }
0390         template <class A>
0391         XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx>) noexcept
0392         {
0393             return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
0394         }
0395         template <class A>
0396         XSIMD_INLINE batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
0397         {
0398             return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
0399         }
0400         template <class A>
0401         XSIMD_INLINE batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
0402         {
0403             return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
0404         }
0405 
0406         // broadcast
0407         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0408         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx>) noexcept
0409         {
0410             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0411             {
0412                 return _mm256_set1_epi8(val);
0413             }
0414             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0415             {
0416                 return _mm256_set1_epi16(val);
0417             }
0418             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0419             {
0420                 return _mm256_set1_epi32(val);
0421             }
0422             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0423             {
0424                 return _mm256_set1_epi64x(val);
0425             }
0426             else
0427             {
0428                 assert(false && "unsupported");
0429                 return {};
0430             }
0431         }
0432         template <class A>
0433         XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<avx>) noexcept
0434         {
0435             return _mm256_set1_ps(val);
0436         }
0437         template <class A>
0438         XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<avx>) noexcept
0439         {
0440             return _mm256_set1_pd(val);
0441         }
0442 
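        // Example (illustrative): the batch constructor taking a single scalar
        // relies on these broadcast kernels.
        //
        //     xsimd::batch<int16_t, xsimd::avx> h(int16_t(7)); // 16 lanes of 7
        //     xsimd::batch<double, xsimd::avx> d(3.14);        // 4 lanes of 3.14
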
0443         // ceil
0444         template <class A>
0445         XSIMD_INLINE batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx>) noexcept
0446         {
0447             return _mm256_ceil_ps(self);
0448         }
0449         template <class A>
0450         XSIMD_INLINE batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx>) noexcept
0451         {
0452             return _mm256_ceil_pd(self);
0453         }
0454 
0455         namespace detail
0456         {
0457             // On clang, _mm256_extractf128_ps is built upon __builtin_shufflevector,
0458             // which requires the index parameter to be a compile-time constant
0459             template <int index, class B>
0460             XSIMD_INLINE B get_half_complex_f(const B& real, const B& imag) noexcept
0461             {
0462                 __m128 tmp0 = _mm256_extractf128_ps(real, index);
0463                 __m128 tmp1 = _mm256_extractf128_ps(imag, index);
0464                 __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1);
0465                 tmp0 = _mm_unpacklo_ps(tmp0, tmp1);
0466                 __m256 res = real;
0467                 res = _mm256_insertf128_ps(res, tmp0, 0);
0468                 res = _mm256_insertf128_ps(res, tmp2, 1);
0469                 return res;
0470             }
0471             template <int index, class B>
0472             XSIMD_INLINE B get_half_complex_d(const B& real, const B& imag) noexcept
0473             {
0474                 __m128d tmp0 = _mm256_extractf128_pd(real, index);
0475                 __m128d tmp1 = _mm256_extractf128_pd(imag, index);
0476                 __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1);
0477                 tmp0 = _mm_unpacklo_pd(tmp0, tmp1);
0478                 __m256d res = real;
0479                 res = _mm256_insertf128_pd(res, tmp0, 0);
0480                 res = _mm256_insertf128_pd(res, tmp2, 1);
0481                 return res;
0482             }
0483 
0484             // complex_low
0485             template <class A>
0486             XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
0487             {
0488                 return get_half_complex_f<0>(self.real(), self.imag());
0489             }
0490             template <class A>
0491             XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
0492             {
0493                 return get_half_complex_d<0>(self.real(), self.imag());
0494             }
0495 
0496             // complex_high
0497             template <class A>
0498             XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
0499             {
0500                 return get_half_complex_f<1>(self.real(), self.imag());
0501             }
0502             template <class A>
0503             XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
0504             {
0505                 return get_half_complex_d<1>(self.real(), self.imag());
0506             }
0507         }
0508 
0509         // fast_cast
0510         namespace detail
0511         {
0512             template <class A>
0513             XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
0514             {
0515                 return _mm256_cvtepi32_ps(self);
0516             }
0517 
0518             template <class A>
0519             XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx>) noexcept
0520             {
0521                 return _mm256_cvttps_epi32(self);
0522             }
0523         }
0524 
0525         // decr_if
0526         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0527         XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
0528         {
0529             return self + batch<T, A>(mask.data);
0530         }
0531 
0532         // div
0533         template <class A>
0534         XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0535         {
0536             return _mm256_div_ps(self, other);
0537         }
0538         template <class A>
0539         XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0540         {
0541             return _mm256_div_pd(self, other);
0542         }
0543 
0544         // eq
0545         template <class A>
0546         XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0547         {
0548             return _mm256_cmp_ps(self, other, _CMP_EQ_OQ);
0549         }
0550         template <class A>
0551         XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0552         {
0553             return _mm256_cmp_pd(self, other, _CMP_EQ_OQ);
0554         }
0555         template <class A>
0556         XSIMD_INLINE batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
0557         {
0558             return ~(self != other);
0559         }
0560         template <class A>
0561         XSIMD_INLINE batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
0562         {
0563             return ~(self != other);
0564         }
0565         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0566         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0567         {
0568             return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0569                                       { return eq(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
0570                                       self, other);
0571         }
0572 
0573         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0574         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
0575         {
0576             return ~(self != other);
0577         }
0578 
0579         // floor
0580         template <class A>
0581         XSIMD_INLINE batch<float, A> floor(batch<float, A> const& self, requires_arch<avx>) noexcept
0582         {
0583             return _mm256_floor_ps(self);
0584         }
0585         template <class A>
0586         XSIMD_INLINE batch<double, A> floor(batch<double, A> const& self, requires_arch<avx>) noexcept
0587         {
0588             return _mm256_floor_pd(self);
0589         }
0590 
0591         // from_mask
0592         template <class A>
0593         XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<avx>) noexcept
0594         {
0595             alignas(A::alignment()) static const uint64_t lut32[] = {
0596                 0x0000000000000000ul,
0597                 0x00000000FFFFFFFFul,
0598                 0xFFFFFFFF00000000ul,
0599                 0xFFFFFFFFFFFFFFFFul,
0600             };
0601             assert(!(mask & ~0xFFul) && "inbound mask");
0602             return _mm256_castsi256_ps(_mm256_setr_epi64x(lut32[mask & 0x3], lut32[(mask >> 2) & 0x3], lut32[(mask >> 4) & 0x3], lut32[mask >> 6]));
0603         }
0604         template <class A>
0605         XSIMD_INLINE batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<avx>) noexcept
0606         {
0607             alignas(A::alignment()) static const uint64_t lut64[][4] = {
0608                 { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
0609                 { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
0610                 { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
0611                 { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
0612                 { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
0613                 { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
0614                 { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
0615                 { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
0616                 { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
0617                 { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
0618                 { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
0619                 { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
0620                 { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
0621                 { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
0622                 { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
0623                 { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
0624             };
0625             assert(!(mask & ~0xFul) && "inbound mask");
0626             return _mm256_castsi256_pd(_mm256_load_si256((const __m256i*)lut64[mask]));
0627         }
0628         template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0629         XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx>) noexcept
0630         {
0631             alignas(A::alignment()) static const uint32_t lut32[] = {
0632                 0x00000000,
0633                 0x000000FF,
0634                 0x0000FF00,
0635                 0x0000FFFF,
0636                 0x00FF0000,
0637                 0x00FF00FF,
0638                 0x00FFFF00,
0639                 0x00FFFFFF,
0640                 0xFF000000,
0641                 0xFF0000FF,
0642                 0xFF00FF00,
0643                 0xFF00FFFF,
0644                 0xFFFF0000,
0645                 0xFFFF00FF,
0646                 0xFFFFFF00,
0647                 0xFFFFFFFF,
0648             };
0649             alignas(A::alignment()) static const uint64_t lut64[] = {
0650                 0x0000000000000000ul,
0651                 0x000000000000FFFFul,
0652                 0x00000000FFFF0000ul,
0653                 0x00000000FFFFFFFFul,
0654                 0x0000FFFF00000000ul,
0655                 0x0000FFFF0000FFFFul,
0656                 0x0000FFFFFFFF0000ul,
0657                 0x0000FFFFFFFFFFFFul,
0658                 0xFFFF000000000000ul,
0659                 0xFFFF00000000FFFFul,
0660                 0xFFFF0000FFFF0000ul,
0661                 0xFFFF0000FFFFFFFFul,
0662                 0xFFFFFFFF00000000ul,
0663                 0xFFFFFFFF0000FFFFul,
0664                 0xFFFFFFFFFFFF0000ul,
0665                 0xFFFFFFFFFFFFFFFFul,
0666             };
0667             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0668             {
0669                 assert(!(mask & ~0xFFFFFFFFul) && "inbound mask");
0670                 return _mm256_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF],
0671                                          lut32[(mask >> 8) & 0xF], lut32[(mask >> 12) & 0xF],
0672                                          lut32[(mask >> 16) & 0xF], lut32[(mask >> 20) & 0xF],
0673                                          lut32[(mask >> 24) & 0xF], lut32[mask >> 28]);
0674             }
0675             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0676             {
0677                 assert(!(mask & ~0xFFFFul) && "inbound mask");
0678                 return _mm256_setr_epi64x(lut64[mask & 0xF], lut64[(mask >> 4) & 0xF], lut64[(mask >> 8) & 0xF], lut64[(mask >> 12) & 0xF]);
0679             }
0680             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0681             {
0682                 return _mm256_castps_si256(from_mask(batch_bool<float, A> {}, mask, avx {}));
0683             }
0684             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0685             {
0686                 return _mm256_castpd_si256(from_mask(batch_bool<double, A> {}, mask, avx {}));
0687             }
0688         }
0689 
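        // Example (illustrative): from_mask expands one bit per lane into an
        // all-ones / all-zeros lane, mirroring the call used above for the
        // sizeof(T) == 4 case.
        //
        //     // bits 0 and 2 set -> lanes 0 and 2 true, lanes 1 and 3 false
        //     auto m = from_mask(batch_bool<double, avx> {}, 0b0101ul, avx {});
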
0690         // haddp
0691         template <class A>
0692         XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx>) noexcept
0693         {
0694             // row = (a,b,c,d,e,f,g,h)
0695             // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7)
0696             __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]);
0697             // tmp1 = (c0+c1, c2+c3, d0+d1, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7)
0698             __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]);
0699             // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3,
0700             // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7)
0701             tmp1 = _mm256_hadd_ps(tmp0, tmp1);
0702             // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7)
0703             tmp0 = _mm256_hadd_ps(row[4], row[5]);
0704             // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7)
0705             __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]);
0706             // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3,
0707             // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7)
0708             tmp2 = _mm256_hadd_ps(tmp0, tmp2);
0709             // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3,
0710             // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7)
0711             tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000);
0712             // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7,
0713             // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3)
0714             tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21);
0715             return _mm256_add_ps(tmp0, tmp1);
0716         }
0717         template <class A>
0718         XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx>) noexcept
0719         {
0720             // row = (a,b,c,d)
0721             // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3)
0722             __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]);
0723             // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3)
0724             __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]);
0725             // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3)
0726             __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100);
0727             // tmp1 = (a2+a3, b2+b3, c2+c3, d2+d3)
0728             tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21);
0729             return _mm256_add_pd(tmp1, tmp2);
0730         }
0731 
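        // Example (illustrative, assuming the public haddp reducer): each lane of
        // the result holds the horizontal sum of the corresponding row batch.
        //
        //     xsimd::batch<double, xsimd::avx> rows[4] = { /* four row batches */ };
        //     auto sums = xsimd::haddp(rows); // lane i == sum of rows[i]
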
0732         // incr_if
0733         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0734         XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
0735         {
0736             return self - batch<T, A>(mask.data);
0737         }
0738 
0739         // insert
0740         template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0741         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
0742         {
0743 #if !defined(_MSC_VER) || _MSC_VER > 1900
0744             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0745             {
0746                 return _mm256_insert_epi8(self, val, I);
0747             }
0748             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0749             {
0750                 return _mm256_insert_epi16(self, val, I);
0751             }
0752             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0753             {
0754                 return _mm256_insert_epi32(self, val, I);
0755             }
0756             else
0757             {
0758                 return insert(self, val, pos, generic {});
0759             }
0760 #endif
0761             return insert(self, val, pos, generic {});
0762         }
0763 
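        // Example (illustrative, assuming the public insert helper): replacing a
        // single lane while keeping the others.
        //
        //     xsimd::batch<int32_t, xsimd::avx> v(0);
        //     auto w = xsimd::insert(v, 42, xsimd::index<3> {}); // lane 3 becomes 42
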
0764         // isnan
0765         template <class A>
0766         XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx>) noexcept
0767         {
0768             return _mm256_cmp_ps(self, self, _CMP_UNORD_Q);
0769         }
0770         template <class A>
0771         XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx>) noexcept
0772         {
0773             return _mm256_cmp_pd(self, self, _CMP_UNORD_Q);
0774         }
0775 
0776         // le
0777         template <class A>
0778         XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0779         {
0780             return _mm256_cmp_ps(self, other, _CMP_LE_OQ);
0781         }
0782         template <class A>
0783         XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0784         {
0785             return _mm256_cmp_pd(self, other, _CMP_LE_OQ);
0786         }
0787 
0788         // load_aligned
0789         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0790         XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
0791         {
0792             return _mm256_load_si256((__m256i const*)mem);
0793         }
0794         template <class A>
0795         XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
0796         {
0797             return _mm256_load_ps(mem);
0798         }
0799         template <class A>
0800         XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
0801         {
0802             return _mm256_load_pd(mem);
0803         }
0804 
0805         namespace detail
0806         {
0807             // load_complex
0808             template <class A>
0809             XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx>) noexcept
0810             {
0811                 using batch_type = batch<float, A>;
0812                 __m128 tmp0 = _mm256_extractf128_ps(hi, 0);
0813                 __m128 tmp1 = _mm256_extractf128_ps(hi, 1);
0814                 __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0));
0815                 __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
0816                 batch_type real = _mm256_castps128_ps256(tmp_real);
0817                 batch_type imag = _mm256_castps128_ps256(tmp_imag);
0818 
0819                 tmp0 = _mm256_extractf128_ps(lo, 0);
0820                 tmp1 = _mm256_extractf128_ps(lo, 1);
0821                 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0));
0822                 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
0823                 real = _mm256_insertf128_ps(real, tmp_real, 1);
0824                 imag = _mm256_insertf128_ps(imag, tmp_imag, 1);
0825                 return { real, imag };
0826             }
0827             template <class A>
0828             XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx>) noexcept
0829             {
0830                 using batch_type = batch<double, A>;
0831                 __m128d tmp0 = _mm256_extractf128_pd(hi, 0);
0832                 __m128d tmp1 = _mm256_extractf128_pd(hi, 1);
0833                 batch_type real = _mm256_castpd128_pd256(_mm_unpacklo_pd(tmp0, tmp1));
0834                 batch_type imag = _mm256_castpd128_pd256(_mm_unpackhi_pd(tmp0, tmp1));
0835 
0836                 tmp0 = _mm256_extractf128_pd(lo, 0);
0837                 tmp1 = _mm256_extractf128_pd(lo, 1);
0838                 __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1);
0839                 __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1);
0840                 real = _mm256_blend_pd(real, re_tmp1, 12);
0841                 imag = _mm256_blend_pd(imag, im_tmp1, 12);
0842                 return { real, imag };
0843             }
0844         }
0845 
0846         // load_unaligned
0847         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0848         XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
0849         {
0850             return _mm256_loadu_si256((__m256i const*)mem);
0851         }
0852         template <class A>
0853         XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
0854         {
0855             return _mm256_loadu_ps(mem);
0856         }
0857         template <class A>
0858         XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
0859         {
0860             return _mm256_loadu_pd(mem);
0861         }
0862 
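        // Example (illustrative): load_aligned requires 32-byte alignment for AVX
        // batches; load_unaligned accepts any address.
        //
        //     alignas(32) float buf[16] = { 0, 1, 2, 3, 4, 5, 6, 7 };
        //     auto a = xsimd::batch<float, xsimd::avx>::load_aligned(buf);
        //     auto u = xsimd::batch<float, xsimd::avx>::load_unaligned(buf + 1);
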
0863         // lt
0864         template <class A>
0865         XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0866         {
0867             return _mm256_cmp_ps(self, other, _CMP_LT_OQ);
0868         }
0869         template <class A>
0870         XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0871         {
0872             return _mm256_cmp_pd(self, other, _CMP_LT_OQ);
0873         }
0874 
0875         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0876         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0877         {
0878             return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
0879                                       { return lt(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
0880                                       self, other);
0881         }
0882 
0883         // mask
0884         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0885         XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
0886         {
0887             XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
0888             {
0889                 __m128i self_low, self_high;
0890                 detail::split_avx(self, self_low, self_high);
0891                 return mask(batch_bool<T, sse4_2>(self_low), sse4_2 {}) | (mask(batch_bool<T, sse4_2>(self_high), sse4_2 {}) << (128 / (8 * sizeof(T))));
0892             }
0893             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0894             {
0895                 return _mm256_movemask_ps(_mm256_castsi256_ps(self));
0896             }
0897             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0898             {
0899                 return _mm256_movemask_pd(_mm256_castsi256_pd(self));
0900             }
0901             else
0902             {
0903                 assert(false && "unsupported arch/op combination");
0904                 return {};
0905             }
0906         }
0907         template <class A>
0908         XSIMD_INLINE uint64_t mask(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
0909         {
0910             return _mm256_movemask_ps(self);
0911         }
0912 
0913         template <class A>
0914         XSIMD_INLINE uint64_t mask(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
0915         {
0916             return _mm256_movemask_pd(self);
0917         }
0918 
0919         // max
0920         template <class A>
0921         XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0922         {
0923             return _mm256_max_ps(self, other);
0924         }
0925         template <class A>
0926         XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0927         {
0928             return _mm256_max_pd(self, other);
0929         }
0930         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0931         XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0932         {
0933             return select(self > other, self, other);
0934         }
0935 
0936         // min
0937         template <class A>
0938         XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0939         {
0940             return _mm256_min_ps(self, other);
0941         }
0942         template <class A>
0943         XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0944         {
0945             return _mm256_min_pd(self, other);
0946         }
0947         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0948         XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
0949         {
0950             return select(self <= other, self, other);
0951         }
0952 
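        // Example (illustrative): min/max are elementwise; integer batches fall
        // back to a compare-and-select.
        //
        //     xsimd::batch<int32_t, xsimd::avx> a(1), b(2);
        //     auto lo = xsimd::min(a, b); // lanes of 1
        //     auto hi = xsimd::max(a, b); // lanes of 2
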
0953         // mul
0954         template <class A>
0955         XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
0956         {
0957             return _mm256_mul_ps(self, other);
0958         }
0959         template <class A>
0960         XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
0961         {
0962             return _mm256_mul_pd(self, other);
0963         }
0964 
0965         // nearbyint
0966         template <class A>
0967         XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx>) noexcept
0968         {
0969             return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
0970         }
0971         template <class A>
0972         XSIMD_INLINE batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx>) noexcept
0973         {
0974             return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
0975         }
0976 
0977         // nearbyint_as_int
0978         template <class A>
0979         XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
0980                                                         requires_arch<avx>) noexcept
0981         {
0982             return _mm256_cvtps_epi32(self);
0983         }
0984 
0985         // neg
0986         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0987         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<avx>) noexcept
0988         {
0989             return 0 - self;
0990         }
0991         template <class A>
0992         XSIMD_INLINE batch<float, A> neg(batch<float, A> const& self, requires_arch<avx>) noexcept
0993         {
0994             return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
0995         }
0996         template <class A>
0997         XSIMD_INLINE batch<double, A> neg(batch<double, A> const& self, requires_arch<avx>) noexcept
0998         {
0999             return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000)));
1000         }
1001 
1002         // neq
1003         template <class A>
1004         XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
1005         {
1006             return _mm256_cmp_ps(self, other, _CMP_NEQ_UQ);
1007         }
1008         template <class A>
1009         XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
1010         {
1011             return _mm256_cmp_pd(self, other, _CMP_NEQ_UQ);
1012         }
1013         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1014         XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
1015         {
1016             return ~(self == other);
1017         }
1018 
1019         template <class A>
1020         XSIMD_INLINE batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
1021         {
1022             return _mm256_xor_ps(self, other);
1023         }
1024         template <class A>
1025         XSIMD_INLINE batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
1026         {
1027             return _mm256_xor_pd(self, other);
1028         }
1029         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1030         XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
1031         {
1032             return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(self.data), _mm256_castsi256_ps(other.data)));
1033         }
1034 
1035         // reciprocal
1036         template <class A>
1037         XSIMD_INLINE batch<float, A> reciprocal(batch<float, A> const& self,
1038                                                 kernel::requires_arch<avx>) noexcept
1039         {
1040             return _mm256_rcp_ps(self);
1041         }
1042 
1043         // reduce_add
1044         template <class A>
1045         XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx>) noexcept
1046         {
1047             // Warning about _mm256_hadd_ps:
1048             // _mm256_hadd_ps(a,b) gives
1049             // (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7), so a naive
1050             // chain of hadd calls does not reduce the full 256-bit register.
1051             // rhs = (x0, x1, x2, x3, x4, x5, x6, x7)
1052             // tmp = (x4, x5, x6, x7, x0, x1, x2, x3)
1053             __m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1);
1054             // tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7)
1055             tmp = _mm256_add_ps(rhs, tmp);
1056             // tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -)
1057             tmp = _mm256_hadd_ps(tmp, tmp);
1058             // tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -)
1059             tmp = _mm256_hadd_ps(tmp, tmp);
1060             return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
1061         }
1062         template <class A>
1063         XSIMD_INLINE double reduce_add(batch<double, A> const& rhs, requires_arch<avx>) noexcept
1064         {
1065             // rhs = (x0, x1, x2, x3)
1066             // tmp = (x2, x3, x0, x1)
1067             __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
1068             // tmp = (x2+x0, x3+x1, -, -)
1069             tmp = _mm256_add_pd(rhs, tmp);
1070             // tmp = (x2+x0+x3+x1, -, -, -)
1071             tmp = _mm256_hadd_pd(tmp, tmp);
1072             return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
1073         }
1074         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1075         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
1076         {
1077             __m128i low, high;
1078             detail::split_avx(self, low, high);
1079             batch<T, sse4_2> blow(low), bhigh(high);
1080             return reduce_add(blow) + reduce_add(bhigh);
1081         }
1082 
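        // Example (illustrative): reduce_add collapses a batch to a scalar sum.
        //
        //     xsimd::batch<float, xsimd::avx> x(1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f);
        //     float s = xsimd::reduce_add(x); // 36.f
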
1083         // reduce_max
1084         template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
1085         XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<avx>) noexcept
1086         {
1087             constexpr auto mask = detail::shuffle(1, 0);
1088             batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
1089             batch<T, A> acc = max(self, step);
1090             __m128i low = _mm256_castsi256_si128(acc);
1091             return reduce_max(batch<T, sse4_2>(low));
1092         }
1093 
1094         // reduce_min
1095         template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
1096         XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<avx>) noexcept
1097         {
1098             constexpr auto mask = detail::shuffle(1, 0);
1099             batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
1100             batch<T, A> acc = min(self, step);
1101             __m128i low = _mm256_castsi256_si128(acc);
1102             return reduce_min(batch<T, sse4_2>(low));
1103         }
1104 
1105         // rsqrt
1106         template <class A>
1107         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
1108         {
1109             return _mm256_rsqrt_ps(val);
1110         }
1111         template <class A>
1112         XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
1113         {
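                 // NOTE: this goes through single precision (_mm_rsqrt_ps), so the result
                 // is only a rough approximation even though the input is double.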
1114             return _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(val)));
1115         }
1116 
1117         // sadd
1118         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1119         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
1120         {
1121             if (std::is_signed<T>::value)
1122             {
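                     // For signed T, `mask` is all ones when `other` is negative (arithmetic
                     // shift of the sign bit) and all zeros otherwise. Clamping `self` against
                     // max - other (resp. min - other) before the final addition keeps
                     // other + self inside [min, max], i.e. the addition saturates.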
1123                 auto mask = (other >> (8 * sizeof(T) - 1));
1124                 auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
1125                 auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
1126                 return other + select(batch_bool<T, A>(mask.data), self_neg_branch, self_pos_branch);
1127             }
1128             else
1129             {
1130                 const auto diffmax = std::numeric_limits<T>::max() - self;
1131                 const auto mindiff = min(diffmax, other);
1132                 return self + mindiff;
1133             }
1134         }
1135 
1136         // select
1137         template <class A>
1138         XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
1139         {
1140             return _mm256_blendv_ps(false_br, true_br, cond);
1141         }
1142         template <class A>
1143         XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
1144         {
1145             return _mm256_blendv_pd(false_br, true_br, cond);
1146         }
1147         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1148         XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
1149         {
1150             __m128i cond_low, cond_hi;
1151             detail::split_avx(cond, cond_low, cond_hi);
1152 
1153             __m128i true_low, true_hi;
1154             detail::split_avx(true_br, true_low, true_hi);
1155 
1156             __m128i false_low, false_hi;
1157             detail::split_avx(false_br, false_low, false_hi);
1158 
1159             __m128i res_low = select(batch_bool<T, sse4_2>(cond_low), batch<T, sse4_2>(true_low), batch<T, sse4_2>(false_low), sse4_2 {});
1160             __m128i res_hi = select(batch_bool<T, sse4_2>(cond_hi), batch<T, sse4_2>(true_hi), batch<T, sse4_2>(false_hi), sse4_2 {});
1161             return detail::merge_sse(res_low, res_hi);
1162         }
1163         template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1164         XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
1165         {
1166             return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
1167         }
1168 
1169         template <class A, bool... Values>
1170         XSIMD_INLINE batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
1171         {
1172             constexpr auto mask = batch_bool_constant<float, A, Values...>::mask();
1173             return _mm256_blend_ps(false_br, true_br, mask);
1174         }
1175 
1176         template <class A, bool... Values>
1177         XSIMD_INLINE batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
1178         {
1179             constexpr auto mask = batch_bool_constant<double, A, Values...>::mask();
1180             return _mm256_blend_pd(false_br, true_br, mask);
1181         }
1182 
1183         // set
1184         template <class A, class... Values>
1185         XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<avx>, Values... values) noexcept
1186         {
1187             static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
1188             return _mm256_setr_ps(values...);
1189         }
1190 
1191         template <class A, class... Values>
1192         XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<avx>, Values... values) noexcept
1193         {
1194             static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
1195             return _mm256_setr_pd(values...);
1196         }
1197         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1198         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3) noexcept
1199         {
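                 // _mm256_set_epi64x takes its arguments from the most significant element
                 // down to the least significant one, hence the reversed order.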
1200             return _mm256_set_epi64x(v3, v2, v1, v0);
1201         }
1202         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1203         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
1204         {
1205             return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
1206         }
1207         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1208         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
1209         {
1210             return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
1211         }
1212         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1213         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
1214                                      T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
1215         {
1216             return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
1217         }
1218 
1219         template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1220         XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx>, Values... values) noexcept
1221         {
1222             return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
1223         }
1224 
1225         template <class A, class... Values>
1226         XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<avx>, Values... values) noexcept
1227         {
1228             static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
1229             return _mm256_castsi256_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data);
1230         }
1231 
1232         template <class A, class... Values>
1233         XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<avx>, Values... values) noexcept
1234         {
1235             static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
1236             return _mm256_castsi256_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data);
1237         }
1238 
1239         // shuffle
1240         template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
1241         XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
1242         {
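                 // Mask indices 0-7 select from x, 8-15 from y. _mm256_shuffle_ps applies the
                 // same 4-element pattern to both 128-bit lanes and can only take its first two
                 // picks per lane from the first operand and its last two from the second,
                 // hence the index checks below.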
1243             constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
1244             // per-lane shuffle with x as first operand and y as second
1245             if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I0 < 4 && I1 < 4 && I2 >= 8 && I2 < 12 && I3 >= 8 && I3 < 12)
1246                 return _mm256_shuffle_ps(x, y, smask);
1247 
1248             // same pattern with the operands swapped
1249             if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I2 < 4 && I3 < 4 && I0 >= 8 && I0 < 12 && I1 >= 8 && I1 < 12)
1250                 return _mm256_shuffle_ps(y, x, smask);
1251 
1252             return shuffle(x, y, mask, generic {});
1253         }
1254 
1255         template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
1256         XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
1257         {
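                 // Mask indices 0-3 select from x, 4-7 from y. _mm256_shuffle_pd takes, per
                 // 128-bit lane, one element from each operand, hence the index checks below.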
1258             constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3);
1259             // per-lane shuffle with x as first operand and y as second
1260             if (I0 < 2 && I1 >= 4 && I1 < 6 && I2 >= 2 && I2 < 4 && I3 >= 6)
1261                 return _mm256_shuffle_pd(x, y, smask);
1262 
1263             // same pattern with the operands swapped
1264             if (I1 < 2 && I0 >= 4 && I0 < 6 && I3 >= 2 && I3 < 4 && I2 >= 6)
1265                 return _mm256_shuffle_pd(y, x, smask);
1266 
1267             return shuffle(x, y, mask, generic {});
1268         }
1269 
1270         // slide_left
1271         template <size_t N, class A, class T>
1272         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx>) noexcept
1273         {
1274             constexpr unsigned BitCount = N * 8;
1275             if (BitCount == 0)
1276             {
1277                 return x;
1278             }
1279             if (BitCount >= 256)
1280             {
1281                 return batch<T, A>(T(0));
1282             }
1283             if (BitCount > 128)
1284             {
1285                 constexpr unsigned M = (BitCount - 128) / 8;
1286                 __m128i low = _mm256_castsi256_si128(x);
1287                 auto y = _mm_slli_si128(low, M);
1288                 __m256i zero = _mm256_setzero_si256();
1289                 return _mm256_insertf128_si256(zero, y, 1);
1290             }
1291             if (BitCount == 128)
1292             {
1293                 __m128i low = _mm256_castsi256_si128(x);
1294                 __m256i zero = _mm256_setzero_si256();
1295                 return _mm256_insertf128_si256(zero, low, 1);
1296             }
1297             // shifting by 8 to 120 bits (the 0, 128 and >128 bit cases are handled above)
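                 // Bits shifted out of the low 128-bit half (zlow) must be carried into the
                 // high half by hand, because the 128-bit byte shifts used here cannot move
                 // bytes across the lane boundary.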
1298             constexpr unsigned M = BitCount / 8;
1299 
1300             __m128i low = _mm256_castsi256_si128(x);
1301             auto ylow = _mm_slli_si128(low, M);
1302             auto zlow = _mm_srli_si128(low, 16 - M);
1303 
1304             __m128i high = _mm256_extractf128_si256(x, 1);
1305             auto yhigh = _mm_slli_si128(high, M);
1306 
1307             __m256i res = _mm256_castsi128_si256(ylow);
1308             return _mm256_insertf128_si256(res, _mm_or_si128(yhigh, zlow), 1);
1309         }
1310 
1311         // slide_right
1312         template <size_t N, class A, class T>
1313         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx>) noexcept
1314         {
1315             constexpr unsigned BitCount = N * 8;
1316             if (BitCount == 0)
1317             {
1318                 return x;
1319             }
1320             if (BitCount >= 256)
1321             {
1322                 return batch<T, A>(T(0));
1323             }
1324             if (BitCount > 128)
1325             {
1326                 constexpr unsigned M = (BitCount - 128) / 8;
1327                 __m128i high = _mm256_extractf128_si256(x, 1);
1328                 __m128i y = _mm_srli_si128(high, M);
1329                 __m256i zero = _mm256_setzero_si256();
1330                 return _mm256_insertf128_si256(zero, y, 0);
1331             }
1332             if (BitCount == 128)
1333             {
1334                 __m128i high = _mm256_extractf128_si256(x, 1);
1335                 return _mm256_castsi128_si256(high);
1336             }
1337             // shifting by 8 to 120 bits (the 0, 128 and >128 bit cases are handled above)
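                 // Mirror of slide_left: bits shifted out of the high 128-bit half (zhigh)
                 // are carried into the low half by hand.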
1338             constexpr unsigned M = BitCount / 8;
1339 
1340             __m128i low = _mm256_castsi256_si128(x);
1341             auto ylow = _mm_srli_si128(low, M);
1342 
1343             __m128i high = _mm256_extractf128_si256(x, 1);
1344             auto yhigh = _mm_srli_si128(high, M);
1345             auto zhigh = _mm_slli_si128(high, 16 - M);
1346 
1347             __m256i res = _mm256_castsi128_si256(_mm_or_si128(ylow, zhigh));
1348             return _mm256_insertf128_si256(res, yhigh, 1);
1349         }
1350 
1351         // sqrt
1352         template <class A>
1353         XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
1354         {
1355             return _mm256_sqrt_ps(val);
1356         }
1357         template <class A>
1358         XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
1359         {
1360             return _mm256_sqrt_pd(val);
1361         }
1362 
1363         // ssub
1364         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1365         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
1366         {
1367             if (std::is_signed<T>::value)
1368             {
1369                 return sadd(self, -other);
1370             }
1371             else
1372             {
1373                 const auto diff = min(self, other);
1374                 return self - diff;
1375             }
1376         }
1377 
1378         // store_aligned
1379         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1380         XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
1381         {
1382             return _mm256_store_si256((__m256i*)mem, self);
1383         }
1384         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1385         XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
1386         {
1387             return _mm256_store_si256((__m256i*)mem, self);
1388         }
1389         template <class A>
1390         XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
1391         {
1392             return _mm256_store_ps(mem, self);
1393         }
1394         template <class A>
1395         XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
1396         {
1397             return _mm256_store_pd(mem, self);
1398         }
1399 
1400         // store_unaligned
1401         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1402         XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
1403         {
1404             return _mm256_storeu_si256((__m256i*)mem, self);
1405         }
1406         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1407         XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
1408         {
1409             return _mm256_storeu_si256((__m256i*)mem, self);
1410         }
1411         template <class A>
1412         XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
1413         {
1414             return _mm256_storeu_ps(mem, self);
1415         }
1416         template <class A>
1417         XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
1418         {
1419             return _mm256_storeu_pd(mem, self);
1420         }
1421 
1422         // sub
1423         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1424         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
1425         {
1426             return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
1427                                       { return sub(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
1428                                       self, other);
1429         }
1430         template <class A>
1431         XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
1432         {
1433             return _mm256_sub_ps(self, other);
1434         }
1435         template <class A>
1436         XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
1437         {
1438             return _mm256_sub_pd(self, other);
1439         }
1440 
1441         // swizzle (dynamic mask)
1442         template <class A>
1443         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
1444         {
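                 // _mm256_permutevar_ps only permutes within each 128-bit lane, so build
                 // (low, low) and (high, high) copies of the input, permute both with the
                 // lane-local part of the mask, and blend depending on which source lane
                 // each index refers to.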
1445             // duplicate low and high part of input
1446             __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
1447             __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
1448 
1449             __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
1450             __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
1451 
1452             // normalize mask
1453             batch<uint32_t, A> half_mask = mask % 4;
1454 
1455             // permute within each lane
1456             __m256 r0 = _mm256_permutevar_ps(low_low, half_mask);
1457             __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask);
1458 
1459             // mask to choose the right lane
1460             batch_bool<uint32_t, A> blend_mask = mask >= 4;
1461 
1462             // blend the two permutes
1463             return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
1464         }
1465 
1466         template <class A>
1467         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
1468         {
1469             // duplicate low and high part of input
1470             __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
1471             __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
1472 
1473             __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
1474             __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
1475 
1476             // normalize mask
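                 // _mm256_permutevar_pd reads its selector from bit 1 of each 64-bit element,
                 // so -(mask & 1) turns the 0/1 index into 0/all-ones, which sets that bit
                 // exactly when the odd element is requested.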
1477             batch<uint64_t, A> half_mask = -(mask & 1);
1478 
1479             // permute within each lane
1480             __m256d r0 = _mm256_permutevar_pd(low_low, half_mask);
1481             __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask);
1482 
1483             // mask to choose the right lane
1484             batch_bool<uint64_t, A> blend_mask = mask >= 2;
1485 
1486             // blend the two permutes
1487             return _mm256_blendv_pd(r0, r1, batch_bool_cast<double>(blend_mask));
1488         }
1489 
1490         template <class A, typename T, detail::enable_sized_integral_t<T, 4> = 0>
1491         XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint32_t, A> const& mask, requires_arch<avx>) noexcept
1492         {
1493             return bitwise_cast<T>(
1494                 swizzle(bitwise_cast<float>(self), mask));
1495         }
1496 
1497         template <class A, typename T, detail::enable_sized_integral_t<T, 8> = 0>
1498         XSIMD_INLINE batch<T, A>
1499         swizzle(batch<T, A> const& self, batch<uint64_t, A> const& mask, requires_arch<avx>) noexcept
1500         {
1501             return bitwise_cast<T>(
1502                 swizzle(bitwise_cast<double>(self), mask));
1503         }
1504 
1505         // swizzle (constant mask)
1506         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
1507         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
1508         {
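                 // Same lane-duplication strategy as the dynamic swizzle above, except that
                 // the lane-local selectors and the blend mask are computed at compile time.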
1509             // duplicate low and high part of input
1510             __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
1511             __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
1512 
1513             __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
1514             __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
1515 
1516             // normalize mask
1517             batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
1518 
1519             // permute within each lane
1520             __m256 r0 = _mm256_permutevar_ps(low_low, half_mask.as_batch());
1521             __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask.as_batch());
1522 
1523             // mask to choose the right lane
1524             batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
1525 
1526             // blend the two permutes
1527             constexpr auto mask = blend_mask.mask();
1528             return _mm256_blend_ps(r0, r1, mask);
1529         }
1530 
1531         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
1532         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx>) noexcept
1533         {
1534             // duplicate low and high part of input
1535             __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
1536             __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
1537 
1538             __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
1539             __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
1540 
1541             // normalize mask
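                 // (V % 2) * -1 yields 0 or all-ones, setting bit 1 (the selector read by
                 // _mm256_permutevar_pd) exactly when the odd element is requested.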
1542             batch_constant<uint64_t, A, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
1543 
1544             // permute within each lane
1545             __m256d r0 = _mm256_permutevar_pd(low_low, half_mask.as_batch());
1546             __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask.as_batch());
1547 
1548             // mask to choose the right lane
1549             batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
1550 
1551             // blend the two permutes
1552             constexpr auto mask = blend_mask.mask();
1553             return _mm256_blend_pd(r0, r1, mask);
1554         }
1555         template <class A,
1556                   typename T,
1557                   uint32_t V0,
1558                   uint32_t V1,
1559                   uint32_t V2,
1560                   uint32_t V3,
1561                   uint32_t V4,
1562                   uint32_t V5,
1563                   uint32_t V6,
1564                   uint32_t V7,
1565                   detail::enable_sized_integral_t<T, 4> = 0>
1566         XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self,
1567                                          batch_constant<uint32_t, A,
1568                                                         V0,
1569                                                         V1,
1570                                                         V2,
1571                                                         V3,
1572                                                         V4,
1573                                                         V5,
1574                                                         V6,
1575                                                         V7> const& mask,
1576                                          requires_arch<avx>) noexcept
1577         {
1578             return bitwise_cast<T>(
1579                 swizzle(bitwise_cast<float>(self), mask));
1580         }
1581 
1582         template <class A,
1583                   typename T,
1584                   uint64_t V0,
1585                   uint64_t V1,
1586                   uint64_t V2,
1587                   uint64_t V3,
1588                   detail::enable_sized_integral_t<T, 8> = 0>
1589         XSIMD_INLINE batch<T, A>
1590         swizzle(batch<T, A> const& self,
1591                 batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask,
1592                 requires_arch<avx>) noexcept
1593         {
1594             return bitwise_cast<T>(
1595                 swizzle(bitwise_cast<double>(self), mask));
1596         }
1597         // transpose
1598         template <class A>
1599         XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<avx>) noexcept
1600         {
1601             assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
1602             (void)matrix_end;
1603             // See
1604             // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2
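                 // The unpacks interleave pairs of rows within each 128-bit lane, the shuffles
                 // then gather one column of four rows per lane, and the final permute2f128
                 // calls concatenate the matching lanes of the lower and upper row groups.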
1605             auto r0 = matrix_begin[0], r1 = matrix_begin[1],
1606                  r2 = matrix_begin[2], r3 = matrix_begin[3],
1607                  r4 = matrix_begin[4], r5 = matrix_begin[5],
1608                  r6 = matrix_begin[6], r7 = matrix_begin[7];
1609 
1610             auto t0 = _mm256_unpacklo_ps(r0, r1);
1611             auto t1 = _mm256_unpackhi_ps(r0, r1);
1612             auto t2 = _mm256_unpacklo_ps(r2, r3);
1613             auto t3 = _mm256_unpackhi_ps(r2, r3);
1614             auto t4 = _mm256_unpacklo_ps(r4, r5);
1615             auto t5 = _mm256_unpackhi_ps(r4, r5);
1616             auto t6 = _mm256_unpacklo_ps(r6, r7);
1617             auto t7 = _mm256_unpackhi_ps(r6, r7);
1618 
1619             r0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0));
1620             r1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2));
1621             r2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));
1622             r3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));
1623             r4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0));
1624             r5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2));
1625             r6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));
1626             r7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));
1627 
1628             matrix_begin[0] = _mm256_permute2f128_ps(r0, r4, 0x20);
1629             matrix_begin[1] = _mm256_permute2f128_ps(r1, r5, 0x20);
1630             matrix_begin[2] = _mm256_permute2f128_ps(r2, r6, 0x20);
1631             matrix_begin[3] = _mm256_permute2f128_ps(r3, r7, 0x20);
1632             matrix_begin[4] = _mm256_permute2f128_ps(r0, r4, 0x31);
1633             matrix_begin[5] = _mm256_permute2f128_ps(r1, r5, 0x31);
1634             matrix_begin[6] = _mm256_permute2f128_ps(r2, r6, 0x31);
1635             matrix_begin[7] = _mm256_permute2f128_ps(r3, r7, 0x31);
1636         }
1637 
1638         template <class A>
1639         XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<avx>) noexcept
1640         {
1641             return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
1642         }
1643         template <class A>
1644         XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<avx>) noexcept
1645         {
1646             return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
1647         }
1648 
1649         template <class A>
1650         XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<avx>) noexcept
1651         {
1652             assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
1653             (void)matrix_end;
1654             auto r0 = matrix_begin[0], r1 = matrix_begin[1],
1655                  r2 = matrix_begin[2], r3 = matrix_begin[3];
1656 
1657             auto t0 = _mm256_unpacklo_pd(r0, r1); // r00 r10 r01 r11
1658             auto t1 = _mm256_unpackhi_pd(r0, r1); // r02 r12 r03 r13
1659             auto t2 = _mm256_unpacklo_pd(r2, r3); // r20 r30 r21 r31
1660             auto t3 = _mm256_unpackhi_pd(r2, r3); // r22 r32 r23 r33
1661 
1662             matrix_begin[0] = _mm256_permute2f128_pd(t0, t2, 0x20);
1663             matrix_begin[1] = _mm256_permute2f128_pd(t1, t3, 0x20);
1664             matrix_begin[2] = _mm256_permute2f128_pd(t0, t2, 0x31);
1665             matrix_begin[3] = _mm256_permute2f128_pd(t1, t3, 0x31);
1666         }
1667 
1668         template <class A>
1669         XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<avx>) noexcept
1670         {
1671             return transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
1672         }
1673         template <class A>
1674         XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<avx>) noexcept
1675         {
1676             return transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
1677         }
1678 
1679         // trunc
1680         template <class A>
1681         XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept
1682         {
1683             return _mm256_round_ps(self, _MM_FROUND_TO_ZERO);
1684         }
1685         template <class A>
1686         XSIMD_INLINE batch<double, A> trunc(batch<double, A> const& self, requires_arch<avx>) noexcept
1687         {
1688             return _mm256_round_pd(self, _MM_FROUND_TO_ZERO);
1689         }
1690 
1691         // zip_hi
1692         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1693         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
1694         {
1695             XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
1696             {
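                     // AVX (without AVX2) has no 256-bit integer unpack, so interleave the two
                     // high 128-bit halves with SSE intrinsics and rebuild a 256-bit result.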
1697                 // extract the high 128-bit halves
1698                 __m128i self_hi = _mm256_extractf128_si256(self, 1);
1699                 __m128i other_hi = _mm256_extractf128_si256(other, 1);
1700 
1701                 // interleave
1702                 __m128i res_lo, res_hi;
1703                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1704                 {
1705                     res_lo = _mm_unpacklo_epi8(self_hi, other_hi);
1706                     res_hi = _mm_unpackhi_epi8(self_hi, other_hi);
1707                 }
1708                 else
1709                 {
1710                     res_lo = _mm_unpacklo_epi16(self_hi, other_hi);
1711                     res_hi = _mm_unpackhi_epi16(self_hi, other_hi);
1712                 }
1713 
1714                 // fuse
1715                 return _mm256_castps_si256(
1716                     _mm256_insertf128_ps(
1717                         _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)),
1718                         _mm_castsi128_ps(res_hi),
1719                         1));
1720             }
1721             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1722             {
1723                 auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
1724                 auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
1725                 return _mm256_castps_si256(_mm256_permute2f128_ps(lo, hi, 0x31));
1726             }
1727             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1728             {
1729                 auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
1730                 auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
1731                 return _mm256_castpd_si256(_mm256_permute2f128_pd(lo, hi, 0x31));
1732             }
1733             else
1734             {
1735                 assert(false && "unsupported arch/op combination");
1736                 return {};
1737             }
1738         }
1739         template <class A>
1740         XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
1741         {
1742             auto lo = _mm256_unpacklo_ps(self, other);
1743             auto hi = _mm256_unpackhi_ps(self, other);
1744             return _mm256_permute2f128_ps(lo, hi, 0x31);
1745         }
1746         template <class A>
1747         XSIMD_INLINE batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
1748         {
1749             auto lo = _mm256_unpacklo_pd(self, other);
1750             auto hi = _mm256_unpackhi_pd(self, other);
1751             return _mm256_permute2f128_pd(lo, hi, 0x31);
1752         }
1753 
1754         // zip_lo
1755         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1756         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
1757         {
1758             XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
1759             {
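                     // Same approach as zip_hi, applied to the low 128-bit halves.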
1760                 // extract the low 128-bit halves
1761                 __m128i self_lo = _mm256_extractf128_si256(self, 0);
1762                 __m128i other_lo = _mm256_extractf128_si256(other, 0);
1763 
1764                 // interleave
1765                 __m128i res_lo, res_hi;
1766                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1767                 {
1768                     res_lo = _mm_unpacklo_epi8(self_lo, other_lo);
1769                     res_hi = _mm_unpackhi_epi8(self_lo, other_lo);
1770                 }
1771                 else
1772                 {
1773                     res_lo = _mm_unpacklo_epi16(self_lo, other_lo);
1774                     res_hi = _mm_unpackhi_epi16(self_lo, other_lo);
1775                 }
1776 
1777                 // fuse
1778                 return _mm256_castps_si256(
1779                     _mm256_insertf128_ps(
1780                         _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)),
1781                         _mm_castsi128_ps(res_hi),
1782                         1));
1783             }
1784             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1785             {
1786                 auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
1787                 auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
1788                 return _mm256_castps_si256(_mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1));
1789             }
1790             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1791             {
1792                 auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
1793                 auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
1794                 return _mm256_castpd_si256(_mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1));
1795             }
1796             else
1797             {
1798                 assert(false && "unsupported arch/op combination");
1799                 return {};
1800             }
1801         }
1802 
1803         template <class A>
1804         XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
1805         {
1806             auto lo = _mm256_unpacklo_ps(self, other);
1807             auto hi = _mm256_unpackhi_ps(self, other);
1808             return _mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1);
1809         }
1810         template <class A>
1811         XSIMD_INLINE batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
1812         {
1813             auto lo = _mm256_unpacklo_pd(self, other);
1814             auto hi = _mm256_unpackhi_pd(self, other);
1815             return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1);
1816         }
1817     }
1818 }
1819 
1820 #endif