0001 /***************************************************************************
0002  * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
0003  * Martin Renou                                                             *
0004  * Copyright (c) QuantStack                                                 *
0005  * Copyright (c) Serge Guelton                                              *
0006  *                                                                          *
0007  * Distributed under the terms of the BSD 3-Clause License.                 *
0008  *                                                                          *
0009  * The full license is in the file LICENSE, distributed with this software. *
0010  ****************************************************************************/
0011 
0012 #ifndef XSIMD_SSE2_HPP
0013 #define XSIMD_SSE2_HPP
0014 
0015 #include <complex>
0016 #include <limits>
0017 #include <type_traits>
0018 
0019 #include "../types/xsimd_sse2_register.hpp"
0020 
0021 namespace xsimd
0022 {
0023     template <typename T, class A, bool... Values>
0024     struct batch_bool_constant;
0025 
0026     template <class T_out, class T_in, class A>
0027     XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
0028 
0029     template <typename T, class A, T... Values>
0030     struct batch_constant;
0031 
0032     namespace kernel
0033     {
0034         using namespace types;
0035 
0036         namespace detail
0037         {
0038             constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
0039             {
0040                 return (z << 6) | (y << 4) | (x << 2) | w;
0041             }
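                 // e.g. shuffle(0, 1, 2, 3) == 0xE4 == _MM_SHUFFLE(3, 2, 1, 0), the
                 // identity permutation for _mm_shuffle_ps / _mm_shuffle_epi32: the
                 // first argument selects the lowest lane.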
0042             constexpr uint32_t shuffle(uint32_t x, uint32_t y)
0043             {
0044                 return (y << 1) | x;
0045             }
0046 
0047             constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
0048             {
0049                 return shuffle(w % 4, x % 4, y % 4, z % 4);
0050             }
0051 
0052             constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x)
0053             {
0054                 return shuffle(w % 2, x % 2);
0055             }
0056         }
0057 
0058         // fwd
0059         template <class A, class T, size_t I>
0060         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
0061         template <class A, typename T, typename ITy, ITy... Indices>
0062         XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
0063         template <class A, class T>
0064         XSIMD_INLINE batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
0065         template <class A, class T>
0066         XSIMD_INLINE batch<T, A> avgr(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
0067 
0068         // abs
0069         template <class A>
0070         XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<sse2>) noexcept
0071         {
0072             __m128d sign_mask = _mm_set1_pd(-0.f); // -0.f promotes to -0.0: only the sign bit (bit 63) is set
0073             return _mm_andnot_pd(sign_mask, self);
0074         }
0075         template <class A>
0076         XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<sse2>) noexcept
0077         {
0078             __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
0079             return _mm_andnot_ps(sign_mask, self);
0080         }
0081 
0082         // add
0083         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0084         XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0085         {
0086             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0087             {
0088                 return _mm_add_epi8(self, other);
0089             }
0090             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0091             {
0092                 return _mm_add_epi16(self, other);
0093             }
0094             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0095             {
0096                 return _mm_add_epi32(self, other);
0097             }
0098             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0099             {
0100                 return _mm_add_epi64(self, other);
0101             }
0102             else
0103             {
0104                 assert(false && "unsupported arch/op combination");
0105                 return {};
0106             }
0107         }
0108 
0109         template <class A>
0110         XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0111         {
0112             return _mm_add_ps(self, other);
0113         }
0114 
0115         template <class A>
0116         XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0117         {
0118             return _mm_add_pd(self, other);
0119         }
0120 
0121         // all
0122         template <class A>
0123         XSIMD_INLINE bool all(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
0124         {
0125             return _mm_movemask_ps(self) == 0x0F;
0126         }
0127         template <class A>
0128         XSIMD_INLINE bool all(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
0129         {
0130             return _mm_movemask_pd(self) == 0x03;
0131         }
0132         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0133         XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
0134         {
0135             return _mm_movemask_epi8(self) == 0xFFFF;
0136         }
0137 
0138         // any
0139         template <class A>
0140         XSIMD_INLINE bool any(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
0141         {
0142             return _mm_movemask_ps(self) != 0;
0143         }
0144         template <class A>
0145         XSIMD_INLINE bool any(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
0146         {
0147             return _mm_movemask_pd(self) != 0;
0148         }
0149         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0150         XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
0151         {
0152             return _mm_movemask_epi8(self) != 0;
0153         }
0154 
0155         // avgr
0156         template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
0157         XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0158         {
0159             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0160             {
0161                 return _mm_avg_epu8(self, other);
0162             }
0163             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0164             {
0165                 return _mm_avg_epu16(self, other);
0166             }
0167             else
0168             {
0169                 return avgr(self, other, generic {});
0170             }
0171         }
0172 
0173         // avg
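             // The branches below rely on floor((a + b) / 2) == avgr(a, b) - ((a ^ b) & 1),
             // where avgr rounds up; the ((self ^ other) << k) >> k pair with
             // k = lane_bits - 1 merely isolates that low bit in every lane.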
0174         template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
0175         XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0176         {
0177             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0178             {
0179                 auto adj = ((self ^ other) << 7) >> 7;
0180                 return avgr(self, other, A {}) - adj;
0181             }
0182             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0183             {
0184                 auto adj = ((self ^ other) << 15) >> 15;
0185                 return avgr(self, other, A {}) - adj;
0186             }
0187             else
0188             {
0189                 return avg(self, other, generic {});
0190             }
0191         }
0192 
0193         // batch_bool_cast
0194         template <class A, class T_out, class T_in>
0195         XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<sse2>) noexcept
0196         {
0197             return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
0198         }
0199 
0200         // bitwise_and
0201         template <class A>
0202         XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0203         {
0204             return _mm_and_ps(self, other);
0205         }
0206         template <class A>
0207         XSIMD_INLINE batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
0208         {
0209             return _mm_and_ps(self, other);
0210         }
0211         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0212         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0213         {
0214             return _mm_and_si128(self, other);
0215         }
0216         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0217         XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
0218         {
0219             return _mm_and_si128(self, other);
0220         }
0221 
0222         template <class A>
0223         XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0224         {
0225             return _mm_and_pd(self, other);
0226         }
0227 
0228         template <class A>
0229         XSIMD_INLINE batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
0230         {
0231             return _mm_and_pd(self, other);
0232         }
0233 
0234         // bitwise_andnot
0235         template <class A>
0236         XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0237         {
0238             return _mm_andnot_ps(other, self);
0239         }
0240 
0241         template <class A>
0242         XSIMD_INLINE batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
0243         {
0244             return _mm_andnot_ps(other, self);
0245         }
0246         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0247         XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0248         {
0249             return _mm_andnot_si128(other, self);
0250         }
0251         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0252         XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
0253         {
0254             return _mm_andnot_si128(other, self);
0255         }
0256 
0257         template <class A>
0258         XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0259         {
0260             return _mm_andnot_pd(other, self);
0261         }
0262 
0263         template <class A>
0264         XSIMD_INLINE batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
0265         {
0266             return _mm_andnot_pd(other, self);
0267         }
0268 
0269         // bitwise_lshift
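             // SSE2 has no 8-bit shift, so the 1-byte case reuses the 32-bit shift and
             // masks out the bits dragged in from the neighbouring byte: 0xFF << other
             // keeps exactly the bits a genuine per-byte shift would produce.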
0270         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0271         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
0272         {
0273             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0274             {
0275                 return _mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other));
0276             }
0277             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0278             {
0279                 return _mm_slli_epi16(self, other);
0280             }
0281             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0282             {
0283                 return _mm_slli_epi32(self, other);
0284             }
0285             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0286             {
0287                 return _mm_slli_epi64(self, other);
0288             }
0289             else
0290             {
0291                 assert(false && "unsupported arch/op combination");
0292                 return {};
0293             }
0294         }
0295 
0296         // bitwise_not
0297         template <class A>
0298         XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<sse2>) noexcept
0299         {
0300             return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1)));
0301         }
0302         template <class A>
0303         XSIMD_INLINE batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
0304         {
0305             return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1)));
0306         }
0307         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0308         XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<sse2>) noexcept
0309         {
0310             return _mm_xor_si128(self, _mm_set1_epi32(-1));
0311         }
0312         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0313         XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
0314         {
0315             return _mm_xor_si128(self, _mm_set1_epi32(-1));
0316         }
0317         template <class A>
0318         XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<sse2>) noexcept
0319         {
0320             return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1)));
0321         }
0322         template <class A>
0323         XSIMD_INLINE batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
0324         {
0325             return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1)));
0326         }
0327 
0328         // bitwise_or
0329         template <class A>
0330         XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0331         {
0332             return _mm_or_ps(self, other);
0333         }
0334         template <class A>
0335         XSIMD_INLINE batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
0336         {
0337             return _mm_or_ps(self, other);
0338         }
0339         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0340         XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0341         {
0342             return _mm_or_si128(self, other);
0343         }
0344         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0345         XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
0346         {
0347             return _mm_or_si128(self, other);
0348         }
0349 
0350         template <class A>
0351         XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0352         {
0353             return _mm_or_pd(self, other);
0354         }
0355 
0356         template <class A>
0357         XSIMD_INLINE batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
0358         {
0359             return _mm_or_pd(self, other);
0360         }
0361 
0362         // bitwise_rshift
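             // SSE2 only provides 16/32-bit arithmetic shifts and 16/32/64-bit logical
             // shifts, so the signed 8-bit and 64-bit cases below rebuild the sign
             // extension by hand, and the unsigned 8-bit case mirrors bitwise_lshift
             // above with a byte mask.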
0363         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0364         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
0365         {
0366             if (std::is_signed<T>::value)
0367             {
0368                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0369                 {
0370                     __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF);
0371                     __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self);
0372                     __m128i res = _mm_srai_epi16(self, other);
0373                     return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res));
0374                 }
0375                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0376                 {
0377                     return _mm_srai_epi16(self, other);
0378                 }
0379                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0380                 {
0381                     return _mm_srai_epi32(self, other);
0382                 }
0383                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0384                 {
0385                     // from https://github.com/samyvilar/vect/blob/master/vect_128.h
0386                     return _mm_or_si128(
0387                         _mm_srli_epi64(self, other),
0388                         _mm_slli_epi64(
0389                             _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32),
0390                             64 - other));
0391                 }
0392                 else
0393                 {
0394                     assert(false && "unsupported arch/op combination");
0395                     return {};
0396                 }
0397             }
0398             else
0399             {
0400                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0401                 {
0402                     return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other));
0403                 }
0404                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0405                 {
0406                     return _mm_srli_epi16(self, other);
0407                 }
0408                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0409                 {
0410                     return _mm_srli_epi32(self, other);
0411                 }
0412                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0413                 {
0414                     return _mm_srli_epi64(self, other);
0415                 }
0416                 else
0417                 {
0418                     assert(false && "unsupported arch/op combination");
0419                     return {};
0420                 }
0421             }
0422         }
0423 
0424         // bitwise_xor
0425         template <class A>
0426         XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0427         {
0428             return _mm_xor_ps(self, other);
0429         }
0430         template <class A>
0431         XSIMD_INLINE batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
0432         {
0433             return _mm_xor_ps(self, other);
0434         }
0435         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0436         XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0437         {
0438             return _mm_xor_si128(self, other);
0439         }
0440         template <class A>
0441         XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0442         {
0443             return _mm_xor_pd(self, other);
0444         }
0445         template <class A>
0446         XSIMD_INLINE batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
0447         {
0448             return _mm_xor_pd(self, other);
0449         }
0450         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0451         XSIMD_INLINE batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
0452         {
0453             return _mm_xor_si128(self, other);
0454         }
0455 
0456         // bitwise_cast
0457         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0458         XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
0459         {
0460             return _mm_castsi128_ps(self);
0461         }
0462         template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
0463         XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<sse2>) noexcept
0464         {
0465             return batch<Tp, A>(self.data);
0466         }
0467         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0468         XSIMD_INLINE batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
0469         {
0470             return _mm_castps_si128(self);
0471         }
0472         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0473         XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
0474         {
0475             return _mm_castsi128_pd(self);
0476         }
0477         template <class A>
0478         XSIMD_INLINE batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
0479         {
0480             return _mm_castps_pd(self);
0481         }
0482         template <class A>
0483         XSIMD_INLINE batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
0484         {
0485             return _mm_castpd_ps(self);
0486         }
0487         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0488         XSIMD_INLINE batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
0489         {
0490             return _mm_castpd_si128(self);
0491         }
0492 
0493         // broadcast
0494         template <class A>
0495         XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<sse2>) noexcept
0496         {
0497             return _mm_set1_ps(val);
0498         }
0499         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0500         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<sse2>) noexcept
0501         {
0502             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0503             {
0504                 return _mm_set1_epi8(val);
0505             }
0506             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0507             {
0508                 return _mm_set1_epi16(val);
0509             }
0510             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0511             {
0512                 return _mm_set1_epi32(val);
0513             }
0514             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0515             {
0516                 return _mm_set1_epi64x(val);
0517             }
0518             else
0519             {
0520                 assert(false && "unsupported arch/op combination");
0521                 return {};
0522             }
0523         }
0524         template <class A>
0525         XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<sse2>) noexcept
0526         {
0527             return _mm_set1_pd(val);
0528         }
0529 
0530         // store_complex
0531         namespace detail
0532         {
0533             // Override these methods in SSE-based archs, no need to override store_aligned / store_unaligned
0534             // complex_low
0535             template <class A>
0536             XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
0537             {
0538                 return _mm_unpacklo_ps(self.real(), self.imag());
0539             }
0540             // complex_high
0541             template <class A>
0542             XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
0543             {
0544                 return _mm_unpackhi_ps(self.real(), self.imag());
0545             }
0546             template <class A>
0547             XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
0548             {
0549                 return _mm_unpacklo_pd(self.real(), self.imag());
0550             }
0551             template <class A>
0552             XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
0553             {
0554                 return _mm_unpackhi_pd(self.real(), self.imag());
0555             }
0556         }
0557 
0558         // decr_if
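             // A true lane of batch_bool is all ones, i.e. -1 as an integer, so adding
             // the mask decrements exactly the selected lanes (incr_if below subtracts
             // it for the same reason).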
0559         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0560         XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<sse2>) noexcept
0561         {
0562             return self + batch<T, A>(mask.data);
0563         }
0564 
0565         // div
0566         template <class A>
0567         XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0568         {
0569             return _mm_div_ps(self, other);
0570         }
0571         template <class A>
0572         XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0573         {
0574             return _mm_div_pd(self, other);
0575         }
0576 
0577         // fast_cast
0578         namespace detail
0579         {
0580             template <class A>
0581             XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
0582             {
0583                 return _mm_cvtepi32_ps(self);
0584             }
0585 
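                 // The two 64-bit conversions below exploit the fact that a double
                 // anchored at 2^52 stores a 52-bit integer verbatim in its mantissa,
                 // while one anchored at 2^84 does the same for a value scaled by 2^32:
                 // each input is split into 32-bit halves, ORed into those constants,
                 // and the constants are subtracted back out, giving an exact result
                 // without 64-bit integer arithmetic.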
0586             template <class A>
0587             XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
0588             {
0589                 // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
0590                 // adapted to sse2
0591                 __m128i xH = _mm_srli_epi64(x, 32);
0592                 xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); //  2^84
0593                 __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
0594                 __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); //  2^52
0595                 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); //  2^84 + 2^52
0596                 return _mm_add_pd(f, _mm_castsi128_pd(xL));
0597             }
0598 
0599             template <class A>
0600             XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
0601             {
0602                 // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
0603                 // adapted to sse2
0604                 __m128i xH = _mm_srai_epi32(x, 16);
0605                 xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
0606                 xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); //  3*2^67
0607                 __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
0608                 __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); //  2^52
0609                 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); //  3*2^67 + 2^52
0610                 return _mm_add_pd(f, _mm_castsi128_pd(xL));
0611             }
0612 
0613             template <class A>
0614             XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<sse2>) noexcept
0615             {
0616                 return _mm_cvttps_epi32(self);
0617             }
0618         }
0619 
0620         // eq
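             // _mm_cmpeq_epi64 only arrives with SSE4.1, so the 64-bit integer case
             // below compares 32-bit halves, ANDs each half with its neighbour and
             // broadcasts the resulting sign bit across the whole lane.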
0621         template <class A>
0622         XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0623         {
0624             return _mm_cmpeq_ps(self, other);
0625         }
0626         template <class A>
0627         XSIMD_INLINE batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
0628         {
0629             return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other)));
0630         }
0631         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0632         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0633         {
0634             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0635             {
0636                 return _mm_cmpeq_epi8(self, other);
0637             }
0638             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0639             {
0640                 return _mm_cmpeq_epi16(self, other);
0641             }
0642             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0643             {
0644                 return _mm_cmpeq_epi32(self, other);
0645             }
0646             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0647             {
0648                 __m128i tmp1 = _mm_cmpeq_epi32(self, other);
0649                 __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1);
0650                 __m128i tmp3 = _mm_and_si128(tmp1, tmp2);
0651                 __m128i tmp4 = _mm_srai_epi32(tmp3, 31);
0652                 return _mm_shuffle_epi32(tmp4, 0xF5);
0653             }
0654             else
0655             {
0656                 assert(false && "unsupported arch/op combination");
0657                 return {};
0658             }
0659         }
0660         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0661         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
0662         {
0663             return ~(self != other);
0664         }
0665         template <class A>
0666         XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0667         {
0668             return _mm_cmpeq_pd(self, other);
0669         }
0670         template <class A>
0671         XSIMD_INLINE batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
0672         {
0673             return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other)));
0674         }
0675 
0676         // from_mask
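             // SSE2 offers movemask instructions but nothing in the other direction,
             // so these overloads expand the bit mask back into full lanes through
             // small lookup tables of precomputed lane patterns.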
0677         template <class A>
0678         XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
0679         {
0680             alignas(A::alignment()) static const uint32_t lut[][4] = {
0681                 { 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
0682                 { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 },
0683                 { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 },
0684                 { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 },
0685                 { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 },
0686                 { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 },
0687                 { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
0688                 { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
0689                 { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
0690                 { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF },
0691                 { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
0692                 { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
0693                 { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
0694                 { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
0695                 { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
0696                 { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
0697             };
0698             assert(!(mask & ~0xFul) && "inbound mask");
0699             return _mm_castsi128_ps(_mm_load_si128((const __m128i*)lut[mask]));
0700         }
0701         template <class A>
0702         XSIMD_INLINE batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
0703         {
0704             alignas(A::alignment()) static const uint64_t lut[][4] = {
0705                 { 0x0000000000000000ul, 0x0000000000000000ul },
0706                 { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
0707                 { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
0708                 { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
0709             };
0710             assert(!(mask & ~0x3ul) && "inbound mask");
0711             return _mm_castsi128_pd(_mm_load_si128((const __m128i*)lut[mask]));
0712         }
0713         template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0714         XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
0715         {
0716             alignas(A::alignment()) static const uint64_t lut64[] = {
0717                 0x0000000000000000,
0718                 0x000000000000FFFF,
0719                 0x00000000FFFF0000,
0720                 0x00000000FFFFFFFF,
0721                 0x0000FFFF00000000,
0722                 0x0000FFFF0000FFFF,
0723                 0x0000FFFFFFFF0000,
0724                 0x0000FFFFFFFFFFFF,
0725                 0xFFFF000000000000,
0726                 0xFFFF00000000FFFF,
0727                 0xFFFF0000FFFF0000,
0728                 0xFFFF0000FFFFFFFF,
0729                 0xFFFFFFFF00000000,
0730                 0xFFFFFFFF0000FFFF,
0731                 0xFFFFFFFFFFFF0000,
0732                 0xFFFFFFFFFFFFFFFF,
0733             };
0734             alignas(A::alignment()) static const uint32_t lut32[] = {
0735                 0x00000000,
0736                 0x000000FF,
0737                 0x0000FF00,
0738                 0x0000FFFF,
0739                 0x00FF0000,
0740                 0x00FF00FF,
0741                 0x00FFFF00,
0742                 0x00FFFFFF,
0743                 0xFF000000,
0744                 0xFF0000FF,
0745                 0xFF00FF00,
0746                 0xFF00FFFF,
0747                 0xFFFF0000,
0748                 0xFFFF00FF,
0749                 0xFFFFFF00,
0750                 0xFFFFFFFF,
0751             };
0752             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0753             {
0754                 assert(!(mask & ~0xFFFF) && "inbound mask");
0755                 return _mm_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]);
0756             }
0757             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0758             {
0759                 assert(!(mask & ~0xFF) && "inbound mask");
0760                 return _mm_set_epi64x(lut64[mask >> 4], lut64[mask & 0xF]);
0761             }
0762             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0763             {
0764                 return _mm_castps_si128(from_mask(batch_bool<float, A> {}, mask, sse2 {}));
0765             }
0766             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0767             {
0768                 return _mm_castpd_si128(from_mask(batch_bool<double, A> {}, mask, sse2 {}));
0769             }
0770         }
0771 
0772         // ge
0773         template <class A>
0774         XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0775         {
0776             return _mm_cmpge_ps(self, other);
0777         }
0778         template <class A>
0779         XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0780         {
0781             return _mm_cmpge_pd(self, other);
0782         }
0783 
0784         // gt
0785         template <class A>
0786         XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0787         {
0788             return _mm_cmpgt_ps(self, other);
0789         }
0790         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0791         XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0792         {
0793             if (std::is_signed<T>::value)
0794             {
0795                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0796                 {
0797                     return _mm_cmpgt_epi8(self, other);
0798                 }
0799                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0800                 {
0801                     return _mm_cmpgt_epi16(self, other);
0802                 }
0803                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0804                 {
0805                     return _mm_cmpgt_epi32(self, other);
0806                 }
0807                 else
0808                 {
0809                     return gt(self, other, generic {});
0810                 }
0811             }
0812             else
0813             {
0814                 return gt(self, other, generic {});
0815             }
0816         }
0817 
0818         template <class A>
0819         XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0820         {
0821             return _mm_cmpgt_pd(self, other);
0822         }
0823 
0824         // haddp
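             // haddp returns { sum(row[0]), sum(row[1]), sum(row[2]), sum(row[3]) }.
             // Without SSE3's _mm_hadd_ps this amounts to transposing the 4x4 block
             // with unpack/movelh/movehl and adding the transposed columns.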
0825         template <class A>
0826         XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse2>) noexcept
0827         {
0828             __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]);
0829             __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]);
0830             __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]);
0831             tmp0 = _mm_add_ps(tmp0, tmp1);
0832             tmp1 = _mm_unpacklo_ps(row[2], row[3]);
0833             tmp1 = _mm_add_ps(tmp1, tmp2);
0834             tmp2 = _mm_movehl_ps(tmp1, tmp0);
0835             tmp0 = _mm_movelh_ps(tmp0, tmp1);
0836             return _mm_add_ps(tmp0, tmp2);
0837         }
0838         template <class A>
0839         XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse2>) noexcept
0840         {
0841             return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]),
0842                               _mm_unpackhi_pd(row[0], row[1]));
0843         }
0844 
0845         // incr_if
0846         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0847         XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<sse2>) noexcept
0848         {
0849             return self - batch<T, A>(mask.data);
0850         }
0851 
0852         // insert
0853         template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0854         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse2>) noexcept
0855         {
0856             XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0857             {
0858                 return _mm_insert_epi16(self, val, I);
0859             }
0860             else
0861             {
0862                 return insert(self, val, pos, generic {});
0863             }
0864         }
0865 
0866         // isnan
0867         template <class A>
0868         XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<sse2>) noexcept
0869         {
0870             return _mm_cmpunord_ps(self, self);
0871         }
0872         template <class A>
0873         XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<sse2>) noexcept
0874         {
0875             return _mm_cmpunord_pd(self, self);
0876         }
0877 
0878         // load_aligned
0879         template <class A>
0880         XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
0881         {
0882             return _mm_load_ps(mem);
0883         }
0884         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0885         XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
0886         {
0887             return _mm_load_si128((__m128i const*)mem);
0888         }
0889         template <class A>
0890         XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
0891         {
0892             return _mm_load_pd(mem);
0893         }
0894 
0895         // load_unaligned
0896         template <class A>
0897         XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
0898         {
0899             return _mm_loadu_ps(mem);
0900         }
0901         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0902         XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
0903         {
0904             return _mm_loadu_si128((__m128i const*)mem);
0905         }
0906         template <class A>
0907         XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
0908         {
0909             return _mm_loadu_pd(mem);
0910         }
0911 
0912         // load_complex
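             // xsimd keeps complex batches de-interleaved (one batch of real parts,
             // one of imaginary parts); load_complex performs that de-interleave with
             // shuffles, and detail::complex_low / complex_high above re-interleave
             // the parts for stores.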
0913         namespace detail
0914         {
0915             // Redefine these methods in the SSE-based archs if required
0916             template <class A>
0917             XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<sse2>) noexcept
0918             {
0919                 return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) };
0920             }
0921             template <class A>
0922             XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<sse2>) noexcept
0923             {
0924                 return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) };
0925             }
0926         }
0927 
0928         // le
0929         template <class A>
0930         XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0931         {
0932             return _mm_cmple_ps(self, other);
0933         }
0934         template <class A>
0935         XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0936         {
0937             return _mm_cmple_pd(self, other);
0938         }
0939 
0940         // lt
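             // With no 64-bit compare in SSE2, the signed 64-bit case below derives
             // self < other from the sign of (self & ~other) | (~(self ^ other) & (self - other)),
             // i.e. the sign of the difference corrected for overflow; the unsigned
             // cases flip the sign bit and reuse the signed compares.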
0941         template <class A>
0942         XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0943         {
0944             return _mm_cmplt_ps(self, other);
0945         }
0946         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0947         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0948         {
0949             if (std::is_signed<T>::value)
0950             {
0951                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0952                 {
0953                     return _mm_cmplt_epi8(self, other);
0954                 }
0955                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0956                 {
0957                     return _mm_cmplt_epi16(self, other);
0958                 }
0959                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0960                 {
0961                     return _mm_cmplt_epi32(self, other);
0962                 }
0963                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0964                 {
0965                     __m128i tmp1 = _mm_sub_epi64(self, other);
0966                     __m128i tmp2 = _mm_xor_si128(self, other);
0967                     __m128i tmp3 = _mm_andnot_si128(other, self);
0968                     __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1);
0969                     __m128i tmp5 = _mm_or_si128(tmp3, tmp4);
0970                     __m128i tmp6 = _mm_srai_epi32(tmp5, 31);
0971                     return _mm_shuffle_epi32(tmp6, 0xF5);
0972                 }
0973                 else
0974                 {
0975                     assert(false && "unsupported arch/op combination");
0976                     return {};
0977                 }
0978             }
0979             else
0980             {
0981                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0982                 {
0983                     return _mm_cmplt_epi8(_mm_xor_si128(self, _mm_set1_epi8(std::numeric_limits<int8_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi8(std::numeric_limits<int8_t>::lowest())));
0984                 }
0985                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0986                 {
0987                     return _mm_cmplt_epi16(_mm_xor_si128(self, _mm_set1_epi16(std::numeric_limits<int16_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi16(std::numeric_limits<int16_t>::lowest())));
0988                 }
0989                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0990                 {
0991                     return _mm_cmplt_epi32(_mm_xor_si128(self, _mm_set1_epi32(std::numeric_limits<int32_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi32(std::numeric_limits<int32_t>::lowest())));
0992                 }
0993                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0994                 {
0995                     auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
0996                     auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
0997                     __m128i tmp1 = _mm_sub_epi64(xself, xother);
0998                     __m128i tmp2 = _mm_xor_si128(xself, xother);
0999                     __m128i tmp3 = _mm_andnot_si128(xother, xself);
1000                     __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1);
1001                     __m128i tmp5 = _mm_or_si128(tmp3, tmp4);
1002                     __m128i tmp6 = _mm_srai_epi32(tmp5, 31);
1003                     return _mm_shuffle_epi32(tmp6, 0xF5);
1004                 }
1005                 else
1006                 {
1007                     assert(false && "unsupported arch/op combination");
1008                     return {};
1009                 }
1010             }
1011         }
1012 
1013         template <class A>
1014         XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
1015         {
1016             return _mm_cmplt_pd(self, other);
1017         }
1018 
1019         /* compression table to turn 0b10 into 0b1,
1020          * 0b100010 into 0b101 etc
1021          */
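             // mask() below has no 16-bit movemask to call, so it takes the byte-level
             // _mm_movemask_epi8 result (two identical bits per 16-bit lane) and uses
             // this table to keep every other bit, eight bits of the movemask per lookup.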
1022         namespace detail
1023         {
1024             XSIMD_INLINE int mask_lut(int mask)
1025             {
1026                 // clang-format off
1027                 static const int mask_lut[256] = {
1028                   0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
1029                   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1030                   0x4, 0x0, 0x5, 0x0, 0x0, 0x0, 0x0, 0x0, 0x6, 0x0, 0x7, 0x0, 0x0, 0x0, 0x0, 0x0,
1031                   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1032                   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1033                   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1034                   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1035                   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1036                   0x8, 0x0, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0xA, 0x0, 0xB, 0x0, 0x0, 0x0, 0x0, 0x0,
1037                   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1038                   0xC, 0x0, 0xD, 0x0, 0x0, 0x0, 0x0, 0x0, 0xE, 0x0, 0xF, 0x0, 0x0, 0x0, 0x0, 0x0,
1039                   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1040                   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1041                   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1042                   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1043                   0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1044                 };
1045                 // clang-format on
1046                 return mask_lut[mask & 0xAA];
1047             }
1048         }
1049 
1050         // mask
1051         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1052         XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
1053         {
1054             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1055             {
1056                 return _mm_movemask_epi8(self);
1057             }
1058             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1059             {
1060                 uint64_t mask8 = _mm_movemask_epi8(self);
1061                 return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4);
1062             }
1063             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1064             {
1065                 return _mm_movemask_ps(_mm_castsi128_ps(self));
1066             }
1067             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1068             {
1069                 return _mm_movemask_pd(_mm_castsi128_pd(self));
1070             }
1071             else
1072             {
1073                 assert(false && "unsupported arch/op combination");
1074                 return {};
1075             }
1076         }
1077         template <class A>
1078         XSIMD_INLINE uint64_t mask(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
1079         {
1080             return _mm_movemask_ps(self);
1081         }
1082 
1083         template <class A>
1084         XSIMD_INLINE uint64_t mask(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
1085         {
1086             return _mm_movemask_pd(self);
1087         }
1088 
1089         // max
1090         template <class A>
1091         XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
1092         {
1093             return _mm_max_ps(self, other);
1094         }
1095         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1096         XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
1097         {
1098             return select(self > other, self, other);
1099         }
1100         template <class A>
1101         XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
1102         {
1103             return _mm_max_pd(self, other);
1104         }
1105 
1106         // min
1107         template <class A>
1108         XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
1109         {
1110             return _mm_min_ps(self, other);
1111         }
1112         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1113         XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
1114         {
1115             return select(self <= other, self, other);
1116         }
1117         template <class A>
1118         XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
1119         {
1120             return _mm_min_pd(self, other);
1121         }
1122 
1123         // mul
1124         template <class A>
1125         XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
1126         {
1127             return _mm_mul_ps(self, other);
1128         }
1129         template <class A>
1130         XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
1131         {
1132             return _mm_mul_pd(self, other);
1133         }
1134 
1135         // mul
1136         template <class A>
1137         XSIMD_INLINE batch<int16_t, A> mul(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<sse2>) noexcept
1138         {
1139             return _mm_mullo_epi16(self, other);
1140         }
1141 
1142         // nearbyint_as_int
1143         template <class A>
1144         XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
1145                                                         requires_arch<sse2>) noexcept
1146         {
1147             return _mm_cvtps_epi32(self);
1148         }
1149 
1150         // neg
1151         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1152         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<sse2>) noexcept
1153         {
1154             return 0 - self;
1155         }
1156         template <class A>
1157         XSIMD_INLINE batch<float, A> neg(batch<float, A> const& self, requires_arch<sse2>) noexcept
1158         {
1159             return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
1160         }
1161 
1162         template <class A>
1163         XSIMD_INLINE batch<double, A> neg(batch<double, A> const& self, requires_arch<sse2>) noexcept
1164         {
1165             return _mm_xor_pd(
1166                 self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000)));
1167         }
1168 
1169         // neq
1170         template <class A>
1171         XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
1172         {
1173             return _mm_cmpneq_ps(self, other);
1174         }
1175         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1176         XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
1177         {
1178             return ~(self == other);
1179         }
1180         template <class A>
1181         XSIMD_INLINE batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
1182         {
1183             return _mm_xor_ps(self, other);
1184         }
1185         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1186         XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
1187         {
1188             return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(self.data), _mm_castsi128_ps(other.data)));
1189         }
1190 
1191         template <class A>
1192         XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
1193         {
1194             return _mm_cmpneq_pd(self, other);
1195         }
1196         template <class A>
1197         XSIMD_INLINE batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
1198         {
1199             return _mm_xor_pd(self, other);
1200         }
1201 
1202         // reciprocal
1203         template <class A>
1204         XSIMD_INLINE batch<float, A> reciprocal(batch<float, A> const& self,
1205                                                 kernel::requires_arch<sse2>) noexcept
1206         {
1207             return _mm_rcp_ps(self);
1208         }
1209 
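             // Note: _mm_rcp_ps only returns an approximation (relative error
             // bounded by 1.5 * 2^-12). Where full single precision matters, a
             // single Newton-Raphson step refines the estimate; minimal sketch,
             // refined_reciprocal is a hypothetical helper, not part of this
             // header:
             //
             //     inline __m128 refined_reciprocal(__m128 x) noexcept
             //     {
             //         __m128 r = _mm_rcp_ps(x); // ~12-bit estimate
             //         // r' = r * (2 - x * r)
             //         return _mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(x, r)));
             //     }
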
1210         // reduce_add
1211         template <class A>
1212         XSIMD_INLINE float reduce_add(batch<float, A> const& self, requires_arch<sse2>) noexcept
1213         {
1214             __m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self));
1215             __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
1216             return _mm_cvtss_f32(tmp1);
1217         }
1218 
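             // How the float reduction above works: _mm_movehl_ps copies the
             // two high lanes over the low ones, so the first add folds
             // [v0 v1 v2 v3] into [v0+v2 v1+v3 . .]; the shuffle/_mm_add_ss
             // pair then adds the two partial sums and _mm_cvtss_f32 extracts
             // lane 0. Equivalent scalar view (illustrative only):
             //
             //     float reduce_add_scalar(float const (&v)[4])
             //     {
             //         return (v[0] + v[2]) + (v[1] + v[3]);
             //     }
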
1219         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1220         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept
1221         {
1222             XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1223             {
1224                 __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
1225                 __m128i tmp2 = _mm_add_epi32(self, tmp1);
1226                 __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01);
1227                 __m128i tmp4 = _mm_add_epi32(tmp2, tmp3);
1228                 return _mm_cvtsi128_si32(tmp4);
1229             }
1230             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1231             {
1232                 __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
1233                 __m128i tmp2 = _mm_add_epi64(self, tmp1);
1234 #if defined(__x86_64__)
1235                 return _mm_cvtsi128_si64(tmp2);
1236 #else
1237                 __m128i m;
1238                 _mm_storel_epi64(&m, tmp2);
1239                 int64_t i;
1240                 std::memcpy(&i, &m, sizeof(i));
1241                 return i;
1242 #endif
1243             }
1244             else
1245             {
1246                 return hadd(self, generic {});
1247             }
1248         }
1249 
1250         template <class A>
1251         XSIMD_INLINE double reduce_add(batch<double, A> const& self, requires_arch<sse2>) noexcept
1252         {
1253             return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
1254         }
1255 
1256         // reduce_max
1257         template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
1258         XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<sse2>) noexcept
1259         {
1260             constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
1261             batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
1262             batch<T, A> acc0 = max(self, step0);
1263 
1264             constexpr auto mask1 = detail::shuffle(1, 0, 0, 0);
1265             batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
1266             batch<T, A> acc1 = max(acc0, step1);
1267 
1268             constexpr auto mask2 = detail::shuffle(1, 0, 0, 0);
1269             batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
1270             batch<T, A> acc2 = max(acc1, step2);
1271             if (sizeof(T) == 2)
1272                 return acc2.get(0);
1273             batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
1274             batch<T, A> acc3 = max(acc2, step3);
1275             return acc3.get(0);
1276         }
1277 
1278         // reduce_min
1279         template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
1280         XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<sse2>) noexcept
1281         {
1282             constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
1283             batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
1284             batch<T, A> acc0 = min(self, step0);
1285 
1286             constexpr auto mask1 = detail::shuffle(1, 0, 0, 0);
1287             batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
1288             batch<T, A> acc1 = min(acc0, step1);
1289 
1290             constexpr auto mask2 = detail::shuffle(1, 0, 0, 0);
1291             batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
1292             batch<T, A> acc2 = min(acc1, step2);
1293             if (sizeof(T) == 2)
1294                 return acc2.get(0);
1295             batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
1296             batch<T, A> acc3 = min(acc2, step3);
1297             return acc3.get(0);
1298         }
1299 
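             // The 8/16-bit reductions above repeatedly fold the register in
             // half (64-bit, 32-bit, then 16-bit chunks, plus a final byte
             // fold for 8-bit elements), taking an elementwise max/min at each
             // step so lane 0 ends up holding the result. Illustrative usage,
             // assuming the public xsimd::reduce_max entry point:
             //
             //     xsimd::batch<int16_t, xsimd::sse2> v(3, 9, -1, 7, 0, 2, 5, 4);
             //     int16_t m = xsimd::reduce_max(v); // 9
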
1300         // rsqrt
1301         template <class A>
1302         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
1303         {
1304             return _mm_rsqrt_ps(val);
1305         }
1306         template <class A>
1307         XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
1308         {
1309             return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val)));
1310         }
1311 
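             // Note: _mm_rsqrt_ps is an approximation (about 12 bits), and the
             // double overload above inherits that accuracy because it round
             // trips through single precision. One Newton-Raphson step refines
             // the float estimate; minimal sketch, refined_rsqrt is a
             // hypothetical helper, not part of this header:
             //
             //     inline __m128 refined_rsqrt(__m128 x) noexcept
             //     {
             //         __m128 r = _mm_rsqrt_ps(x); // ~12-bit estimate
             //         // r' = r * (1.5 - 0.5 * x * r * r)
             //         __m128 half_xr2 = _mm_mul_ps(_mm_mul_ps(_mm_set1_ps(0.5f), x), _mm_mul_ps(r, r));
             //         return _mm_mul_ps(r, _mm_sub_ps(_mm_set1_ps(1.5f), half_xr2));
             //     }
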
1312         // select
1313         template <class A>
1314         XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse2>) noexcept
1315         {
1316             return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br));
1317         }
1318 
1319         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1320         XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
1321         {
1322             return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br));
1323         }
1324         template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1325         XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
1326         {
1327             return select(batch_bool<T, A> { Values... }, true_br, false_br, sse2 {});
1328         }
1329         template <class A>
1330         XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse2>) noexcept
1331         {
1332             return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br));
1333         }
1334 
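             // SSE2 has no variable blend instruction (blendv arrives with
             // SSE4.1), so select is expressed bitwise as
             // (cond & true_br) | (~cond & false_br); this is exact because
             // batch_bool lanes are all-ones or all-zeros. Illustrative usage:
             //
             //     xsimd::batch<float, xsimd::sse2> a(1.f, 2.f, 3.f, 4.f);
             //     xsimd::batch<float, xsimd::sse2> b(-1.f, -2.f, -3.f, -4.f);
             //     auto r = xsimd::select(a > b, a, b); // {1.f, 2.f, 3.f, 4.f}
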
1335         // shuffle
1336         template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
1337         XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept
1338         {
1339             constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
1340             // the two low output elements come from x, the high ones from y
1341             if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4)
1342                 return _mm_shuffle_ps(x, y, smask);
1343 
1344             // the two low output elements come from y, the high ones from x: swap operands
1345             if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4)
1346                 return _mm_shuffle_ps(y, x, smask);
1347             return shuffle(x, y, mask, generic {});
1348         }
1349 
1350         template <class A, class ITy, ITy I0, ITy I1>
1351         XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1> mask, requires_arch<sse2>) noexcept
1352         {
1353             constexpr uint32_t smask = detail::mod_shuffle(I0, I1);
1354             // low element from x, high element from y
1355             if (I0 < 2 && I1 >= 2)
1356                 return _mm_shuffle_pd(x, y, smask);
1357 
1358             // low element from y, high element from x: swap operands
1359             if (I0 >= 2 && I1 < 2)
1360                 return _mm_shuffle_pd(y, x, smask);
1361             return shuffle(x, y, mask, generic {});
1362         }
1363 
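             // The immediates built by detail::mod_shuffle follow the standard
             // SSE encoding: for the 4-lane float shuffle each output lane is
             // selected by a 2-bit field (bits 1:0 choose output lane 0,
             // bits 3:2 lane 1, and so on), the same layout as _MM_SHUFFLE;
             // the 2-lane double shuffle uses one bit per output lane.
             // Worked example for _mm_shuffle_ps(x, y, imm) producing
             // {x[1], x[0], y[2], y[3]}:
             //
             //     constexpr uint32_t imm = (3u << 6) | (2u << 4) | (0u << 2) | 1u; // 0xE1
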
1364         // sqrt
1365         template <class A>
1366         XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
1367         {
1368             return _mm_sqrt_ps(val);
1369         }
1370         template <class A>
1371         XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
1372         {
1373             return _mm_sqrt_pd(val);
1374         }
1375 
1376         // slide_left
1377         template <size_t N, class A, class T>
1378         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<sse2>) noexcept
1379         {
1380             return _mm_slli_si128(x, N);
1381         }
1382 
1383         // slide_right
1384         template <size_t N, class A, class T>
1385         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<sse2>) noexcept
1386         {
1387             return _mm_srli_si128(x, N);
1388         }
1389 
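             // slide_left and slide_right shift the whole 128-bit register by
             // N *bytes* (zero-filling), not by N elements, mirroring
             // _mm_slli_si128/_mm_srli_si128. A minimal sketch, assuming the
             // public xsimd::slide_right<N> forwards the byte count unchanged:
             //
             //     xsimd::batch<uint32_t, xsimd::sse2> v(1u, 2u, 3u, 4u);
             //     auto r = xsimd::slide_right<4>(v); // one 32-bit element: {2u, 3u, 4u, 0u}
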
1390         // sadd
1391 
1392         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1393         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
1394         {
1395             if (std::is_signed<T>::value)
1396             {
1397                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1398                 {
1399                     return _mm_adds_epi8(self, other);
1400                 }
1401                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1402                 {
1403                     return _mm_adds_epi16(self, other);
1404                 }
1405                 else
1406                 {
1407                     return sadd(self, other, generic {});
1408                 }
1409             }
1410             else
1411             {
1412                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1413                 {
1414                     return _mm_adds_epu8(self, other);
1415                 }
1416                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1417                 {
1418                     return _mm_adds_epu16(self, other);
1419                 }
1420                 else
1421                 {
1422                     return sadd(self, other, generic {});
1423                 }
1424             }
1425         }
1426 
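             // Saturating add clamps to the representable range instead of
             // wrapping. Illustrative usage, assuming the public xsimd::sadd
             // entry point:
             //
             //     xsimd::batch<int8_t, xsimd::sse2> a(int8_t(120));
             //     xsimd::batch<int8_t, xsimd::sse2> b(int8_t(20));
             //     auto r = xsimd::sadd(a, b); // every lane is 127, not the wrapped -116
             //
             // Only 8- and 16-bit elements have _mm_adds_* instructions; wider
             // elements fall back to the generic implementation above.
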
1427         // set
1428         template <class A, class... Values>
1429         XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<sse2>, Values... values) noexcept
1430         {
1431             static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
1432             return _mm_setr_ps(values...);
1433         }
1434 
1435         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1436         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1) noexcept
1437         {
1438             return _mm_set_epi64x(v1, v0);
1439         }
1440         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1441         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3) noexcept
1442         {
1443             return _mm_setr_epi32(v0, v1, v2, v3);
1444         }
1445         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1446         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
1447         {
1448             return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7);
1449         }
1450         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1451         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
1452         {
1453             return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
1454         }
1455 
1456         template <class A, class... Values>
1457         XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<sse2>, Values... values) noexcept
1458         {
1459             static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
1460             return _mm_setr_pd(values...);
1461         }
1462 
1463         template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1464         XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sse2>, Values... values) noexcept
1465         {
1466             return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
1467         }
1468 
1469         template <class A, class... Values>
1470         XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<sse2>, Values... values) noexcept
1471         {
1472             static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
1473             return _mm_castsi128_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data);
1474         }
1475 
1476         template <class A, class... Values>
1477         XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<sse2>, Values... values) noexcept
1478         {
1479             static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
1480             return _mm_castsi128_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data);
1481         }
1482 
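             // The batch_bool overloads above encode each boolean as an
             // all-ones (-1) or all-zeros lane of the matching integer width;
             // this is the mask convention that the comparison, select and
             // bitwise kernels in this file rely on. For instance (illustrative
             // values):
             //
             //     // batch_bool<float, sse2>(true, false, true, true)
             //     // holds the 32-bit lanes {~0u, 0u, ~0u, ~0u}
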
1483         // ssub
1484 
1485         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1486         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
1487         {
1488             if (std::is_signed<T>::value)
1489             {
1490                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1491                 {
1492                     return _mm_subs_epi8(self, other);
1493                 }
1494                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1495                 {
1496                     return _mm_subs_epi16(self, other);
1497                 }
1498                 else
1499                 {
1500                     return ssub(self, other, generic {});
1501                 }
1502             }
1503             else
1504             {
1505                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1506                 {
1507                     return _mm_subs_epu8(self, other);
1508                 }
1509                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1510                 {
1511                     return _mm_subs_epu16(self, other);
1512                 }
1513                 else
1514                 {
1515                     return ssub(self, other, generic {});
1516                 }
1517             }
1518         }
1519 
1520         // store_aligned
1521         template <class A>
1522         XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
1523         {
1524             return _mm_store_ps(mem, self);
1525         }
1526         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1527         XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
1528         {
1529             return _mm_store_si128((__m128i*)mem, self);
1530         }
1531         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1532         XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
1533         {
1534             return _mm_store_si128((__m128i*)mem, self);
1535         }
1536         template <class A>
1537         XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
1538         {
1539             return _mm_store_pd(mem, self);
1540         }
1541 
1542         // store_unaligned
1543         template <class A>
1544         XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
1545         {
1546             return _mm_storeu_ps(mem, self);
1547         }
1548         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1549         XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
1550         {
1551             return _mm_storeu_si128((__m128i*)mem, self);
1552         }
1553         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1554         XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
1555         {
1556             return _mm_storeu_si128((__m128i*)mem, self);
1557         }
1558         template <class A>
1559         XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
1560         {
1561             return _mm_storeu_pd(mem, self);
1562         }
1563 
1564         // sub
1565         template <class A>
1566         XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
1567         {
1568             return _mm_sub_ps(self, other);
1569         }
1570         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1571         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
1572         {
1573             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1574             {
1575                 return _mm_sub_epi8(self, other);
1576             }
1577             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1578             {
1579                 return _mm_sub_epi16(self, other);
1580             }
1581             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1582             {
1583                 return _mm_sub_epi32(self, other);
1584             }
1585             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1586             {
1587                 return _mm_sub_epi64(self, other);
1588             }
1589             else
1590             {
1591                 assert(false && "unsupported arch/op combination");
1592                 return {};
1593             }
1594         }
1595         template <class A>
1596         XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
1597         {
1598             return _mm_sub_pd(self, other);
1599         }
1600 
1601         // swizzle
1602 
1603         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1604         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
1605         {
1606             constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
1607             return _mm_shuffle_ps(self, self, index);
1608         }
1609 
1610         template <class A, uint64_t V0, uint64_t V1>
1611         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
1612         {
1613             constexpr uint32_t index = detail::shuffle(V0, V1);
1614             return _mm_shuffle_pd(self, self, index);
1615         }
1616 
1617         template <class A, uint64_t V0, uint64_t V1>
1618         XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
1619         {
1620             constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
1621             return _mm_shuffle_epi32(self, index);
1622         }
1623 
1624         template <class A, uint64_t V0, uint64_t V1>
1625         XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1> mask, requires_arch<sse2>) noexcept
1626         {
1627             return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, sse2 {}));
1628         }
1629 
1630         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1631         XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
1632         {
1633             constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
1634             return _mm_shuffle_epi32(self, index);
1635         }
1636 
1637         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1638         XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
1639         {
1640             return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
1641         }
1642 
1643         // transpose
1644         template <class A>
1645         XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept
1646         {
1647             assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
1648             (void)matrix_end;
1649             auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1650             _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
1651             matrix_begin[0] = r0;
1652             matrix_begin[1] = r1;
1653             matrix_begin[2] = r2;
1654             matrix_begin[3] = r3;
1655         }
1656         template <class A>
1657         XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<sse2>) noexcept
1658         {
1659             transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
1660         }
1661         template <class A>
1662         XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<sse2>) noexcept
1663         {
1664             transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
1665         }
1666 
1667         template <class A>
1668         XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<sse2>) noexcept
1669         {
1670             assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
1671             (void)matrix_end;
1672             auto r0 = matrix_begin[0], r1 = matrix_begin[1];
1673             matrix_begin[0] = _mm_unpacklo_pd(r0, r1);
1674             matrix_begin[1] = _mm_unpackhi_pd(r0, r1);
1675         }
1676         template <class A>
1677         XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<sse2>) noexcept
1678         {
1679             transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
1680         }
1681         template <class A>
1682         XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<sse2>) noexcept
1683         {
1684             transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
1685         }
1686 
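             // Illustrative usage of the in-place transpose kernels above,
             // assuming the public xsimd::transpose(first, last) entry point
             // (the row values are example data only):
             //
             //     xsimd::batch<float, xsimd::sse2> rows[4] = {
             //         xsimd::batch<float, xsimd::sse2>( 0.f,  1.f,  2.f,  3.f),
             //         xsimd::batch<float, xsimd::sse2>( 4.f,  5.f,  6.f,  7.f),
             //         xsimd::batch<float, xsimd::sse2>( 8.f,  9.f, 10.f, 11.f),
             //         xsimd::batch<float, xsimd::sse2>(12.f, 13.f, 14.f, 15.f),
             //     };
             //     xsimd::transpose(rows, rows + 4);
             //     // rows[0] is now {0.f, 4.f, 8.f, 12.f}, rows[1] is {1.f, 5.f, 9.f, 13.f}, ...
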
1687         // zip_hi
1688         template <class A>
1689         XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
1690         {
1691             return _mm_unpackhi_ps(self, other);
1692         }
1693         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1694         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
1695         {
1696             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1697             {
1698                 return _mm_unpackhi_epi8(self, other);
1699             }
1700             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1701             {
1702                 return _mm_unpackhi_epi16(self, other);
1703             }
1704             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1705             {
1706                 return _mm_unpackhi_epi32(self, other);
1707             }
1708             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1709             {
1710                 return _mm_unpackhi_epi64(self, other);
1711             }
1712             else
1713             {
1714                 assert(false && "unsupported arch/op combination");
1715                 return {};
1716             }
1717         }
1718         template <class A>
1719         XSIMD_INLINE batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
1720         {
1721             return _mm_unpackhi_pd(self, other);
1722         }
1723 
1724         // zip_lo
1725         template <class A>
1726         XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
1727         {
1728             return _mm_unpacklo_ps(self, other);
1729         }
1730         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1731         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
1732         {
1733             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1734             {
1735                 return _mm_unpacklo_epi8(self, other);
1736             }
1737             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1738             {
1739                 return _mm_unpacklo_epi16(self, other);
1740             }
1741             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1742             {
1743                 return _mm_unpacklo_epi32(self, other);
1744             }
1745             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1746             {
1747                 return _mm_unpacklo_epi64(self, other);
1748             }
1749             else
1750             {
1751                 assert(false && "unsupported arch/op combination");
1752                 return {};
1753             }
1754         }
1755         template <class A>
1756         XSIMD_INLINE batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
1757         {
1758             return _mm_unpacklo_pd(self, other);
1759         }
1760     }
1761 }
1762 
1763 #endif