0012 #ifndef XSIMD_SSE2_HPP
0013 #define XSIMD_SSE2_HPP
0014
0015 #include <complex>
#include <cstring> // std::memcpy, used by the non-x86_64 fallback in reduce_add
0016 #include <limits>
0017 #include <type_traits>
0018
0019 #include "../types/xsimd_sse2_register.hpp"
0020
0021 namespace xsimd
0022 {
0023 template <typename T, class A, bool... Values>
0024 struct batch_bool_constant;
0025
0026 template <class T_out, class T_in, class A>
0027 XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
0028
0029 template <typename T, class A, T... Values>
0030 struct batch_constant;
0031
0032 namespace kernel
0033 {
0034 using namespace types;
0035
0036 namespace detail
0037 {
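// Pack lane indices into an _MM_SHUFFLE-style immediate: bits [1:0]=w, [3:2]=x, [5:4]=y, [7:6]=z
// (the two-argument overload packs two one-bit indices).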
0038 constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
0039 {
0040 return (z << 6) | (y << 4) | (x << 2) | w;
0041 }
0042 constexpr uint32_t shuffle(uint32_t x, uint32_t y)
0043 {
0044 return (y << 1) | x;
0045 }
0046
0047 constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
0048 {
0049 return shuffle(w % 4, x % 4, y % 4, z % 4);
0050 }
0051
0052 constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x)
0053 {
0054 return shuffle(w % 2, x % 2);
0055 }
0056 }
0057
0058
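// Forward declarations of the generic fallbacks used below.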
0059 template <class A, class T, size_t I>
0060 XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
0061 template <class A, typename T, typename ITy, ITy... Indices>
0062 XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
0063 template <class A, class T>
0064 XSIMD_INLINE batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
0065 template <class A, class T>
0066 XSIMD_INLINE batch<T, A> avgr(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
0067
0068
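// abs: clear the IEEE sign bit by AND-NOT with -0.0 (a mask with only the sign bit set).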
0069 template <class A>
0070 XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<sse2>) noexcept
0071 {
0072 __m128d sign_mask = _mm_set1_pd(-0.f);
0073 return _mm_andnot_pd(sign_mask, self);
0074 }
0075 template <class A>
0076 XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<sse2>) noexcept
0077 {
0078 __m128 sign_mask = _mm_set1_ps(-0.f);
0079 return _mm_andnot_ps(sign_mask, self);
0080 }
0081
0082
0083 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0084 XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0085 {
0086 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0087 {
0088 return _mm_add_epi8(self, other);
0089 }
0090 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0091 {
0092 return _mm_add_epi16(self, other);
0093 }
0094 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0095 {
0096 return _mm_add_epi32(self, other);
0097 }
0098 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0099 {
0100 return _mm_add_epi64(self, other);
0101 }
0102 else
0103 {
0104 assert(false && "unsupported arch/op combination");
0105 return {};
0106 }
0107 }
0108
0109 template <class A>
0110 XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0111 {
0112 return _mm_add_ps(self, other);
0113 }
0114
0115 template <class A>
0116 XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0117 {
0118 return _mm_add_pd(self, other);
0119 }
0120
0121
0122 template <class A>
0123 XSIMD_INLINE bool all(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
0124 {
0125 return _mm_movemask_ps(self) == 0x0F;
0126 }
0127 template <class A>
0128 XSIMD_INLINE bool all(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
0129 {
0130 return _mm_movemask_pd(self) == 0x03;
0131 }
0132 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0133 XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
0134 {
0135 return _mm_movemask_epi8(self) == 0xFFFF;
0136 }
0137
0138
0139 template <class A>
0140 XSIMD_INLINE bool any(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
0141 {
0142 return _mm_movemask_ps(self) != 0;
0143 }
0144 template <class A>
0145 XSIMD_INLINE bool any(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
0146 {
0147 return _mm_movemask_pd(self) != 0;
0148 }
0149 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0150 XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
0151 {
0152 return _mm_movemask_epi8(self) != 0;
0153 }
0154
0155
0156 template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
0157 XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0158 {
0159 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0160 {
0161 return _mm_avg_epu8(self, other);
0162 }
0163 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0164 {
0165 return _mm_avg_epu16(self, other);
0166 }
0167 else
0168 {
0169 return avgr(self, other, generic {});
0170 }
0171 }
0172
0173
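// avg (truncating average): avgr rounds the half-way case up, so subtract the correction bit
// (self ^ other) & 1, extracted here with a shift-up/shift-down pair.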
0174 template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
0175 XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0176 {
0177 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0178 {
0179 auto adj = ((self ^ other) << 7) >> 7;
0180 return avgr(self, other, A {}) - adj;
0181 }
0182 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0183 {
0184 auto adj = ((self ^ other) << 15) >> 15;
0185 return avgr(self, other, A {}) - adj;
0186 }
0187 else
0188 {
0189 return avg(self, other, generic {});
0190 }
0191 }
0192
0193
0194 template <class A, class T_out, class T_in>
0195 XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<sse2>) noexcept
0196 {
0197 return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
0198 }
0199
0200
0201 template <class A>
0202 XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0203 {
0204 return _mm_and_ps(self, other);
0205 }
0206 template <class A>
0207 XSIMD_INLINE batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
0208 {
0209 return _mm_and_ps(self, other);
0210 }
0211 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0212 XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0213 {
0214 return _mm_and_si128(self, other);
0215 }
0216 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0217 XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
0218 {
0219 return _mm_and_si128(self, other);
0220 }
0221
0222 template <class A>
0223 XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0224 {
0225 return _mm_and_pd(self, other);
0226 }
0227
0228 template <class A>
0229 XSIMD_INLINE batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
0230 {
0231 return _mm_and_pd(self, other);
0232 }
0233
0234
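// bitwise_andnot computes self & ~other; the operands are swapped because _mm_andnot_* negates its first argument.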
0235 template <class A>
0236 XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0237 {
0238 return _mm_andnot_ps(other, self);
0239 }
0240
0241 template <class A>
0242 XSIMD_INLINE batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
0243 {
0244 return _mm_andnot_ps(other, self);
0245 }
0246 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0247 XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0248 {
0249 return _mm_andnot_si128(other, self);
0250 }
0251 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0252 XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
0253 {
0254 return _mm_andnot_si128(other, self);
0255 }
0256
0257 template <class A>
0258 XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0259 {
0260 return _mm_andnot_pd(other, self);
0261 }
0262
0263 template <class A>
0264 XSIMD_INLINE batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
0265 {
0266 return _mm_andnot_pd(other, self);
0267 }
0268
0269
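// bitwise_lshift: SSE2 has no 8-bit shift, so bytes are shifted as 32-bit lanes and the bits that
// crossed a byte boundary are masked off.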
0270 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0271 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
0272 {
0273 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0274 {
0275 return _mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other));
0276 }
0277 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0278 {
0279 return _mm_slli_epi16(self, other);
0280 }
0281 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0282 {
0283 return _mm_slli_epi32(self, other);
0284 }
0285 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0286 {
0287 return _mm_slli_epi64(self, other);
0288 }
0289 else
0290 {
0291 assert(false && "unsupported arch/op combination");
0292 return {};
0293 }
0294 }
0295
0296
0297 template <class A>
0298 XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<sse2>) noexcept
0299 {
0300 return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1)));
0301 }
0302 template <class A>
0303 XSIMD_INLINE batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
0304 {
0305 return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1)));
0306 }
0307 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0308 XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<sse2>) noexcept
0309 {
0310 return _mm_xor_si128(self, _mm_set1_epi32(-1));
0311 }
0312 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0313 XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
0314 {
0315 return _mm_xor_si128(self, _mm_set1_epi32(-1));
0316 }
0317 template <class A>
0318 XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<sse2>) noexcept
0319 {
0320 return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1)));
0321 }
0322 template <class A>
0323 XSIMD_INLINE batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
0324 {
0325 return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1)));
0326 }
0327
0328
0329 template <class A>
0330 XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0331 {
0332 return _mm_or_ps(self, other);
0333 }
0334 template <class A>
0335 XSIMD_INLINE batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
0336 {
0337 return _mm_or_ps(self, other);
0338 }
0339 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0340 XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0341 {
0342 return _mm_or_si128(self, other);
0343 }
0344 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0345 XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
0346 {
0347 return _mm_or_si128(self, other);
0348 }
0349
0350 template <class A>
0351 XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0352 {
0353 return _mm_or_pd(self, other);
0354 }
0355
0356 template <class A>
0357 XSIMD_INLINE batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
0358 {
0359 return _mm_or_pd(self, other);
0360 }
0361
0362
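// bitwise_rshift: SSE2 has no 8-bit shifts and no 64-bit arithmetic shift; the 8-bit case re-injects
// the sign via a compare mask, and the 64-bit signed case combines a logical shift with replicated sign bits.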
0363 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0364 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
0365 {
0366 if (std::is_signed<T>::value)
0367 {
0368 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0369 {
0370 __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF);
0371 __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self);
0372 __m128i res = _mm_srai_epi16(self, other);
0373 return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res));
0374 }
0375 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0376 {
0377 return _mm_srai_epi16(self, other);
0378 }
0379 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0380 {
0381 return _mm_srai_epi32(self, other);
0382 }
0383 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0384 {
0385
0386 return _mm_or_si128(
0387 _mm_srli_epi64(self, other),
0388 _mm_slli_epi64(
0389 _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32),
0390 64 - other));
0391 }
0392 else
0393 {
0394 assert(false && "unsupported arch/op combination");
0395 return {};
0396 }
0397 }
0398 else
0399 {
0400 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0401 {
0402 return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other));
0403 }
0404 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0405 {
0406 return _mm_srli_epi16(self, other);
0407 }
0408 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0409 {
0410 return _mm_srli_epi32(self, other);
0411 }
0412 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0413 {
0414 return _mm_srli_epi64(self, other);
0415 }
0416 else
0417 {
0418 assert(false && "unsupported arch/op combination");
0419 return {};
0420 }
0421 }
0422 }
0423
0424
0425 template <class A>
0426 XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0427 {
0428 return _mm_xor_ps(self, other);
0429 }
0430 template <class A>
0431 XSIMD_INLINE batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
0432 {
0433 return _mm_xor_ps(self, other);
0434 }
0435 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0436 XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0437 {
0438 return _mm_xor_si128(self, other);
0439 }
0440 template <class A>
0441 XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0442 {
0443 return _mm_xor_pd(self, other);
0444 }
0445 template <class A>
0446 XSIMD_INLINE batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
0447 {
0448 return _mm_xor_pd(self, other);
0449 }
0450 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0451 XSIMD_INLINE batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
0452 {
0453 return _mm_xor_si128(self, other);
0454 }
0455
0456
0457 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0458 XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
0459 {
0460 return _mm_castsi128_ps(self);
0461 }
0462 template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
0463 XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<sse2>) noexcept
0464 {
0465 return batch<Tp, A>(self.data);
0466 }
0467 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0468 XSIMD_INLINE batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
0469 {
0470 return _mm_castps_si128(self);
0471 }
0472 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0473 XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
0474 {
0475 return _mm_castsi128_pd(self);
0476 }
0477 template <class A>
0478 XSIMD_INLINE batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
0479 {
0480 return _mm_castps_pd(self);
0481 }
0482 template <class A>
0483 XSIMD_INLINE batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
0484 {
0485 return _mm_castpd_ps(self);
0486 }
0487 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0488 XSIMD_INLINE batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
0489 {
0490 return _mm_castpd_si128(self);
0491 }
0492
0493
0494 template <class A>
0495 XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<sse2>) noexcept
0496 {
0497 return _mm_set1_ps(val);
0498 }
0499 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0500 XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<sse2>) noexcept
0501 {
0502 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0503 {
0504 return _mm_set1_epi8(val);
0505 }
0506 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0507 {
0508 return _mm_set1_epi16(val);
0509 }
0510 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0511 {
0512 return _mm_set1_epi32(val);
0513 }
0514 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0515 {
0516 return _mm_set1_epi64x(val);
0517 }
0518 else
0519 {
0520 assert(false && "unsupported arch/op combination");
0521 return {};
0522 }
0523 }
0524 template <class A>
0525 XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<sse2>) noexcept
0526 {
0527 return _mm_set1_pd(val);
0528 }
0529
0530
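// complex_low / complex_high: interleave the real and imaginary parts; _low yields the first half of
// the interleaved sequence, _high the second half.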
0531 namespace detail
0532 {
0533
0534
0535 template <class A>
0536 XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
0537 {
0538 return _mm_unpacklo_ps(self.real(), self.imag());
0539 }
0540
0541 template <class A>
0542 XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
0543 {
0544 return _mm_unpackhi_ps(self.real(), self.imag());
0545 }
0546 template <class A>
0547 XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
0548 {
0549 return _mm_unpacklo_pd(self.real(), self.imag());
0550 }
0551 template <class A>
0552 XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
0553 {
0554 return _mm_unpackhi_pd(self.real(), self.imag());
0555 }
0556 }
0557
0558
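// decr_if: a true lane of the mask is all ones, i.e. -1, so adding the mask decrements the selected lanes.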
0559 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0560 XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<sse2>) noexcept
0561 {
0562 return self + batch<T, A>(mask.data);
0563 }
0564
0565
0566 template <class A>
0567 XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0568 {
0569 return _mm_div_ps(self, other);
0570 }
0571 template <class A>
0572 XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0573 {
0574 return _mm_div_pd(self, other);
0575 }
0576
0577
0578 namespace detail
0579 {
0580 template <class A>
0581 XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
0582 {
0583 return _mm_cvtepi32_ps(self);
0584 }
0585
0586 template <class A>
0587 XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
0588 {
0589
0590
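// Pack the two 32-bit halves into the mantissas of biased doubles, then remove the bias:
// 19342813113834066795298816. is 2^84 (mantissa ULP 2^32, receives the high half),
// 0x0010000000000000 is 2^52 (mantissa ULP 1, receives the low half),
// and 19342813118337666422669312. is the combined bias 2^84 + 2^52.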
0591 __m128i xH = _mm_srli_epi64(x, 32);
0592 xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));
0593 __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
0594 __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000))));
0595 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.));
0596 return _mm_add_pd(f, _mm_castsi128_pd(xL));
0597 }
0598
0599 template <class A>
0600 XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
0601 {
0602
0603
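// Same mantissa-packing trick as the unsigned conversion above, with a bias that absorbs the sign:
// 442721857769029238784. is 3*2^67 and 442726361368656609280. is 3*2^67 + 2^52.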
0604 __m128i xH = _mm_srai_epi32(x, 16);
0605 xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
0606 xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.)));
0607 __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
0608 __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000))));
0609 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.));
0610 return _mm_add_pd(f, _mm_castsi128_pd(xL));
0611 }
0612
0613 template <class A>
0614 XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<sse2>) noexcept
0615 {
0616 return _mm_cvttps_epi32(self);
0617 }
0618 }
0619
0620
0621 template <class A>
0622 XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0623 {
0624 return _mm_cmpeq_ps(self, other);
0625 }
0626 template <class A>
0627 XSIMD_INLINE batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
0628 {
0629 return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other)));
0630 }
0631 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0632 XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0633 {
0634 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0635 {
0636 return _mm_cmpeq_epi8(self, other);
0637 }
0638 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0639 {
0640 return _mm_cmpeq_epi16(self, other);
0641 }
0642 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0643 {
0644 return _mm_cmpeq_epi32(self, other);
0645 }
0646 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0647 {
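// No _mm_cmpeq_epi64 before SSE4.1: compare the 32-bit halves, AND each half with its swapped
// neighbour, then broadcast the sign across each 64-bit lane.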
0648 __m128i tmp1 = _mm_cmpeq_epi32(self, other);
0649 __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1);
0650 __m128i tmp3 = _mm_and_si128(tmp1, tmp2);
0651 __m128i tmp4 = _mm_srai_epi32(tmp3, 31);
0652 return _mm_shuffle_epi32(tmp4, 0xF5);
0653 }
0654 else
0655 {
0656 assert(false && "unsupported arch/op combination");
0657 return {};
0658 }
0659 }
0660 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0661 XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
0662 {
0663 return ~(self != other);
0664 }
0665 template <class A>
0666 XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0667 {
0668 return _mm_cmpeq_pd(self, other);
0669 }
0670 template <class A>
0671 XSIMD_INLINE batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
0672 {
0673 return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other)));
0674 }
0675
0676
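// from_mask: expand an integer bit mask (one bit per lane) into full-width lane masks through aligned lookup tables.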
0677 template <class A>
0678 XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
0679 {
0680 alignas(A::alignment()) static const uint32_t lut[][4] = {
0681 { 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
0682 { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 },
0683 { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 },
0684 { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 },
0685 { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 },
0686 { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 },
0687 { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
0688 { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
0689 { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
0690 { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF },
0691 { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
0692 { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
0693 { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
0694 { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
0695 { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
0696 { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
0697 };
0698 assert(!(mask & ~0xFul) && "inbound mask");
0699 return _mm_castsi128_ps(_mm_load_si128((const __m128i*)lut[mask]));
0700 }
0701 template <class A>
0702 XSIMD_INLINE batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
0703 {
0704 alignas(A::alignment()) static const uint64_t lut[][2] = {
0705 { 0x0000000000000000ul, 0x0000000000000000ul },
0706 { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
0707 { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
0708 { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
0709 };
0710 assert(!(mask & ~0x3ul) && "inbound mask");
0711 return _mm_castsi128_pd(_mm_load_si128((const __m128i*)lut[mask]));
0712 }
0713 template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0714 XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
0715 {
0716 alignas(A::alignment()) static const uint64_t lut64[] = {
0717 0x0000000000000000,
0718 0x000000000000FFFF,
0719 0x00000000FFFF0000,
0720 0x00000000FFFFFFFF,
0721 0x0000FFFF00000000,
0722 0x0000FFFF0000FFFF,
0723 0x0000FFFFFFFF0000,
0724 0x0000FFFFFFFFFFFF,
0725 0xFFFF000000000000,
0726 0xFFFF00000000FFFF,
0727 0xFFFF0000FFFF0000,
0728 0xFFFF0000FFFFFFFF,
0729 0xFFFFFFFF00000000,
0730 0xFFFFFFFF0000FFFF,
0731 0xFFFFFFFFFFFF0000,
0732 0xFFFFFFFFFFFFFFFF,
0733 };
0734 alignas(A::alignment()) static const uint32_t lut32[] = {
0735 0x00000000,
0736 0x000000FF,
0737 0x0000FF00,
0738 0x0000FFFF,
0739 0x00FF0000,
0740 0x00FF00FF,
0741 0x00FFFF00,
0742 0x00FFFFFF,
0743 0xFF000000,
0744 0xFF0000FF,
0745 0xFF00FF00,
0746 0xFF00FFFF,
0747 0xFFFF0000,
0748 0xFFFF00FF,
0749 0xFFFFFF00,
0750 0xFFFFFFFF,
0751 };
0752 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0753 {
0754 assert(!(mask & ~0xFFFF) && "inbound mask");
0755 return _mm_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]);
0756 }
0757 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0758 {
0759 assert(!(mask & ~0xFF) && "inbound mask");
0760 return _mm_set_epi64x(lut64[mask >> 4], lut64[mask & 0xF]);
0761 }
0762 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0763 {
0764 return _mm_castps_si128(from_mask(batch_bool<float, A> {}, mask, sse2 {}));
0765 }
0766 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0767 {
0768 return _mm_castpd_si128(from_mask(batch_bool<double, A> {}, mask, sse2 {}));
0769 }
0770 }
0771
0772
0773 template <class A>
0774 XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0775 {
0776 return _mm_cmpge_ps(self, other);
0777 }
0778 template <class A>
0779 XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0780 {
0781 return _mm_cmpge_pd(self, other);
0782 }
0783
0784
0785 template <class A>
0786 XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0787 {
0788 return _mm_cmpgt_ps(self, other);
0789 }
0790 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0791 XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0792 {
0793 if (std::is_signed<T>::value)
0794 {
0795 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0796 {
0797 return _mm_cmpgt_epi8(self, other);
0798 }
0799 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0800 {
0801 return _mm_cmpgt_epi16(self, other);
0802 }
0803 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0804 {
0805 return _mm_cmpgt_epi32(self, other);
0806 }
0807 else
0808 {
0809 return gt(self, other, generic {});
0810 }
0811 }
0812 else
0813 {
0814 return gt(self, other, generic {});
0815 }
0816 }
0817
0818 template <class A>
0819 XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0820 {
0821 return _mm_cmpgt_pd(self, other);
0822 }
0823
0824
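// haddp: horizontal add across four rows; lane i of the result holds the sum of the four elements of row[i].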
0825 template <class A>
0826 XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse2>) noexcept
0827 {
0828 __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]);
0829 __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]);
0830 __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]);
0831 tmp0 = _mm_add_ps(tmp0, tmp1);
0832 tmp1 = _mm_unpacklo_ps(row[2], row[3]);
0833 tmp1 = _mm_add_ps(tmp1, tmp2);
0834 tmp2 = _mm_movehl_ps(tmp1, tmp0);
0835 tmp0 = _mm_movelh_ps(tmp0, tmp1);
0836 return _mm_add_ps(tmp0, tmp2);
0837 }
0838 template <class A>
0839 XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse2>) noexcept
0840 {
0841 return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]),
0842 _mm_unpackhi_pd(row[0], row[1]));
0843 }
0844
0845
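// incr_if: a true lane of the mask is all ones, i.e. -1, so subtracting the mask increments the selected lanes.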
0846 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0847 XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<sse2>) noexcept
0848 {
0849 return self - batch<T, A>(mask.data);
0850 }
0851
0852
0853 template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0854 XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse2>) noexcept
0855 {
0856 XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0857 {
0858 return _mm_insert_epi16(self, val, I);
0859 }
0860 else
0861 {
0862 return insert(self, val, pos, generic {});
0863 }
0864 }
0865
0866
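// isnan: NaN is the only value unordered with itself, hence cmpunord(self, self).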
0867 template <class A>
0868 XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<sse2>) noexcept
0869 {
0870 return _mm_cmpunord_ps(self, self);
0871 }
0872 template <class A>
0873 XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<sse2>) noexcept
0874 {
0875 return _mm_cmpunord_pd(self, self);
0876 }
0877
0878
0879 template <class A>
0880 XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
0881 {
0882 return _mm_load_ps(mem);
0883 }
0884 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0885 XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
0886 {
0887 return _mm_load_si128((__m128i const*)mem);
0888 }
0889 template <class A>
0890 XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
0891 {
0892 return _mm_load_pd(mem);
0893 }
0894
0895
0896 template <class A>
0897 XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
0898 {
0899 return _mm_loadu_ps(mem);
0900 }
0901 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0902 XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
0903 {
0904 return _mm_loadu_si128((__m128i const*)mem);
0905 }
0906 template <class A>
0907 XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
0908 {
0909 return _mm_loadu_pd(mem);
0910 }
0911
0912
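// load_complex: de-interleave (real, imag) pairs with even/odd index shuffles.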
0913 namespace detail
0914 {
0915
0916 template <class A>
0917 XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<sse2>) noexcept
0918 {
0919 return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) };
0920 }
0921 template <class A>
0922 XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<sse2>) noexcept
0923 {
0924 return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) };
0925 }
0926 }
0927
0928
0929 template <class A>
0930 XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0931 {
0932 return _mm_cmple_ps(self, other);
0933 }
0934 template <class A>
0935 XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
0936 {
0937 return _mm_cmple_pd(self, other);
0938 }
0939
0940
0941 template <class A>
0942 XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
0943 {
0944 return _mm_cmplt_ps(self, other);
0945 }
0946 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0947 XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
0948 {
0949 if (std::is_signed<T>::value)
0950 {
0951 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0952 {
0953 return _mm_cmplt_epi8(self, other);
0954 }
0955 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0956 {
0957 return _mm_cmplt_epi16(self, other);
0958 }
0959 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0960 {
0961 return _mm_cmplt_epi32(self, other);
0962 }
0963 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0964 {
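// 64-bit signed less-than without SSE4.2: where the signs differ, self < other iff self is negative
// (self & ~other); where they agree, use the sign of self - other; the resulting sign bit is then
// broadcast over each 64-bit lane.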
0965 __m128i tmp1 = _mm_sub_epi64(self, other);
0966 __m128i tmp2 = _mm_xor_si128(self, other);
0967 __m128i tmp3 = _mm_andnot_si128(other, self);
0968 __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1);
0969 __m128i tmp5 = _mm_or_si128(tmp3, tmp4);
0970 __m128i tmp6 = _mm_srai_epi32(tmp5, 31);
0971 return _mm_shuffle_epi32(tmp6, 0xF5);
0972 }
0973 else
0974 {
0975 assert(false && "unsupported arch/op combination");
0976 return {};
0977 }
0978 }
0979 else
0980 {
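// Unsigned comparison: flip the sign bit (XOR with the smallest signed value) and reuse the signed comparisons.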
0981 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0982 {
0983 return _mm_cmplt_epi8(_mm_xor_si128(self, _mm_set1_epi8(std::numeric_limits<int8_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi8(std::numeric_limits<int8_t>::lowest())));
0984 }
0985 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0986 {
0987 return _mm_cmplt_epi16(_mm_xor_si128(self, _mm_set1_epi16(std::numeric_limits<int16_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi16(std::numeric_limits<int16_t>::lowest())));
0988 }
0989 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0990 {
0991 return _mm_cmplt_epi32(_mm_xor_si128(self, _mm_set1_epi32(std::numeric_limits<int32_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi32(std::numeric_limits<int32_t>::lowest())));
0992 }
0993 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0994 {
0995 auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
0996 auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
0997 __m128i tmp1 = _mm_sub_epi64(xself, xother);
0998 __m128i tmp2 = _mm_xor_si128(xself, xother);
0999 __m128i tmp3 = _mm_andnot_si128(xother, xself);
1000 __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1);
1001 __m128i tmp5 = _mm_or_si128(tmp3, tmp4);
1002 __m128i tmp6 = _mm_srai_epi32(tmp5, 31);
1003 return _mm_shuffle_epi32(tmp6, 0xF5);
1004 }
1005 else
1006 {
1007 assert(false && "unsupported arch/op combination");
1008 return {};
1009 }
1010 }
1011 }
1012
1013 template <class A>
1014 XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
1015 {
1016 return _mm_cmplt_pd(self, other);
1017 }
1018
1019
1020
1021
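// mask: _mm_movemask_epi8 yields one bit per byte; for 16-bit lanes the lookup table below compresses
// the two bits of each lane (keeping only the high-byte bits, hence & 0xAA) into a single bit.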
1022 namespace detail
1023 {
1024 XSIMD_INLINE int mask_lut(int mask)
1025 {
1026
1027 static const int mask_lut[256] = {
1028 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
1029 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1030 0x4, 0x0, 0x5, 0x0, 0x0, 0x0, 0x0, 0x0, 0x6, 0x0, 0x7, 0x0, 0x0, 0x0, 0x0, 0x0,
1031 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1032 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1033 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1034 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1035 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1036 0x8, 0x0, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0xA, 0x0, 0xB, 0x0, 0x0, 0x0, 0x0, 0x0,
1037 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1038 0xC, 0x0, 0xD, 0x0, 0x0, 0x0, 0x0, 0x0, 0xE, 0x0, 0xF, 0x0, 0x0, 0x0, 0x0, 0x0,
1039 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1040 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1041 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1042 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1043 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
1044 };
1045
1046 return mask_lut[mask & 0xAA];
1047 }
1048 }
1049
1050
1051 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1052 XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
1053 {
1054 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1055 {
1056 return _mm_movemask_epi8(self);
1057 }
1058 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1059 {
1060 uint64_t mask8 = _mm_movemask_epi8(self);
1061 return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4);
1062 }
1063 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1064 {
1065 return _mm_movemask_ps(_mm_castsi128_ps(self));
1066 }
1067 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1068 {
1069 return _mm_movemask_pd(_mm_castsi128_pd(self));
1070 }
1071 else
1072 {
1073 assert(false && "unsupported arch/op combination");
1074 return {};
1075 }
1076 }
1077 template <class A>
1078 XSIMD_INLINE uint64_t mask(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
1079 {
1080 return _mm_movemask_ps(self);
1081 }
1082
1083 template <class A>
1084 XSIMD_INLINE uint64_t mask(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
1085 {
1086 return _mm_movemask_pd(self);
1087 }
1088
1089
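// max (and min below): the float and double overloads map to native instructions; the integer
// overloads fall back to a compare-and-select.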
1090 template <class A>
1091 XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
1092 {
1093 return _mm_max_ps(self, other);
1094 }
1095 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1096 XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
1097 {
1098 return select(self > other, self, other);
1099 }
1100 template <class A>
1101 XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
1102 {
1103 return _mm_max_pd(self, other);
1104 }
1105
1106
1107 template <class A>
1108 XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
1109 {
1110 return _mm_min_ps(self, other);
1111 }
1112 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1113 XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
1114 {
1115 return select(self <= other, self, other);
1116 }
1117 template <class A>
1118 XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
1119 {
1120 return _mm_min_pd(self, other);
1121 }
1122
1123
1124 template <class A>
1125 XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
1126 {
1127 return _mm_mul_ps(self, other);
1128 }
1129 template <class A>
1130 XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
1131 {
1132 return _mm_mul_pd(self, other);
1133 }
1134
1135
1136 template <class A>
1137 XSIMD_INLINE batch<int16_t, A> mul(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<sse2>) noexcept
1138 {
1139 return _mm_mullo_epi16(self, other);
1140 }
1141
1142
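// nearbyint_as_int: _mm_cvtps_epi32 rounds according to the current rounding mode (round-to-nearest-even by default).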
1143 template <class A>
1144 XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
1145 requires_arch<sse2>) noexcept
1146 {
1147 return _mm_cvtps_epi32(self);
1148 }
1149
1150
1151 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1152 XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<sse2>) noexcept
1153 {
1154 return 0 - self;
1155 }
1156 template <class A>
1157 XSIMD_INLINE batch<float, A> neg(batch<float, A> const& self, requires_arch<sse2>) noexcept
1158 {
1159 return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
1160 }
1161
1162 template <class A>
1163 XSIMD_INLINE batch<double, A> neg(batch<double, A> const& self, requires_arch<sse2>) noexcept
1164 {
1165 return _mm_xor_pd(
1166 self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000)));
1167 }
1168
1169
1170 template <class A>
1171 XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
1172 {
1173 return _mm_cmpneq_ps(self, other);
1174 }
1175 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1176 XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
1177 {
1178 return ~(self == other);
1179 }
1180 template <class A>
1181 XSIMD_INLINE batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
1182 {
1183 return _mm_xor_ps(self, other);
1184 }
1185 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1186 XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
1187 {
1188 return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(self.data), _mm_castsi128_ps(other.data)));
1189 }
1190
1191 template <class A>
1192 XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
1193 {
1194 return _mm_cmpneq_pd(self, other);
1195 }
1196 template <class A>
1197 XSIMD_INLINE batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
1198 {
1199 return _mm_xor_pd(self, other);
1200 }
1201
1202
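// reciprocal: _mm_rcp_ps is a fast approximation (relative error bounded by roughly 1.5 * 2^-12), not an exact division.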
1203 template <class A>
1204 XSIMD_INLINE batch<float, A> reciprocal(batch<float, A> const& self,
1205 kernel::requires_arch<sse2>)
1206 {
1207 return _mm_rcp_ps(self);
1208 }
1209
1210
1211 template <class A>
1212 XSIMD_INLINE float reduce_add(batch<float, A> const& self, requires_arch<sse2>) noexcept
1213 {
1214 __m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self));
1215 __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
1216 return _mm_cvtss_f32(tmp1);
1217 }
1218
1219 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1220 XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept
1221 {
1222 XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1223 {
1224 __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
1225 __m128i tmp2 = _mm_add_epi32(self, tmp1);
1226 __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01);
1227 __m128i tmp4 = _mm_add_epi32(tmp2, tmp3);
1228 return _mm_cvtsi128_si32(tmp4);
1229 }
1230 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1231 {
1232 __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
1233 __m128i tmp2 = _mm_add_epi64(self, tmp1);
1234 #if defined(__x86_64__)
1235 return _mm_cvtsi128_si64(tmp2);
1236 #else
1237 __m128i m;
1238 _mm_storel_epi64(&m, tmp2);
1239 int64_t i;
1240 std::memcpy(&i, &m, sizeof(i));
1241 return i;
1242 #endif
1243 }
1244 else
1245 {
1246 return hadd(self, generic {});
1247 }
1248 }
1249
1250 template <class A>
1251 XSIMD_INLINE double reduce_add(batch<double, A> const& self, requires_arch<sse2>) noexcept
1252 {
1253 return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
1254 }
1255
1256
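// reduce_max for 8/16-bit integers: repeatedly fold the upper half of the register onto the lower half
// (64-bit, 32-bit, 16-bit, then a final 8-bit step for byte types), taking the max at each step.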
1257 template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
1258 XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<sse2>) noexcept
1259 {
1260 constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
1261 batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
1262 batch<T, A> acc0 = max(self, step0);
1263
1264 constexpr auto mask1 = detail::shuffle(1, 0, 0, 0);
1265 batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
1266 batch<T, A> acc1 = max(acc0, step1);
1267
1268 constexpr auto mask2 = detail::shuffle(1, 0, 0, 0);
1269 batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
1270 batch<T, A> acc2 = max(acc1, step2);
1271 if (sizeof(T) == 2)
1272 return acc2.get(0);
1273 batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
1274 batch<T, A> acc3 = max(acc2, step3);
1275 return acc3.get(0);
1276 }
1277
1278
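// reduce_min: same folding scheme as reduce_max above.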
1279 template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
1280 XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<sse2>) noexcept
1281 {
1282 constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
1283 batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
1284 batch<T, A> acc0 = min(self, step0);
1285
1286 constexpr auto mask1 = detail::shuffle(1, 0, 0, 0);
1287 batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
1288 batch<T, A> acc1 = min(acc0, step1);
1289
1290 constexpr auto mask2 = detail::shuffle(1, 0, 0, 0);
1291 batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
1292 batch<T, A> acc2 = min(acc1, step2);
1293 if (sizeof(T) == 2)
1294 return acc2.get(0);
1295 batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
1296 batch<T, A> acc3 = min(acc2, step3);
1297 return acc3.get(0);
1298 }
1299
1300
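// rsqrt for double goes through the single-precision approximation and inherits its reduced accuracy.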
1301 template <class A>
1302 XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
1303 {
1304 return _mm_rsqrt_ps(val);
1305 }
1306 template <class A>
1307 XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
1308 {
1309 return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val)));
1310 }
1311
1312
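// select: blend as (cond & true_br) | (~cond & false_br); SSE2 has no blend instruction.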
1313 template <class A>
1314 XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse2>) noexcept
1315 {
1316 return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br));
1317 }
1318
1319 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1320 XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
1321 {
1322 return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br));
1323 }
1324 template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1325 XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
1326 {
1327 return select(batch_bool<T, A> { Values... }, true_br, false_br, sse2 {});
1328 }
1329 template <class A>
1330 XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse2>) noexcept
1331 {
1332 return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br));
1333 }
1334
1335
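// shuffle: a single _mm_shuffle_ps / _mm_shuffle_pd only covers the patterns where the low indices pick
// from one operand and the high indices from the other; anything else falls back to the generic implementation.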
1336 template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
1337 XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept
1338 {
1339 constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
1340
1341 if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4)
1342 return _mm_shuffle_ps(x, y, smask);
1343
1344
1345 if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4)
1346 return _mm_shuffle_ps(y, x, smask);
1347 return shuffle(x, y, mask, generic {});
1348 }
1349
1350 template <class A, class ITy, ITy I0, ITy I1>
1351 XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1> mask, requires_arch<sse2>) noexcept
1352 {
1353 constexpr uint32_t smask = detail::mod_shuffle(I0, I1);
1354
1355 if (I0 < 2 && I1 >= 2)
1356 return _mm_shuffle_pd(x, y, smask);
1357
1358
1359 if (I0 >= 2 && I1 < 2)
1360 return _mm_shuffle_pd(y, x, smask);
1361 return shuffle(x, y, mask, generic {});
1362 }
1363
1364
1365 template <class A>
1366 XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
1367 {
1368 return _mm_sqrt_ps(val);
1369 }
1370 template <class A>
1371 XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
1372 {
1373 return _mm_sqrt_pd(val);
1374 }
1375
1376
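// slide_left / slide_right: move the whole 128-bit register by N bytes, filling with zeros.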
1377 template <size_t N, class A, class T>
1378 XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<sse2>) noexcept
1379 {
1380 return _mm_slli_si128(x, N);
1381 }
1382
1383
1384 template <size_t N, class A, class T>
1385 XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<sse2>) noexcept
1386 {
1387 return _mm_srli_si128(x, N);
1388 }
1389
1390
1391
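// sadd: saturating addition; native 8/16-bit instructions, generic fallback for wider types.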
1392 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1393 XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
1394 {
1395 if (std::is_signed<T>::value)
1396 {
1397 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1398 {
1399 return _mm_adds_epi8(self, other);
1400 }
1401 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1402 {
1403 return _mm_adds_epi16(self, other);
1404 }
1405 else
1406 {
1407 return sadd(self, other, generic {});
1408 }
1409 }
1410 else
1411 {
1412 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1413 {
1414 return _mm_adds_epu8(self, other);
1415 }
1416 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1417 {
1418 return _mm_adds_epu16(self, other);
1419 }
1420 else
1421 {
1422 return sadd(self, other, generic {});
1423 }
1424 }
1425 }
1426
1427
1428 template <class A, class... Values>
1429 XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<sse2>, Values... values) noexcept
1430 {
1431 static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
1432 return _mm_setr_ps(values...);
1433 }
1434
1435 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1436 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1) noexcept
1437 {
1438 return _mm_set_epi64x(v1, v0);
1439 }
1440 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1441 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3) noexcept
1442 {
1443 return _mm_setr_epi32(v0, v1, v2, v3);
1444 }
1445 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1446 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
1447 {
1448 return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7);
1449 }
1450 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1451 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
1452 {
1453 return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
1454 }
1455
1456 template <class A, class... Values>
1457 XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<sse2>, Values... values) noexcept
1458 {
1459 static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
1460 return _mm_setr_pd(values...);
1461 }
1462
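// Boolean batches are materialized as all-ones / all-zero lanes, hence the -1LL / 0LL selection.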
1463 template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1464 XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sse2>, Values... values) noexcept
1465 {
1466 return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
1467 }
1468
1469 template <class A, class... Values>
1470 XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<sse2>, Values... values) noexcept
1471 {
1472 static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
1473 return _mm_castsi128_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data);
1474 }
1475
1476 template <class A, class... Values>
1477 XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<sse2>, Values... values) noexcept
1478 {
1479 static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
1480 return _mm_castsi128_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data);
1481 }
1482
1483
1484
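// ssub: saturating subtraction; native 8/16-bit instructions, generic fallback for wider types.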
1485 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1486 XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
1487 {
1488 if (std::is_signed<T>::value)
1489 {
1490 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1491 {
1492 return _mm_subs_epi8(self, other);
1493 }
1494 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1495 {
1496 return _mm_subs_epi16(self, other);
1497 }
1498 else
1499 {
1500 return ssub(self, other, generic {});
1501 }
1502 }
1503 else
1504 {
1505 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1506 {
1507 return _mm_subs_epu8(self, other);
1508 }
1509 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1510 {
1511 return _mm_subs_epu16(self, other);
1512 }
1513 else
1514 {
1515 return ssub(self, other, generic {});
1516 }
1517 }
1518 }
1519
1520
1521 template <class A>
1522 XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
1523 {
1524 return _mm_store_ps(mem, self);
1525 }
1526 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1527 XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
1528 {
1529 return _mm_store_si128((__m128i*)mem, self);
1530 }
1531 template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1532 XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
1533 {
1534 return _mm_store_si128((__m128i*)mem, self);
1535 }
1536 template <class A>
1537 XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
1538 {
1539 return _mm_store_pd(mem, self);
1540 }
1541
1542
        template <class A>
        XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_storeu_ps(mem, self);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_storeu_si128((__m128i*)mem, self);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_storeu_si128((__m128i*)mem, self);
        }
        template <class A>
        XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
        {
            return _mm_storeu_pd(mem, self);
        }

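        // sub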
        template <class A>
        XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_sub_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_sub_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_sub_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_sub_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_sub_epi64(self, other);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
        template <class A>
        XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_sub_pd(self, other);
        }

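        // swizzle
        // The compile-time indices are packed into the immediate operand expected by
        // _mm_shuffle_ps / _mm_shuffle_pd / _mm_shuffle_epi32 via detail::shuffle.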
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
        {
            constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
            return _mm_shuffle_ps(self, self, index);
        }

        template <class A, uint64_t V0, uint64_t V1>
        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
        {
            constexpr uint32_t index = detail::shuffle(V0, V1);
            return _mm_shuffle_pd(self, self, index);
        }

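        // SSE2 has no 64-bit integer shuffle, so each 64-bit index V is expanded into
        // the pair of 32-bit indices (2 * V, 2 * V + 1) and handled with _mm_shuffle_epi32.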
        template <class A, uint64_t V0, uint64_t V1>
        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
        {
            constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
            return _mm_shuffle_epi32(self, index);
        }

        template <class A, uint64_t V0, uint64_t V1>
        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1> mask, requires_arch<sse2>) noexcept
        {
            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, sse2 {}));
        }

        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
        {
            constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
            return _mm_shuffle_epi32(self, index);
        }

        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
        {
            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
        }

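        // transpose
        // In-place transpose of a square tile whose side equals the batch size.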
        template <class A>
        XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
            _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
            matrix_begin[0] = r0;
            matrix_begin[1] = r1;
            matrix_begin[2] = r2;
            matrix_begin[3] = r3;
        }
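        // The 32-bit and 64-bit integer transposes are pure lane permutations, so they
        // reuse the float/double implementations on the reinterpreted registers.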
        template <class A>
        XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<sse2>) noexcept
        {
            transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
        }
        template <class A>
        XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<sse2>) noexcept
        {
            transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
        }

        template <class A>
        XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<sse2>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
            matrix_begin[0] = _mm_unpacklo_pd(r0, r1);
            matrix_begin[1] = _mm_unpackhi_pd(r0, r1);
        }
        template <class A>
        XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<sse2>) noexcept
        {
            transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
        }
        template <class A>
        XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<sse2>) noexcept
        {
            transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
        }

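        // zip_hi
        // Interleaves the upper halves of the two operands.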
        template <class A>
        XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_unpackhi_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_unpackhi_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_unpackhi_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_unpackhi_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_unpackhi_epi64(self, other);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
        template <class A>
        XSIMD_INLINE batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_unpackhi_pd(self, other);
        }

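        // zip_lo
        // Interleaves the lower halves of the two operands.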
        template <class A>
        XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_unpacklo_ps(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_unpacklo_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_unpacklo_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_unpacklo_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_unpacklo_epi64(self, other);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }
        template <class A>
        XSIMD_INLINE batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
        {
            return _mm_unpacklo_pd(self, other);
        }
    }
}

#endif