/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is available in the file LICENSE, distributed with     *
 * this software.                                                           *
 ****************************************************************************/
#ifndef XSIMD_SSE4_1_HPP
#define XSIMD_SSE4_1_HPP

#include <cassert> // assert, used by the mul() fallback below
#include <cstring> // memcpy, used by the insert() fallback below
#include <type_traits>

#include "../types/xsimd_sse4_1_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

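        // any
        // _mm_testz_si128(a, a) sets ZF exactly when all 128 bits of a are zero,
        // so at least one lane is non-zero precisely when the test returns 0.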
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE bool any(batch<T, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return !_mm_testz_si128(self, self);
        }

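        // ceil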
        template <class A>
        XSIMD_INLINE batch<float, A> ceil(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_ceil_ps(self);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> ceil(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_ceil_pd(self);
        }

        // fast_cast
        namespace detail
        {
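            // int64 -> double without AVX-512, via the well-known mantissa-stuffing
            // trick (cf. https://stackoverflow.com/questions/41144668): split each
            // lane as x = hi * 2^48 + lo, where hi is the sign-extended top 16 bits
            // and lo the low 48 bits. Each half is planted in the mantissa of a
            // large power-of-two double (3*2^67 for hi, 2^52 for lo); subtracting
            // the combined magic constant cancels the offsets, so f + xL
            // reconstructs x exactly, with no rounding error.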
            template <class A>
            XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
            {
                __m128i xH = _mm_srai_epi32(x, 16);
                xH = _mm_blend_epi16(xH, _mm_setzero_si128(), 0x33);                                      // sign-extended top 16 bits in the upper word of each lane
                xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.)));            // 3*2^67
                __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); // 2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.));        // 3*2^67 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }

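            // uint64 -> double, same mantissa-stuffing idea: the high 32 bits are
            // OR'ed into the mantissa of 2^84, the low 32 bits into the mantissa of
            // 2^52; (2^84 + hi*2^32) - (2^84 + 2^52) + (2^52 + lo) = hi*2^32 + lo,
            // and every intermediate step is exact in double precision.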
            template <class A>
            XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
            {
                __m128i xH = _mm_srli_epi64(x, 32);
                xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));        // 2^84
                __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.));   // 2^84 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }
        }

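        // eq
        // SSE4.1 adds a native 64-bit integer compare; narrower widths defer to
        // the SSSE3 implementation.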
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_cmpeq_epi64(self, other);
            }
            else
            {
                return eq(self, other, ssse3 {});
            }
        }

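        // floor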
        template <class A>
        XSIMD_INLINE batch<float, A> floor(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_floor_ps(self);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> floor(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_floor_pd(self);
        }

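        // insert
        // Native lane insertion with an immediate index for 8-, 32- and (on x86-64)
        // 64-bit lanes; 16-bit lanes defer to the SSSE3 implementation.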
        template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_insert_epi8(self, val, I);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_insert_epi32(self, val, I);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
#if (!defined(_MSC_VER) && __x86_64__) || (_MSC_VER > 1900 && defined(_M_X64))
                return _mm_insert_epi64(self, val, I);
#else
                // _mm_insert_epi64 is unavailable in 32-bit builds (and in older
                // MSVC); emulate it with two 32-bit insertions into the same lane.
                uint32_t lo, hi;
                memcpy(&lo, reinterpret_cast<uint32_t*>(&val), sizeof(lo));
                memcpy(&hi, reinterpret_cast<uint32_t*>(&val) + 1, sizeof(hi));
                return _mm_insert_epi32(_mm_insert_epi32(self, lo, 2 * I), hi, 2 * I + 1);
#endif
            }
            else
            {
                return insert(self, val, pos, ssse3 {});
            }
        }

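        // max
        // SSE4.1 provides signed and unsigned min/max for 8-, 16- and 32-bit
        // lanes; 64-bit lanes fall back to the SSSE3 implementation.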
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_max_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_max_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_max_epi32(self, other);
                }
                else
                {
                    return max(self, other, ssse3 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_max_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_max_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_max_epu32(self, other);
                }
                else
                {
                    return max(self, other, ssse3 {});
                }
            }
        }

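        // min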
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_min_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_min_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_min_epi32(self, other);
                }
                else
                {
                    return min(self, other, ssse3 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_min_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_min_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_min_epu32(self, other);
                }
                else
                {
                    return min(self, other, ssse3 {});
                }
            }
        }

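        // mul
        // SSE4.1 has no epi8 or epi64 multiply. 8-bit lanes are computed as two
        // 16-bit multiplies (even and odd bytes) recombined; 64-bit lanes are
        // built from 32-bit partial products, lo*lo + ((hi1*lo2 + lo1*hi2) << 32),
        // discarding bits above 2^64 exactly like scalar wrap-around multiply.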
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                // even bytes via a masked 16-bit multiply, odd bytes via a shifted one
                return _mm_or_si128(
                    _mm_and_si128(_mm_mullo_epi16(self, other), _mm_srli_epi16(_mm_cmpeq_epi8(self, self), 8)),
                    _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_epi16(self, 8), _mm_srli_epi16(other, 8)), 8));
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_mullo_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_mullo_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                // lo*lo plus the two cross products shifted into the upper half
                return _mm_add_epi64(
                    _mm_mul_epu32(self, other),
                    _mm_slli_epi64(
                        _mm_add_epi64(
                            _mm_mul_epu32(other, _mm_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))),
                            _mm_mul_epu32(self, _mm_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))),
                        32));
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

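        // nearbyint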
        template <class A>
        XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
        }

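        // select
        // The helper below duplicates each low bit of an N-lane mask into a pair
        // of adjacent bits (bit i -> bits 2i and 2i+1), so a 32- or 64-bit lane
        // mask can drive _mm_blend_epi16, which takes one mask bit per 16-bit word.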
        namespace detail
        {
            template <class T>
            XSIMD_INLINE constexpr T interleave(T const& cond) noexcept
            {
                return (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 49) & 0x5555)
                    | (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 48) & 0xAAAA);
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_epi8(false_br, true_br, cond);
        }
        template <class A>
        XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_ps(false_br, true_br, cond);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_pd(false_br, true_br, cond);
        }

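        // When the mask is known at compile time, a blend with an immediate mask
        // is cheaper than blendv; 32- and 64-bit lane masks are widened to a
        // per-word mask with detail::interleave so _mm_blend_epi16 applies.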
        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_blend_epi16(false_br, true_br, mask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                constexpr int imask = detail::interleave(mask);
                return _mm_blend_epi16(false_br, true_br, imask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                constexpr int imask = detail::interleave(mask);
                constexpr int imask2 = detail::interleave(imask);
                return _mm_blend_epi16(false_br, true_br, imask2);
            }
            else
            {
                return select(batch_bool_constant<T, A, Values...>(), true_br, false_br, ssse3 {});
            }
        }
        template <class A, bool... Values>
        XSIMD_INLINE batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<float, A, Values...>::mask();
            return _mm_blend_ps(false_br, true_br, mask);
        }
        template <class A, bool... Values>
        XSIMD_INLINE batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<double, A, Values...>::mask();
            return _mm_blend_pd(false_br, true_br, mask);
        }

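        // trunc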
        template <class A>
        XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_ps(self, _MM_FROUND_TO_ZERO);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> trunc(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
        }

    }

}

#endif