/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is available in the file LICENSE, distributed with     *
 * this software.                                                           *
 ****************************************************************************/
#ifndef XSIMD_SSE4_1_HPP
#define XSIMD_SSE4_1_HPP

#include <cassert> // assert, used by the mul() fallback below
#include <cstring> // memcpy, used by the insert() fallback below
#include <type_traits>

#include "../types/xsimd_sse4_1_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

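        // any
        // _mm_testz_si128(a, a) sets ZF exactly when all 128 bits of a are zero,
        // so at least one lane is non-zero precisely when the test returns 0.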
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE bool any(batch<T, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return !_mm_testz_si128(self, self);
        }

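        // ceil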
        template <class A>
        XSIMD_INLINE batch<float, A> ceil(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_ceil_ps(self);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> ceil(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_ceil_pd(self);
        }

        // fast_cast
        namespace detail
        {
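            // int64 -> double without AVX-512, via the well-known mantissa-stuffing
            // trick (cf. https://stackoverflow.com/questions/41144668): split each
            // lane as x = hi * 2^48 + lo, where hi is the sign-extended top 16 bits
            // and lo the low 48 bits. Each half is planted in the mantissa of a
            // large power-of-two double (3*2^67 for hi, 2^52 for lo); subtracting
            // the combined magic constant cancels the offsets, so f + xL
            // reconstructs x exactly, with no rounding error.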
            template <class A>
            XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
            {
                __m128i xH = _mm_srai_epi32(x, 16);
                xH = _mm_blend_epi16(xH, _mm_setzero_si128(), 0x33);                                      // sign-extended top 16 bits in the upper word of each lane
                xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.)));            // 3*2^67
                __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); // 2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.));        // 3*2^67 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }

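            // uint64 -> double, same mantissa-stuffing idea: the high 32 bits are
            // OR'ed into the mantissa of 2^84, the low 32 bits into the mantissa of
            // 2^52; (2^84 + hi*2^32) - (2^84 + 2^52) + (2^52 + lo) = hi*2^32 + lo,
            // and every intermediate step is exact in double precision.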
            template <class A>
            XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
            {
                __m128i xH = _mm_srli_epi64(x, 32);
                xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.)));        // 2^84
                __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.));   // 2^84 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }
        }

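        // eq
        // SSE4.1 adds a native 64-bit integer compare; narrower widths defer to
        // the SSSE3 implementation.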
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_cmpeq_epi64(self, other);
            }
            else
            {
                return eq(self, other, ssse3 {});
            }
        }

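        // floor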
        template <class A>
        XSIMD_INLINE batch<float, A> floor(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_floor_ps(self);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> floor(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_floor_pd(self);
        }

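        // insert
        // Native lane insertion with an immediate index for 8-, 32- and (on x86-64)
        // 64-bit lanes; 16-bit lanes defer to the SSSE3 implementation.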
        template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_insert_epi8(self, val, I);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_insert_epi32(self, val, I);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
#if (!defined(_MSC_VER) && __x86_64__) || (_MSC_VER > 1900 && defined(_M_X64))
                return _mm_insert_epi64(self, val, I);
#else
                // _mm_insert_epi64 is unavailable in 32-bit builds (and in older
                // MSVC); emulate it with two 32-bit insertions into the same lane.
                uint32_t lo, hi;
                memcpy(&lo, reinterpret_cast<uint32_t*>(&val), sizeof(lo));
                memcpy(&hi, reinterpret_cast<uint32_t*>(&val) + 1, sizeof(hi));
                return _mm_insert_epi32(_mm_insert_epi32(self, lo, 2 * I), hi, 2 * I + 1);
#endif
            }
            else
            {
                return insert(self, val, pos, ssse3 {});
            }
        }

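        // max
        // SSE4.1 provides signed and unsigned min/max for 8-, 16- and 32-bit
        // lanes; 64-bit lanes fall back to the SSSE3 implementation.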
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_max_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_max_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_max_epi32(self, other);
                }
                else
                {
                    return max(self, other, ssse3 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_max_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_max_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_max_epu32(self, other);
                }
                else
                {
                    return max(self, other, ssse3 {});
                }
            }
        }

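        // min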
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_min_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_min_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_min_epi32(self, other);
                }
                else
                {
                    return min(self, other, ssse3 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_min_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_min_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_min_epu32(self, other);
                }
                else
                {
                    return min(self, other, ssse3 {});
                }
            }
        }

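        // mul
        // SSE4.1 has no epi8 or epi64 multiply. 8-bit lanes are computed as two
        // 16-bit multiplies (even and odd bytes) recombined; 64-bit lanes are
        // built from 32-bit partial products, lo*lo + ((hi1*lo2 + lo1*hi2) << 32),
        // discarding bits above 2^64 exactly like scalar wrap-around multiply.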
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                // even bytes via a masked 16-bit multiply, odd bytes via a shifted one
                return _mm_or_si128(
                    _mm_and_si128(_mm_mullo_epi16(self, other), _mm_srli_epi16(_mm_cmpeq_epi8(self, self), 8)),
                    _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_epi16(self, 8), _mm_srli_epi16(other, 8)), 8));
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_mullo_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_mullo_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                // lo*lo plus the two cross products shifted into the upper half
                return _mm_add_epi64(
                    _mm_mul_epu32(self, other),
                    _mm_slli_epi64(
                        _mm_add_epi64(
                            _mm_mul_epu32(other, _mm_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))),
                            _mm_mul_epu32(self, _mm_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))),
                        32));
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

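        // nearbyint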
        template <class A>
        XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
        }

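        // select
        // The helper below duplicates each low bit of an N-lane mask into a pair
        // of adjacent bits (bit i -> bits 2i and 2i+1), so a 32- or 64-bit lane
        // mask can drive _mm_blend_epi16, which takes one mask bit per 16-bit word.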
        namespace detail
        {
            template <class T>
            XSIMD_INLINE constexpr T interleave(T const& cond) noexcept
            {
                return (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 49) & 0x5555)
                    | (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 48) & 0xAAAA);
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_epi8(false_br, true_br, cond);
        }
        template <class A>
        XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_ps(false_br, true_br, cond);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_pd(false_br, true_br, cond);
        }

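        // When the mask is known at compile time, a blend with an immediate mask
        // is cheaper than blendv; 32- and 64-bit lane masks are widened to a
        // per-word mask with detail::interleave so _mm_blend_epi16 applies.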
        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_blend_epi16(false_br, true_br, mask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                constexpr int imask = detail::interleave(mask);
                return _mm_blend_epi16(false_br, true_br, imask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                constexpr int imask = detail::interleave(mask);
                constexpr int imask2 = detail::interleave(imask);
                return _mm_blend_epi16(false_br, true_br, imask2);
            }
            else
            {
                return select(batch_bool_constant<T, A, Values...>(), true_br, false_br, ssse3 {});
            }
        }
        template <class A, bool... Values>
        XSIMD_INLINE batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<float, A, Values...>::mask();
            return _mm_blend_ps(false_br, true_br, mask);
        }
        template <class A, bool... Values>
        XSIMD_INLINE batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<double, A, Values...>::mask();
            return _mm_blend_pd(false_br, true_br, mask);
        }

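        // trunc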
        template <class A>
        XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_ps(self, _MM_FROUND_TO_ZERO);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> trunc(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
        }

    }

}

#endif