#ifndef XSIMD_SSSE3_HPP
#define XSIMD_SSSE3_HPP

#include <cstddef>
#include <type_traits>

#include "../types/xsimd_ssse3_register.hpp"
#include "../types/xsimd_utils.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

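        // abs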
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<ssse3>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_abs_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_abs_epi16(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_abs_epi32(self);
            }
            else
            {
                // _mm_abs_epi64 requires AVX-512VL and is not available on SSSE3,
                // so 64-bit elements fall back to the SSE2-level implementation.
                return abs(self, sse2 {});
            }
        }

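        // extract_pair
        // The helpers below recurse over an index sequence so that the byte offset
        // handed to _mm_alignr_epi8 is always a compile-time constant.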
        namespace detail
        {

            template <class T, class A>
            XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& other, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
            {
                return other;
            }

            template <class T, class A, std::size_t I, std::size_t... Is>
            XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, ::xsimd::detail::index_sequence<I, Is...>) noexcept
            {
                if (i == I)
                {
                    return _mm_alignr_epi8(self, other, sizeof(T) * I);
                }
                else
                    return extract_pair(self, other, i, ::xsimd::detail::index_sequence<Is...>());
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<ssse3>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(0 <= i && i < size && "index in bounds");
            return detail::extract_pair(self, other, i, ::xsimd::detail::make_index_sequence<size>());
        }

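        // reduce_add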
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<ssse3>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                __m128i tmp1 = _mm_hadd_epi16(self, self);
                __m128i tmp2 = _mm_hadd_epi16(tmp1, tmp1);
                __m128i tmp3 = _mm_hadd_epi16(tmp2, tmp2);
                return _mm_cvtsi128_si32(tmp3) & 0xFFFF;
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                __m128i tmp1 = _mm_hadd_epi32(self, self);
                __m128i tmp2 = _mm_hadd_epi32(tmp1, tmp1);
                return _mm_cvtsi128_si32(tmp2);
            }
            else
            {
                return reduce_add(self, sse3 {});
            }
        }

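        // rotate_left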
        template <size_t N, class A>
        XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<ssse3>) noexcept
        {
            return _mm_alignr_epi8(self, self, N);
        }
        template <size_t N, class A>
        XSIMD_INLINE batch<int16_t, A> rotate_left(batch<int16_t, A> const& self, requires_arch<ssse3>) noexcept
        {
            return bitwise_cast<int16_t>(rotate_left<N, A>(bitwise_cast<uint16_t>(self), ssse3 {}));
        }

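        // swizzle (dynamic mask)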
        template <class A>
        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<ssse3>) noexcept
        {
            return _mm_shuffle_epi8(self, mask);
        }
        template <class A>
        XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<ssse3>) noexcept
        {
            return _mm_shuffle_epi8(self, mask);
        }

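        // Generic element width: scale each element index by sizeof(T) (broadcast to
        // every byte through `comb`), add the per-byte offsets 0..sizeof(T)-1 (`pikes`),
        // and reuse the byte-level _mm_shuffle_epi8 overload above.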
        template <class A, class T, class IT>
        XSIMD_INLINE typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
        swizzle(batch<T, A> const& self, batch<IT, A> mask, requires_arch<ssse3>) noexcept
        {
            constexpr auto pikes = static_cast<as_unsigned_integer_t<T>>(0x0706050403020100ul);
            constexpr auto comb = static_cast<as_unsigned_integer_t<T>>(0x0101010101010101ul * sizeof(T));
            return bitwise_cast<T>(swizzle(bitwise_cast<uint8_t>(self), bitwise_cast<uint8_t>(mask * comb + pikes), ssse3 {}));
        }

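        // swizzle (constant mask)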
        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
        {
            constexpr batch_constant<uint8_t, A, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
                                     2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1>
                mask8;
            return _mm_shuffle_epi8(self, mask8.as_batch());
        }

        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
        {
            return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, ssse3 {}));
        }

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
        {
            return swizzle(self, mask.as_batch(), ssse3 {});
        }

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
        {
            return swizzle(self, mask.as_batch(), ssse3 {});
        }

    }

}

#endif