#ifndef XSIMD_SSSE3_HPP
#define XSIMD_SSSE3_HPP

#include <cstddef>
#include <type_traits>

#include "../types/xsimd_ssse3_register.hpp"
#include "../types/xsimd_utils.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

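        // abs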
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<ssse3>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_abs_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_abs_epi16(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_abs_epi32(self);
            }
            else
            {
                // _mm_abs_epi64 requires AVX-512VL and is not available on SSSE3,
                // so 64-bit elements fall back to the SSE2-level implementation.
                return abs(self, sse2 {});
            }
        }

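        // extract_pair
        // The helpers below recurse over an index sequence so that the byte offset
        // handed to _mm_alignr_epi8 is always a compile-time constant.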
        namespace detail
        {

            template <class T, class A>
            XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& other, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
            {
                return other;
            }

            template <class T, class A, std::size_t I, std::size_t... Is>
            XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, ::xsimd::detail::index_sequence<I, Is...>) noexcept
            {
                if (i == I)
                {
                    return _mm_alignr_epi8(self, other, sizeof(T) * I);
                }
                else
                    return extract_pair(self, other, i, ::xsimd::detail::index_sequence<Is...>());
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<ssse3>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(0 <= i && i < size && "index in bounds");
            return detail::extract_pair(self, other, i, ::xsimd::detail::make_index_sequence<size>());
        }

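        // reduce_add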
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<ssse3>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                __m128i tmp1 = _mm_hadd_epi16(self, self);
                __m128i tmp2 = _mm_hadd_epi16(tmp1, tmp1);
                __m128i tmp3 = _mm_hadd_epi16(tmp2, tmp2);
                return _mm_cvtsi128_si32(tmp3) & 0xFFFF;
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                __m128i tmp1 = _mm_hadd_epi32(self, self);
                __m128i tmp2 = _mm_hadd_epi32(tmp1, tmp1);
                return _mm_cvtsi128_si32(tmp2);
            }
            else
            {
                return reduce_add(self, sse3 {});
            }
        }

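        // rotate_left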
        template <size_t N, class A>
        XSIMD_INLINE batch<uint16_t, A> rotate_left(batch<uint16_t, A> const& self, requires_arch<ssse3>) noexcept
        {
            return _mm_alignr_epi8(self, self, N);
        }
        template <size_t N, class A>
        XSIMD_INLINE batch<int16_t, A> rotate_left(batch<int16_t, A> const& self, requires_arch<ssse3>) noexcept
        {
            return bitwise_cast<int16_t>(rotate_left<N, A>(bitwise_cast<uint16_t>(self), ssse3 {}));
        }

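        // swizzle (dynamic mask)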
        template <class A>
        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<ssse3>) noexcept
        {
            return _mm_shuffle_epi8(self, mask);
        }
        template <class A>
        XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<ssse3>) noexcept
        {
            return _mm_shuffle_epi8(self, mask);
        }

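        // Generic element width: scale each element index by sizeof(T) (broadcast to
        // every byte through `comb`), add the per-byte offsets 0..sizeof(T)-1 (`pikes`),
        // and reuse the byte-level _mm_shuffle_epi8 overload above.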
        template <class A, class T, class IT>
        XSIMD_INLINE typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
        swizzle(batch<T, A> const& self, batch<IT, A> mask, requires_arch<ssse3>) noexcept
        {
            constexpr auto pikes = static_cast<as_unsigned_integer_t<T>>(0x0706050403020100ul);
            constexpr auto comb = static_cast<as_unsigned_integer_t<T>>(0x0101010101010101ul * sizeof(T));
            return bitwise_cast<T>(swizzle(bitwise_cast<uint8_t>(self), bitwise_cast<uint8_t>(mask * comb + pikes), ssse3 {}));
        }

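        // swizzle (constant mask)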
        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
        {
            constexpr batch_constant<uint8_t, A, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
                                     2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1>
                mask8;
            return _mm_shuffle_epi8(self, mask8.as_batch());
        }

        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
        {
            return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, ssse3 {}));
        }

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
        {
            return swizzle(self, mask.as_batch(), ssse3 {});
        }

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
        {
            return swizzle(self, mask.as_batch(), ssse3 {});
        }

    }

}

#endif