#ifndef XSIMD_SSE3_HPP
#define XSIMD_SSE3_HPP

#include "../types/xsimd_sse3_register.hpp"
#include <type_traits>

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // haddp
        template <class A>
        XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse3>) noexcept
        {
            // Successive horizontal adds: result lane i holds the sum of the four lanes of row[i].
            return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
                               _mm_hadd_ps(row[2], row[3]));
        }
        template <class A>
        XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse3>) noexcept
        {
            return _mm_hadd_pd(row[0], row[1]);
        }

        // load_unaligned
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse3>) noexcept
        {
            // _mm_lddqu_si128 is tailored for unaligned integer loads, notably ones that straddle a cache-line boundary.
            return _mm_lddqu_si128((__m128i const*)mem);
        }

        // reduce_add
        template <class A>
        XSIMD_INLINE float reduce_add(batch<float, A> const& self, requires_arch<sse3>) noexcept
        {
            // Two horizontal adds fold the four lanes down into lane 0.
            __m128 tmp0 = _mm_hadd_ps(self, self);
            __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
            return _mm_cvtss_f32(tmp1);
        }
        template <class A>
        XSIMD_INLINE double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept
        {
            __m128d tmp0 = _mm_hadd_pd(self, self);
            return _mm_cvtsd_f64(tmp0);
        }

    }

}

#endif