// File indexing completed on 2025-01-31 10:25:46
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028 #ifndef VC_SSE_VECTORHELPER_H_
0029 #define VC_SSE_VECTORHELPER_H_
0030
0031 #include "types.h"
0032 #include "../common/loadstoreflags.h"
0033 #include <limits>
0034 #include "const_data.h"
0035 #include "macros.h"
0036
0037 namespace Vc_VERSIONED_NAMESPACE
0038 {
0039 namespace SSE
0040 {
0041 #define Vc_OP0(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name() { return code; }
0042 #define Vc_OP1(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a) { return code; }
0043 #define Vc_OP2(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b) { return code; }
0044 #define Vc_OP3(name, code) static Vc_ALWAYS_INLINE Vc_CONST VectorType name(const VectorType a, const VectorType b, const VectorType c) { return code; }
0045
0046 template<> struct VectorHelper<__m128>
0047 {
0048 typedef __m128 VectorType;
0049
0050 template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_ps(x); }
0051 template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_ps(x); }
0052 template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const float *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
0053
0054 template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_ps(mem, x); }
0055 template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_ps(mem, x); }
0056 template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_ps(mem, x); }
0057 template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
0058
0059
0060 template<typename Flags> static Vc_ALWAYS_INLINE void store(float *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castps_si128(x), _mm_castps_si128(m), reinterpret_cast<char *>(mem)); }
0061
0062 Vc_OP0(allone, _mm_setallone_ps())
0063 Vc_OP0(zero, _mm_setzero_ps())
0064 Vc_OP3(blend, blendv_ps(a, b, c))
0065 };
0066
0067
0068 template<> struct VectorHelper<__m128d>
0069 {
0070 typedef __m128d VectorType;
0071
0072 template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_pd(x); }
0073 template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_pd(x); }
0074 template<typename Flags> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const double *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
0075
0076 template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_pd(mem, x); }
0077 template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_pd(mem, x); }
0078 template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_pd(mem, x); }
0079 template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
0080
0081
0082 template<typename Flags> static Vc_ALWAYS_INLINE void store(double *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(_mm_castpd_si128(x), _mm_castpd_si128(m), reinterpret_cast<char *>(mem)); }
0083
0084 Vc_OP0(allone, _mm_setallone_pd())
0085 Vc_OP0(zero, _mm_setzero_pd())
0086 Vc_OP3(blend, blendv_pd(a, b, c))
0087 };
0088
0089 template<> struct VectorHelper<__m128i>
0090 {
0091 typedef __m128i VectorType;
0092
0093 template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfAligned = nullptr) { return _mm_load_si128(reinterpret_cast<const VectorType *>(x)); }
0094 template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfUnaligned = nullptr) { return _mm_loadu_si128(reinterpret_cast<const VectorType *>(x)); }
0095 template<typename Flags, typename T> static Vc_ALWAYS_INLINE Vc_PURE VectorType load(const T *x, typename Flags::EnableIfStreaming = nullptr) { return _mm_stream_load(x); }
0096
0097 template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfAligned = nullptr) { _mm_store_si128(reinterpret_cast<VectorType *>(mem), x); }
0098 template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedNotStreaming = nullptr) { _mm_storeu_si128(reinterpret_cast<VectorType *>(mem), x); }
0099 template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfStreaming = nullptr) { _mm_stream_si128(reinterpret_cast<VectorType *>(mem), x); }
0100 template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, typename Flags::EnableIfUnalignedAndStreaming = nullptr) { _mm_maskmoveu_si128(x, _mm_setallone_si128(), reinterpret_cast<char *>(mem)); }
0101
0102
0103 template<typename Flags, typename T> static Vc_ALWAYS_INLINE void store(T *mem, VectorType x, VectorType m) { _mm_maskmoveu_si128(x, m, reinterpret_cast<char *>(mem)); }
0104
0105 Vc_OP0(allone, _mm_setallone_si128())
0106 Vc_OP0(zero, _mm_setzero_si128())
0107 Vc_OP3(blend, blendv_epi8(a, b, c))
0108 };
0109
0110 #undef Vc_OP1
0111 #undef Vc_OP2
0112 #undef Vc_OP3
0113
0114 #define Vc_OP1(op) \
0115 static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a) { return Vc_CAT2(_mm_##op##_, Vc_SUFFIX)(a); }
0116 #define Vc_OP(op) \
0117 static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op##_ , Vc_SUFFIX)(a, b); }
0118 #define Vc_OP_(op) \
0119 static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op , Vc_SUFFIX)(a, b); }
0120 #define Vc_OPx(op, op2) \
0121 static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_##op2##_, Vc_SUFFIX)(a, b); }
0122 #define Vc_OP_CAST_(op) \
0123 static Vc_ALWAYS_INLINE Vc_CONST VectorType op(const VectorType a, const VectorType b) { return Vc_CAT2(_mm_castps_, Vc_SUFFIX)( \
0124 _mm_##op##ps(Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(a), \
0125 Vc_CAT2(Vc_CAT2(_mm_cast, Vc_SUFFIX), _ps)(b))); \
0126 }
0127 #define Vc_MINMAX \
0128 static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return Vc_CAT2(_mm_min_, Vc_SUFFIX)(a, b); } \
0129 static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return Vc_CAT2(_mm_max_, Vc_SUFFIX)(a, b); }
0130
0131 template<> struct VectorHelper<double> {
0132 typedef __m128d VectorType;
0133 typedef double EntryType;
0134 #define Vc_SUFFIX pd
0135
0136 Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
0137 static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_pd(mask), a); }
0138 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
0139 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const double a, const double b) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b); }
0140 static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
0141 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
0142
0143 #ifdef Vc_IMPL_FMA4
0144 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
0145 v1 = _mm_macc_pd(v1, v2, v3);
0146 }
0147 #else
0148 static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
0149 VectorType h1 = _mm_and_pd(v1, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
0150 VectorType h2 = _mm_and_pd(v2, _mm_load_pd(reinterpret_cast<const double *>(&c_general::highMaskDouble)));
0151 #if defined(Vc_GCC) && Vc_GCC < 0x40703
0152
0153
0154 asm("":"+x"(h1), "+x"(h2));
0155 #endif
0156 const VectorType l1 = _mm_sub_pd(v1, h1);
0157 const VectorType l2 = _mm_sub_pd(v2, h2);
0158 const VectorType ll = mul(l1, l2);
0159 const VectorType lh = add(mul(l1, h2), mul(h1, l2));
0160 const VectorType hh = mul(h1, h2);
0161
0162 const VectorType lh_lt_v3 = _mm_cmplt_pd(abs(lh), abs(v3));
0163 const VectorType b = blendv_pd(v3, lh, lh_lt_v3);
0164 const VectorType c = blendv_pd(lh, v3, lh_lt_v3);
0165 v1 = add(add(ll, b), add(c, hh));
0166 }
0167 #endif
0168
0169 Vc_OP(add) Vc_OP(sub) Vc_OP(mul)
0170
0171 Vc_OP1(sqrt)
0172 static Vc_ALWAYS_INLINE Vc_CONST VectorType rsqrt(VectorType x) {
0173 return _mm_div_pd(one(), sqrt(x));
0174 }
0175 static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
0176 return _mm_div_pd(one(), x);
0177 }
0178 static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
0179 return _mm_cmpunord_pd(x, x);
0180 }
0181 static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
0182 return _mm_cmpord_pd(x, _mm_mul_pd(zero(), x));
0183 }
0184 static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
0185 return _mm_castsi128_pd(cmpeq_epi64(_mm_castpd_si128(abs(x)), _mm_castpd_si128(_mm_load_pd(c_log<double>::d(1)))));
0186 }
0187 static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
0188 return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_pd());
0189 }
0190
0191 Vc_MINMAX
0192 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
0193 a = _mm_min_sd(a, _mm_unpackhi_pd(a, a));
0194 return _mm_cvtsd_f64(a);
0195 }
0196 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
0197 a = _mm_max_sd(a, _mm_unpackhi_pd(a, a));
0198 return _mm_cvtsd_f64(a);
0199 }
0200 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
0201 a = _mm_mul_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
0202 return _mm_cvtsd_f64(a);
0203 }
0204 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
0205 a = _mm_add_sd(a, _mm_shuffle_pd(a, a, _MM_SHUFFLE2(0, 1)));
0206 return _mm_cvtsd_f64(a);
0207 }
0208 #undef Vc_SUFFIX
0209 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
0210 #ifdef Vc_IMPL_SSE4_1
0211 return _mm_round_pd(a, _MM_FROUND_NINT);
0212 #else
0213
0214 return _mm_cvtepi32_pd(_mm_cvtpd_epi32(a));
0215 #endif
0216 }
0217 };
0218
0219 template<> struct VectorHelper<float> {
0220 typedef float EntryType;
0221 typedef __m128 VectorType;
0222 #define Vc_SUFFIX ps
0223
0224 Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
0225 static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(mask, a); }
0226 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
0227 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const float a, const float b, const float c, const float d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
0228 static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
0229 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
0230 static Vc_ALWAYS_INLINE Vc_CONST __m128 concat(__m128d a, __m128d b) { return _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)); }
0231
0232 #ifdef Vc_IMPL_FMA4
0233 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
0234 v1 = _mm_macc_ps(v1, v2, v3);
0235 }
0236 #else
0237 static inline void fma(VectorType &v1, VectorType v2, VectorType v3) {
0238 __m128d v1_0 = _mm_cvtps_pd(v1);
0239 __m128d v1_1 = _mm_cvtps_pd(_mm_movehl_ps(v1, v1));
0240 __m128d v2_0 = _mm_cvtps_pd(v2);
0241 __m128d v2_1 = _mm_cvtps_pd(_mm_movehl_ps(v2, v2));
0242 __m128d v3_0 = _mm_cvtps_pd(v3);
0243 __m128d v3_1 = _mm_cvtps_pd(_mm_movehl_ps(v3, v3));
0244 v1 = _mm_movelh_ps(
0245 _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_0, v2_0), v3_0)),
0246 _mm_cvtpd_ps(_mm_add_pd(_mm_mul_pd(v1_1, v2_1), v3_1)));
0247 }
0248 #endif
0249
0250 Vc_OP(add) Vc_OP(sub) Vc_OP(mul)
0251
0252 Vc_OP1(sqrt) Vc_OP1(rsqrt)
0253 static Vc_ALWAYS_INLINE Vc_CONST VectorType isNaN(VectorType x) {
0254 return _mm_cmpunord_ps(x, x);
0255 }
0256 static Vc_ALWAYS_INLINE Vc_CONST VectorType isFinite(VectorType x) {
0257 return _mm_cmpord_ps(x, _mm_mul_ps(zero(), x));
0258 }
0259 static Vc_ALWAYS_INLINE Vc_CONST VectorType isInfinite(VectorType x) {
0260 return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(abs(x)), _mm_castps_si128(_mm_load_ps(c_log<float>::d(1)))));
0261 }
0262 static Vc_ALWAYS_INLINE Vc_CONST VectorType reciprocal(VectorType x) {
0263 return _mm_rcp_ps(x);
0264 }
0265 static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) {
0266 return Vc_CAT2(_mm_and_, Vc_SUFFIX)(a, _mm_setabsmask_ps());
0267 }
0268
0269 Vc_MINMAX
0270 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
0271 a = _mm_min_ps(a, _mm_movehl_ps(a, a));
0272 a = _mm_min_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
0273 return _mm_cvtss_f32(a);
0274 }
0275 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
0276 a = _mm_max_ps(a, _mm_movehl_ps(a, a));
0277 a = _mm_max_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
0278 return _mm_cvtss_f32(a);
0279 }
0280 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
0281 a = _mm_mul_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
0282 a = _mm_mul_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
0283 return _mm_cvtss_f32(a);
0284 }
0285 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
0286 a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 1, 2, 3)));
0287 a = _mm_add_ss(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 2, 0, 1)));
0288 return _mm_cvtss_f32(a);
0289 }
0290 #undef Vc_SUFFIX
0291 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) {
0292 #ifdef Vc_IMPL_SSE4_1
0293 return _mm_round_ps(a, _MM_FROUND_NINT);
0294 #else
0295
0296 return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
0297 #endif
0298 }
0299 };
0300
0301 template<> struct VectorHelper<int> {
0302 typedef int EntryType;
0303 typedef __m128i VectorType;
0304 #define Vc_SUFFIX si128
0305
0306 Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
0307 static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
0308 static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
0309 #undef Vc_SUFFIX
0310 #define Vc_SUFFIX epi32
0311 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
0312
0313 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
0314 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const int a, const int b, const int c, const int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
0315
0316 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
0317
0318 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
0319 return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
0320 }
0321 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
0322 return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
0323 }
0324 static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi32(a); }
0325 static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epi32(a, b); }
0326 static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epi32(a, b); }
0327 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
0328 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0329
0330 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0331 return _mm_cvtsi128_si32(a);
0332 }
0333 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
0334 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0335
0336 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0337 return _mm_cvtsi128_si32(a);
0338 }
0339 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
0340 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0341 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0342 return _mm_cvtsi128_si32(a);
0343 }
0344 #ifdef Vc_IMPL_SSE4_1
0345 static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(VectorType a, VectorType b) { return _mm_mullo_epi32(a, b); }
0346 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
0347 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0348 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0349 return _mm_cvtsi128_si32(a);
0350 }
0351 #else
0352 static inline Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
0353 const VectorType aShift = _mm_srli_si128(a, 4);
0354 const VectorType ab02 = _mm_mul_epu32(a, b);
0355 const VectorType bShift = _mm_srli_si128(b, 4);
0356 const VectorType ab13 = _mm_mul_epu32(aShift, bShift);
0357 return _mm_unpacklo_epi32(_mm_shuffle_epi32(ab02, 8), _mm_shuffle_epi32(ab13, 8));
0358 }
0359 #endif
0360
0361 Vc_OP(add) Vc_OP(sub)
0362 #undef Vc_SUFFIX
0363 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
0364 };
0365
0366 template<> struct VectorHelper<unsigned int> {
0367 typedef unsigned int EntryType;
0368 typedef __m128i VectorType;
0369 #define Vc_SUFFIX si128
0370 Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
0371 static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
0372 static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
0373
0374 #undef Vc_SUFFIX
0375 #define Vc_SUFFIX epu32
0376 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
0377
0378 static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu32(a, b); }
0379 static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu32(a, b); }
0380 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
0381 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0382
0383 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0384 return _mm_cvtsi128_si32(a);
0385 }
0386 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
0387 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0388
0389 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0390 return _mm_cvtsi128_si32(a);
0391 }
0392 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
0393 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0394
0395 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0396 return _mm_cvtsi128_si32(a);
0397 }
0398 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
0399 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0400
0401 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0402 return _mm_cvtsi128_si32(a);
0403 }
0404
0405 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
0406
0407 static Vc_ALWAYS_INLINE Vc_CONST VectorType mul(const VectorType a, const VectorType b) {
0408 return VectorHelper<int>::mul(a, b);
0409 }
0410
0411
0412
0413
0414
0415
0416
0417
0418
0419
0420
0421
0422
0423
0424
0425
0426
0427
0428
0429 #undef Vc_SUFFIX
0430 #define Vc_SUFFIX epi32
0431 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
0432 return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
0433 }
0434 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
0435 return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
0436 }
0437 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
0438 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const unsigned int a, const unsigned int b, const unsigned int c, const unsigned int d) { return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d); }
0439
0440 Vc_OP(add) Vc_OP(sub)
0441 #undef Vc_SUFFIX
0442 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
0443 };
0444
0445 template<> struct VectorHelper<signed short> {
0446 typedef __m128i VectorType;
0447 typedef signed short EntryType;
0448 #define Vc_SUFFIX si128
0449
0450 Vc_OP_(or_) Vc_OP_(and_) Vc_OP_(xor_)
0451 static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
0452 static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
0453 static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packs_epi32(a, b); }
0454 static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16); }
0455 static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_srai_epi32(_mm_unpackhi_epi16(x, x), 16); }
0456
0457 #undef Vc_SUFFIX
0458 #define Vc_SUFFIX epi16
0459 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
0460
0461 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
0462 return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
0463 }
0464 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
0465 return Vc_CAT2(_mm_srai_, Vc_SUFFIX)(a, shift);
0466 }
0467 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
0468 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c, const EntryType d,
0469 const EntryType e, const EntryType f, const EntryType g, const EntryType h) {
0470 return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
0471 }
0472
0473 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) {
0474 v1 = add(mul(v1, v2), v3); }
0475
0476 static Vc_ALWAYS_INLINE Vc_CONST VectorType abs(const VectorType a) { return abs_epi16(a); }
0477
0478 Vc_OPx(mul, mullo)
0479 Vc_OP(min) Vc_OP(max)
0480 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
0481
0482 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0483 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0484 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0485 return _mm_cvtsi128_si32(a);
0486 }
0487 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
0488
0489 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0490 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0491 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0492 return _mm_cvtsi128_si32(a);
0493 }
0494 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
0495 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0496 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0497 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0498 return _mm_cvtsi128_si32(a);
0499 }
0500 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
0501 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0502 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0503 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0504 return _mm_cvtsi128_si32(a);
0505 }
0506
0507 Vc_OP(add) Vc_OP(sub)
0508 #undef Vc_SUFFIX
0509 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
0510 };
0511
0512 template<> struct VectorHelper<unsigned short> {
0513 typedef __m128i VectorType;
0514 typedef unsigned short EntryType;
0515 #define Vc_SUFFIX si128
0516 Vc_OP_CAST_(or_) Vc_OP_CAST_(and_) Vc_OP_CAST_(xor_)
0517 static Vc_ALWAYS_INLINE Vc_CONST VectorType zero() { return Vc_CAT2(_mm_setzero_, Vc_SUFFIX)(); }
0518 static Vc_ALWAYS_INLINE Vc_CONST VectorType notMaskedToZero(VectorType a, __m128 mask) { return Vc_CAT2(_mm_and_, Vc_SUFFIX)(_mm_castps_si128(mask), a); }
0519 #ifdef Vc_IMPL_SSE4_1
0520 static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); }
0521 #else
0522
0523 static Vc_ALWAYS_INLINE Vc_CONST __m128i concat(__m128i a, __m128i b) {
0524 auto tmp0 = _mm_unpacklo_epi16(a, b);
0525 auto tmp1 = _mm_unpackhi_epi16(a, b);
0526 auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
0527 auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
0528 return _mm_unpacklo_epi16(tmp2, tmp3);
0529 }
0530 #endif
0531 static Vc_ALWAYS_INLINE Vc_CONST __m128i expand0(__m128i x) { return _mm_unpacklo_epi16(x, _mm_setzero_si128()); }
0532 static Vc_ALWAYS_INLINE Vc_CONST __m128i expand1(__m128i x) { return _mm_unpackhi_epi16(x, _mm_setzero_si128()); }
0533
0534 #undef Vc_SUFFIX
0535 #define Vc_SUFFIX epu16
0536 static Vc_ALWAYS_INLINE Vc_CONST VectorType one() { return Vc_CAT2(_mm_setone_, Vc_SUFFIX)(); }
0537
0538
0539
0540
0541
0542
0543
0544
0545
0546
0547
0548
0549
0550
0551
0552
0553
0554
0555
0556 #if !defined(USE_INCORRECT_UNSIGNED_COMPARE) || Vc_IMPL_SSE4_1
0557 static Vc_ALWAYS_INLINE Vc_CONST VectorType min(VectorType a, VectorType b) { return min_epu16(a, b); }
0558 static Vc_ALWAYS_INLINE Vc_CONST VectorType max(VectorType a, VectorType b) { return max_epu16(a, b); }
0559 #endif
0560 #undef Vc_SUFFIX
0561 #define Vc_SUFFIX epi16
0562 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftLeft(VectorType a, int shift) {
0563 return Vc_CAT2(_mm_slli_, Vc_SUFFIX)(a, shift);
0564 }
0565 static Vc_ALWAYS_INLINE Vc_CONST VectorType shiftRight(VectorType a, int shift) {
0566 return Vc_CAT2(_mm_srli_, Vc_SUFFIX)(a, shift);
0567 }
0568
0569 static Vc_ALWAYS_INLINE void fma(VectorType &v1, VectorType v2, VectorType v3) { v1 = add(mul(v1, v2), v3); }
0570
0571 Vc_OPx(mul, mullo)
0572 #if defined(USE_INCORRECT_UNSIGNED_COMPARE) && !defined(Vc_IMPL_SSE4_1)
0573 Vc_OP(min) Vc_OP(max)
0574 #endif
0575 static Vc_ALWAYS_INLINE Vc_CONST EntryType min(VectorType a) {
0576
0577 a = min(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0578 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0579 a = min(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0580 return _mm_cvtsi128_si32(a);
0581 }
0582 static Vc_ALWAYS_INLINE Vc_CONST EntryType max(VectorType a) {
0583
0584 a = max(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0585 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0586 a = max(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0587 return _mm_cvtsi128_si32(a);
0588 }
0589 static Vc_ALWAYS_INLINE Vc_CONST EntryType mul(VectorType a) {
0590
0591 a = mul(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0592 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0593 a = mul(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0594 return _mm_cvtsi128_si32(a);
0595 }
0596 static Vc_ALWAYS_INLINE Vc_CONST EntryType add(VectorType a) {
0597
0598 a = add(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 0, 3, 2)));
0599 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 0, 3, 2)));
0600 a = add(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1, 1, 1, 1)));
0601 return _mm_cvtsi128_si32(a);
0602 }
0603 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a) { return Vc_CAT2(_mm_set1_, Vc_SUFFIX)(a); }
0604 static Vc_ALWAYS_INLINE Vc_CONST VectorType set(const EntryType a, const EntryType b, const EntryType c,
0605 const EntryType d, const EntryType e, const EntryType f,
0606 const EntryType g, const EntryType h) {
0607 return Vc_CAT2(_mm_set_, Vc_SUFFIX)(a, b, c, d, e, f, g, h);
0608 }
0609
0610 Vc_OP(add) Vc_OP(sub)
0611 #undef Vc_SUFFIX
0612 static Vc_ALWAYS_INLINE Vc_CONST VectorType round(VectorType a) { return a; }
0613 };
0614 #undef Vc_OP1
0615 #undef Vc_OP
0616 #undef Vc_OP_
0617 #undef Vc_OPx
0618 #undef Vc_OP_CAST_
0619 #undef Vc_MINMAX
0620
0621 }
0622 }
0623
0624 #include "vectorhelper.tcc"
0625
0626 #endif