EIC code displayed by LXR

File indexing completed on 2025-01-31 10:25:44

0001 /*  This file is part of the Vc library. {{{
0002 Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
0003 
0004 Redistribution and use in source and binary forms, with or without
0005 modification, are permitted provided that the following conditions are met:
0006     * Redistributions of source code must retain the above copyright
0007       notice, this list of conditions and the following disclaimer.
0008     * Redistributions in binary form must reproduce the above copyright
0009       notice, this list of conditions and the following disclaimer in the
0010       documentation and/or other materials provided with the distribution.
0011     * Neither the names of contributing organizations nor the
0012       names of its contributors may be used to endorse or promote products
0013       derived from this software without specific prior written permission.
0014 
0015 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
0016 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
0017 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0018 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
0019 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
0020 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
0021 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0022 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
0023 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
0024 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0025 
0026 }}}*/
0027 
0028 #ifndef VC_SSE_INTRINSICS_H_
0029 #define VC_SSE_INTRINSICS_H_
0030 
0031 #ifdef Vc_MSVC
0032 #include <intrin.h>
0033 #else
0034 #include <x86intrin.h>
0035 #endif
0036 
0037 #include "../common/storage.h"
0038 #include "const_data.h"
0039 #include <cstdlib>
0040 #include "types.h"
0041 #include "debug.h"
0042 
0043 #if defined(Vc_GCC) && !defined(__OPTIMIZE__)
0044 // GCC uses lots of old-style casts in macros that masquerade as intrinsics
0045 #pragma GCC diagnostic push
0046 #pragma GCC diagnostic ignored "-Wold-style-cast"
0047 #endif
0048 
0049 namespace Vc_VERSIONED_NAMESPACE
0050 {
0051 namespace SseIntrinsics
0052 {
0053     using SSE::c_general;
0054 
0055     constexpr std::size_t VectorAlignment = 16;
0056 
0057 #if defined(Vc_GCC) && Vc_GCC < 0x40600 && !defined(Vc_DONT_FIX_SSE_SHIFT)
0058     static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi16(__m128i a, __m128i count) { __asm__("psllw %1,%0" : "+x"(a) : "x"(count)); return a; }
0059     static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi32(__m128i a, __m128i count) { __asm__("pslld %1,%0" : "+x"(a) : "x"(count)); return a; }
0060     static Vc_INTRINSIC Vc_CONST __m128i _mm_sll_epi64(__m128i a, __m128i count) { __asm__("psllq %1,%0" : "+x"(a) : "x"(count)); return a; }
0061     static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi16(__m128i a, __m128i count) { __asm__("psrlw %1,%0" : "+x"(a) : "x"(count)); return a; }
0062     static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi32(__m128i a, __m128i count) { __asm__("psrld %1,%0" : "+x"(a) : "x"(count)); return a; }
0063     static Vc_INTRINSIC Vc_CONST __m128i _mm_srl_epi64(__m128i a, __m128i count) { __asm__("psrlq %1,%0" : "+x"(a) : "x"(count)); return a; }
0064 #endif
0065 
0066 #ifdef Vc_GCC
0067     // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin
0068     // functions. This way the fp-contraction optimization step kicks in and creates FMAs! :)
0069     static Vc_INTRINSIC Vc_CONST __m128d _mm_mul_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) * static_cast<__v2df>(b)); }
0070     static Vc_INTRINSIC Vc_CONST __m128d _mm_add_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) + static_cast<__v2df>(b)); }
0071     static Vc_INTRINSIC Vc_CONST __m128d _mm_sub_pd(__m128d a, __m128d b) { return static_cast<__m128d>(static_cast<__v2df>(a) - static_cast<__v2df>(b)); }
0072     static Vc_INTRINSIC Vc_CONST __m128  _mm_mul_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) * static_cast<__v4sf>(b)); }
0073     static Vc_INTRINSIC Vc_CONST __m128  _mm_add_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) + static_cast<__v4sf>(b)); }
0074     static Vc_INTRINSIC Vc_CONST __m128  _mm_sub_ps(__m128  a, __m128  b) { return static_cast<__m128 >(static_cast<__v4sf>(a) - static_cast<__v4sf>(b)); }
0075 #endif
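    // [Illustration] With the operator-based definitions above the compiler sees a
    // plain multiply feeding a plain add, so GCC's fp-contraction pass (typically
    // enabled by default via -ffp-contract=fast) may fuse the pair into a single
    // FMA on targets that support it (e.g. built with -mfma). A hypothetical helper
    // like this would then often compile to one vfmadd instead of mulpd + addpd:
    //
    //     static Vc_INTRINSIC Vc_CONST __m128d madd_pd(__m128d a, __m128d b, __m128d c)
    //     {
    //         return _mm_add_pd(_mm_mul_pd(a, b), c);  // candidate for contraction
    //     }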
0076 
0077     static Vc_INTRINSIC Vc_CONST __m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
0078     static Vc_INTRINSIC Vc_CONST __m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
0079     static Vc_INTRINSIC Vc_CONST __m128  _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
0080 
0081     static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi16()  { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one16)); }
0082     static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu16()  { return _mm_setone_epi16(); }
0083     static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epi32()  { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::one32)); }
0084     static Vc_INTRINSIC __m128i Vc_CONST _mm_setone_epu32()  { return _mm_setone_epi32(); }
0085 
0086     static Vc_INTRINSIC __m128  Vc_CONST _mm_setone_ps()     { return _mm_load_ps(c_general::oneFloat); }
0087     static Vc_INTRINSIC __m128d Vc_CONST _mm_setone_pd()     { return _mm_load_pd(c_general::oneDouble); }
0088 
0089     static Vc_INTRINSIC __m128d Vc_CONST _mm_setabsmask_pd() { return _mm_load_pd(reinterpret_cast<const double *>(c_general::absMaskDouble)); }
0090     static Vc_INTRINSIC __m128  Vc_CONST _mm_setabsmask_ps() { return _mm_load_ps(reinterpret_cast<const float *>(c_general::absMaskFloat)); }
0091     static Vc_INTRINSIC __m128d Vc_CONST _mm_setsignmask_pd(){ return _mm_load_pd(reinterpret_cast<const double *>(c_general::signMaskDouble)); }
0092     static Vc_INTRINSIC __m128  Vc_CONST _mm_setsignmask_ps(){ return _mm_load_ps(reinterpret_cast<const float *>(c_general::signMaskFloat)); }
0093 
0094     static Vc_INTRINSIC __m128i Vc_CONST setmin_epi8 () { return _mm_set1_epi8(-0x80); }
0095     static Vc_INTRINSIC __m128i Vc_CONST setmin_epi16() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::minShort)); }
0096     static Vc_INTRINSIC __m128i Vc_CONST setmin_epi32() { return _mm_load_si128(reinterpret_cast<const __m128i *>(c_general::signMaskFloat)); }
0097 
0098 #if defined(Vc_IMPL_XOP)
0099     static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b) { return _mm_comgt_epu8(a, b); }
0100     static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b) { return _mm_comlt_epu16(a, b); }
0101     static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b) { return _mm_comgt_epu16(a, b); }
0102     static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b) { return _mm_comlt_epu32(a, b); }
0103     static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b) { return _mm_comgt_epu32(a, b); }
0104     static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu64(__m128i a, __m128i b) { return _mm_comlt_epu64(a, b); }
0105 #else
0106     static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu8(__m128i a, __m128i b)
0107     {
0108         return _mm_cmpgt_epi8(_mm_xor_si128(a, setmin_epi8()),
0109                               _mm_xor_si128(b, setmin_epi8()));
0110     }
0111     static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu16(__m128i a, __m128i b)
0112     {
0113         return _mm_cmplt_epi16(_mm_xor_si128(a, setmin_epi16()),
0114                                _mm_xor_si128(b, setmin_epi16()));
0115     }
0116     static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu16(__m128i a, __m128i b)
0117     {
0118         return _mm_cmpgt_epi16(_mm_xor_si128(a, setmin_epi16()),
0119                                _mm_xor_si128(b, setmin_epi16()));
0120     }
0121     static Vc_INTRINSIC __m128i Vc_CONST cmplt_epu32(__m128i a, __m128i b)
0122     {
0123         return _mm_cmplt_epi32(_mm_xor_si128(a, setmin_epi32()),
0124                                _mm_xor_si128(b, setmin_epi32()));
0125     }
0126     static Vc_INTRINSIC __m128i Vc_CONST cmpgt_epu32(__m128i a, __m128i b)
0127     {
0128         return _mm_cmpgt_epi32(_mm_xor_si128(a, setmin_epi32()),
0129                                _mm_xor_si128(b, setmin_epi32()));
0130     }
0131     Vc_INTRINSIC __m128i Vc_CONST cmpgt_epi64(__m128i a, __m128i b)
0132     {
0133 #ifdef Vc_IMPL_SSE4_2
0134         return _mm_cmpgt_epi64(a, b);
0135 #else
0136         const auto aa = _mm_xor_si128(a, _mm_srli_epi64(setmin_epi32(), 32));
0137         const auto bb = _mm_xor_si128(b, _mm_srli_epi64(setmin_epi32(), 32));
0138         const auto gt = _mm_cmpgt_epi32(aa, bb);
0139         const auto eq = _mm_cmpeq_epi32(aa, bb);
0140         // Algorithm:
0141         // 1. if the high 32 bits of gt are true, make the full 64 bits true
0142         // 2. if the high 32 bits of gt are false and the high 32 bits of eq are true,
0143         //    duplicate the low 32 bits of gt to the high 32 bits (note that this requires
0144         //    unsigned compare on the lower 32 bits, which is the reason for the xors
0145         //    above)
0146         // 3. else make the full 64 bits false
0147 
0148         const auto gt2 =
0149             _mm_shuffle_epi32(gt, 0xf5);  // dup the high 32 bits to the low 32 bits
0150         const auto lo =
0151             _mm_shuffle_epi32(_mm_and_si128(_mm_srli_epi64(eq, 32), gt), 0xa0);
0152         return _mm_or_si128(gt2, lo);
0153 #endif
0154     }
0155 #endif
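    // [Illustration] The xor with setmin_epiN() above flips each lane's sign bit,
    // which maps the unsigned range onto the signed range while preserving order
    // (x ^ 0x8000 equals x - 0x8000 modulo 2^16), so the signed SSE compares can
    // serve for unsigned operands. A worked 16-bit lane (values are assumptions):
    //
    //     a = 0xFFFF (65535u), b = 0x0001 (1u)
    //     signed compare:    -1 > 1          -> false (wrong for unsigned)
    //     after xor 0x8000:  0x7FFF > 0x8001 -> 32767 > -32767 -> true (correct)
    //
    // The SSE2 fallback of cmpgt_epi64 applies the same flip to the low 32-bit
    // halves only, because only their comparison needs to be unsigned.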
0156 }  // namespace SseIntrinsics
0157 }  // namespace Vc
0158 
0159 // SSSE3
0160 #ifdef Vc_IMPL_SSSE3
0161 namespace Vc_VERSIONED_NAMESPACE
0162 {
0163 namespace SseIntrinsics
0164 {
0165     // not overriding _mm_set1_epi8 because this one should only be used for non-constants
0166     Vc_INTRINSIC Vc_CONST __m128i abs_epi8(__m128i a) { return _mm_abs_epi8(a); }
0167     Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) { return _mm_abs_epi16(a); }
0168     Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) { return _mm_abs_epi32(a); }
0169     template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
0170     {
0171         return _mm_alignr_epi8(a, b, s & 0x1fu);
0172     }
0173 }  // namespace SseIntrinsics
0174 }  // namespace Vc
0175 
0176 #else
0177 
0178 namespace Vc_VERSIONED_NAMESPACE
0179 {
0180 namespace SseIntrinsics
0181 {
0182     Vc_INTRINSIC Vc_CONST __m128i abs_epi8 (__m128i a) {
0183         __m128i negative = _mm_cmplt_epi8 (a, _mm_setzero_si128());
0184         return _mm_add_epi8 (_mm_xor_si128(a, negative), _mm_and_si128(negative,  _mm_set1_epi8(1)));
0185     }
0186     // positive value:
0187     //   negative == 0
0188     //   a unchanged after xor
0189     //   0 >> 31 -> 0
0190     //   a + 0 -> a
0191     // negative value:
0192     //   negative == -1
0193     //   a xor -1 -> -a - 1
0194     //   -1 >> 31 -> 1
0195     //   -a - 1 + 1 -> -a
0196     Vc_INTRINSIC Vc_CONST __m128i abs_epi16(__m128i a) {
0197         __m128i negative = _mm_cmplt_epi16(a, _mm_setzero_si128());
0198         return _mm_add_epi16(_mm_xor_si128(a, negative), _mm_srli_epi16(negative, 15));
0199     }
0200     Vc_INTRINSIC Vc_CONST __m128i abs_epi32(__m128i a) {
0201         __m128i negative = _mm_cmplt_epi32(a, _mm_setzero_si128());
0202         return _mm_add_epi32(_mm_xor_si128(a, negative), _mm_srli_epi32(negative, 31));
0203     }
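    // [Illustration] The xor/add trick worked through for one lane with a = -5
    // (value is an assumption): negative = -1, a ^ negative = 4 (i.e. -a - 1), and
    // adding back 1 yields 5. For a >= 0, negative = 0 and both steps are no-ops.
    // The 8-bit variant recovers the 1 with `negative & 1` since SSE has no 8-bit
    // shift; the 16/32-bit variants use a logical right shift instead.
    //
    //     int8_t a = -5;
    //     int8_t negative = (a < 0) ? -1 : 0;          // per-lane result of _mm_cmplt_epi8
    //     int8_t r = (a ^ negative) + (negative & 1);  // r == 5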
0204     template <int s> Vc_INTRINSIC Vc_CONST __m128i alignr_epi8(__m128i a, __m128i b)
0205     {
0206         switch (s & 0x1fu) {
0207             case  0: return b;
0208             case  1: return _mm_or_si128(_mm_slli_si128(a, 15), _mm_srli_si128(b,  1));
0209             case  2: return _mm_or_si128(_mm_slli_si128(a, 14), _mm_srli_si128(b,  2));
0210             case  3: return _mm_or_si128(_mm_slli_si128(a, 13), _mm_srli_si128(b,  3));
0211             case  4: return _mm_or_si128(_mm_slli_si128(a, 12), _mm_srli_si128(b,  4));
0212             case  5: return _mm_or_si128(_mm_slli_si128(a, 11), _mm_srli_si128(b,  5));
0213             case  6: return _mm_or_si128(_mm_slli_si128(a, 10), _mm_srli_si128(b,  6));
0214             case  7: return _mm_or_si128(_mm_slli_si128(a,  9), _mm_srli_si128(b,  7));
0215             case  8: return _mm_or_si128(_mm_slli_si128(a,  8), _mm_srli_si128(b,  8));
0216             case  9: return _mm_or_si128(_mm_slli_si128(a,  7), _mm_srli_si128(b,  9));
0217             case 10: return _mm_or_si128(_mm_slli_si128(a,  6), _mm_srli_si128(b, 10));
0218             case 11: return _mm_or_si128(_mm_slli_si128(a,  5), _mm_srli_si128(b, 11));
0219             case 12: return _mm_or_si128(_mm_slli_si128(a,  4), _mm_srli_si128(b, 12));
0220             case 13: return _mm_or_si128(_mm_slli_si128(a,  3), _mm_srli_si128(b, 13));
0221             case 14: return _mm_or_si128(_mm_slli_si128(a,  2), _mm_srli_si128(b, 14));
0222             case 15: return _mm_or_si128(_mm_slli_si128(a,  1), _mm_srli_si128(b, 15));
0223             case 16: return a;
0224             case 17: return _mm_srli_si128(a,  1);
0225             case 18: return _mm_srli_si128(a,  2);
0226             case 19: return _mm_srli_si128(a,  3);
0227             case 20: return _mm_srli_si128(a,  4);
0228             case 21: return _mm_srli_si128(a,  5);
0229             case 22: return _mm_srli_si128(a,  6);
0230             case 23: return _mm_srli_si128(a,  7);
0231             case 24: return _mm_srli_si128(a,  8);
0232             case 25: return _mm_srli_si128(a,  9);
0233             case 26: return _mm_srli_si128(a, 10);
0234             case 27: return _mm_srli_si128(a, 11);
0235             case 28: return _mm_srli_si128(a, 12);
0236             case 29: return _mm_srli_si128(a, 13);
0237             case 30: return _mm_srli_si128(a, 14);
0238             case 31: return _mm_srli_si128(a, 15);
0239         }
0240         return _mm_setzero_si128();
0241     }
0242 }  // namespace SseIntrinsics
0243 }  // namespace Vc
0244 #endif
0245 
0246 // SSE4.1
0247 #ifdef Vc_IMPL_SSE4_1
0248 namespace Vc_VERSIONED_NAMESPACE
0249 {
0250 namespace SseIntrinsics
0251 {
0252 Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b)
0253 {
0254     return _mm_cmpeq_epi64(a, b);
0255 }
0256 template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
0257 {
0258     return _mm_extract_epi32(v, index);
0259 }
0260 Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c)
0261 {
0262     return _mm_blendv_pd(a, b, c);
0263 }
0264 Vc_INTRINSIC Vc_CONST __m128 blendv_ps(__m128 a, __m128 b, __m128 c)
0265 {
0266     return _mm_blendv_ps(a, b, c);
0267 }
0268 Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c)
0269 {
0270     return _mm_blendv_epi8(a, b, c);
0271 }
0272 template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
0273 {
0274     return _mm_blend_pd(a, b, mask);
0275 }
0276 template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
0277 {
0278     return _mm_blend_ps(a, b, mask);
0279 }
0280 template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
0281 {
0282     return _mm_blend_epi16(a, b, mask);
0283 }
0284 Vc_INTRINSIC Vc_CONST __m128i max_epi8(__m128i a, __m128i b)
0285 {
0286     return _mm_max_epi8(a, b);
0287 }
0288 Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b)
0289 {
0290     return _mm_max_epi32(a, b);
0291 }
0292 Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b)
0293 {
0294     return _mm_max_epu16(a, b);
0295 }
0296 Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b)
0297 {
0298     return _mm_max_epu32(a, b);
0299 }
0300 Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b)
0301 {
0302     return _mm_min_epu16(a, b);
0303 }
0304 Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b)
0305 {
0306     return _mm_min_epu32(a, b);
0307 }
0308 Vc_INTRINSIC Vc_CONST __m128i min_epi8(__m128i a, __m128i b)
0309 {
0310     return _mm_min_epi8(a, b);
0311 }
0312 Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b)
0313 {
0314     return _mm_min_epi32(a, b);
0315 }
0316 Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8)
0317 {
0318     return _mm_cvtepu8_epi16(epu8);
0319 }
0320 Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8)
0321 {
0322     return _mm_cvtepi8_epi16(epi8);
0323 }
0324 Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16)
0325 {
0326     return _mm_cvtepu16_epi32(epu16);
0327 }
0328 Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epu16)
0329 {
0330     return _mm_cvtepi16_epi32(epu16);
0331 }
0332 Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8)
0333 {
0334     return _mm_cvtepu8_epi32(epu8);
0335 }
0336 Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8)
0337 {
0338     return _mm_cvtepi8_epi32(epi8);
0339 }
0340 }  // namespace SseIntrinsics
0341 }  // namespace Vc
0342 #else
0343 
0344 namespace Vc_VERSIONED_NAMESPACE
0345 {
0346 namespace SseIntrinsics
0347 {
0348     Vc_INTRINSIC Vc_CONST __m128i cmpeq_epi64(__m128i a, __m128i b) {
0349         auto tmp = _mm_cmpeq_epi32(a, b);
0350         return _mm_and_si128(tmp, _mm_shuffle_epi32(tmp, 1*1 + 0*4 + 3*16 + 2*64));
0351     }
0352     template <int index> Vc_INTRINSIC Vc_CONST int extract_epi32(__m128i v)
0353     {
0354 #ifdef Vc_USE_BUILTIN_VECTOR_TYPES
0355         typedef int int32v4 __attribute__((__vector_size__(16)));
0356         return aliasing_cast<int32v4>(v)[index];
0357 #else
0358         return _mm_cvtsi128_si32(_mm_srli_si128(v, index * 4));
0359 #endif
0360     }
0361     Vc_INTRINSIC Vc_CONST __m128d blendv_pd(__m128d a, __m128d b, __m128d c) {
0362 #ifdef Vc_GCC
0363         return reinterpret_cast<__m128d>(
0364             (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
0365             (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
0366 #else
0367         return _mm_or_pd(_mm_andnot_pd(c, a), _mm_and_pd(c, b));
0368 #endif
0369     }
0370     Vc_INTRINSIC Vc_CONST __m128  blendv_ps(__m128  a, __m128  b, __m128  c) {
0371 #ifdef Vc_GCC
0372         return reinterpret_cast<__m128>(
0373             (~reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(a)) |
0374             (reinterpret_cast<__m128i>(c) & reinterpret_cast<__m128i>(b)));
0375 #else
0376         return _mm_or_ps(_mm_andnot_ps(c, a), _mm_and_ps(c, b));
0377 #endif
0378     }
0379     Vc_INTRINSIC Vc_CONST __m128i blendv_epi8(__m128i a, __m128i b, __m128i c) {
0380 #ifdef Vc_GCC
0381         return (~c & a) | (c & b);
0382 #else
0383         return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
0384 #endif
0385     }
0386 
0387     // Only use the following blend functions with an immediate as the mask and, of course,
0388     // compile with optimization enabled.
0389     template <int mask> Vc_INTRINSIC Vc_CONST __m128d blend_pd(__m128d a, __m128d b)
0390     {
0391         switch (mask) {
0392         case 0x0:
0393             return a;
0394         case 0x1:
0395             return _mm_shuffle_pd(b, a, 2);
0396         case 0x2:
0397             return _mm_shuffle_pd(a, b, 2);
0398         case 0x3:
0399             return b;
0400         default:
0401             abort();
0402             return a; // should never be reached, but MSVC needs it else it warns about 'not all control paths return a value'
0403         }
0404     }
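    // [Illustration] Usage sketch for the immediate-mask blends: bit i of the mask
    // selects lane i from b, matching the SSE4.1 _mm_blend_pd semantics emulated
    // here. With hypothetical inputs a = {1.0, 2.0} and b = {10.0, 20.0} (lane 0
    // listed first):
    //
    //     blend_pd<0x0>(a, b);  // {  1.0,  2.0 }  == a
    //     blend_pd<0x2>(a, b);  // {  1.0, 20.0 }  lane 1 taken from b
    //     blend_pd<0x3>(a, b);  // { 10.0, 20.0 }  == b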
0405     template <int mask> Vc_INTRINSIC Vc_CONST __m128 blend_ps(__m128 a, __m128 b)
0406     {
0407         __m128i c;
0408         switch (mask) {
0409         case 0x0:
0410             return a;
0411         case 0x1:
0412             c = _mm_srli_si128(_mm_setallone_si128(), 12);
0413             break;
0414         case 0x2:
0415             c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 4);
0416             break;
0417         case 0x3:
0418             c = _mm_srli_si128(_mm_setallone_si128(), 8);
0419             break;
0420         case 0x4:
0421             c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 12), 8);
0422             break;
0423         case 0x5:
0424             c = _mm_set_epi32(0, -1, 0, -1);
0425             break;
0426         case 0x6:
0427             c = _mm_slli_si128(_mm_srli_si128(_mm_setallone_si128(), 8), 4);
0428             break;
0429         case 0x7:
0430             c = _mm_srli_si128(_mm_setallone_si128(), 4);
0431             break;
0432         case 0x8:
0433             c = _mm_slli_si128(_mm_setallone_si128(), 12);
0434             break;
0435         case 0x9:
0436             c = _mm_set_epi32(-1, 0, 0, -1);
0437             break;
0438         case 0xa:
0439             c = _mm_set_epi32(-1, 0, -1, 0);
0440             break;
0441         case 0xb:
0442             c = _mm_set_epi32(-1, 0, -1, -1);
0443             break;
0444         case 0xc:
0445             c = _mm_slli_si128(_mm_setallone_si128(), 8);
0446             break;
0447         case 0xd:
0448             c = _mm_set_epi32(-1, -1, 0, -1);
0449             break;
0450         case 0xe:
0451             c = _mm_slli_si128(_mm_setallone_si128(), 4);
0452             break;
0453         case 0xf:
0454             return b;
0455         default: // may not happen
0456             abort();
0457             c = _mm_setzero_si128();
0458             break;
0459         }
0460         __m128 _c = _mm_castsi128_ps(c);
0461         return _mm_or_ps(_mm_andnot_ps(_c, a), _mm_and_ps(_c, b));
0462     }
0463     template <int mask> Vc_INTRINSIC Vc_CONST __m128i blend_epi16(__m128i a, __m128i b)
0464     {
0465         __m128i c;
0466         switch (mask) {
0467         case 0x00:
0468             return a;
0469         case 0x01:
0470             c = _mm_srli_si128(_mm_setallone_si128(), 14);
0471             break;
0472         case 0x03:
0473             c = _mm_srli_si128(_mm_setallone_si128(), 12);
0474             break;
0475         case 0x07:
0476             c = _mm_srli_si128(_mm_setallone_si128(), 10);
0477             break;
0478         case 0x0f:
0479             return _mm_unpackhi_epi64(_mm_slli_si128(b, 8), a);
0480         case 0x1f:
0481             c = _mm_srli_si128(_mm_setallone_si128(), 6);
0482             break;
0483         case 0x3f:
0484             c = _mm_srli_si128(_mm_setallone_si128(), 4);
0485             break;
0486         case 0x7f:
0487             c = _mm_srli_si128(_mm_setallone_si128(), 2);
0488             break;
0489         case 0x80:
0490             c = _mm_slli_si128(_mm_setallone_si128(), 14);
0491             break;
0492         case 0xc0:
0493             c = _mm_slli_si128(_mm_setallone_si128(), 12);
0494             break;
0495         case 0xe0:
0496             c = _mm_slli_si128(_mm_setallone_si128(), 10);
0497             break;
0498         case 0xf0:
0499             c = _mm_slli_si128(_mm_setallone_si128(), 8);
0500             break;
0501         case 0xf8:
0502             c = _mm_slli_si128(_mm_setallone_si128(), 6);
0503             break;
0504         case 0xfc:
0505             c = _mm_slli_si128(_mm_setallone_si128(), 4);
0506             break;
0507         case 0xfe:
0508             c = _mm_slli_si128(_mm_setallone_si128(), 2);
0509             break;
0510         case 0xff:
0511             return b;
0512         case 0xcc:
0513             return _mm_unpacklo_epi32(_mm_shuffle_epi32(a, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 1, 3, 1)));
0514         case 0x33:
0515             return _mm_unpacklo_epi32(_mm_shuffle_epi32(b, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 1, 3, 1)));
0516         default:
0517             const __m128i shift = _mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, -0x7fff);
0518             c = _mm_srai_epi16(_mm_mullo_epi16(_mm_set1_epi16(mask), shift), 15);
0519             break;
0520         }
0521         return _mm_or_si128(_mm_andnot_si128(c, a), _mm_and_si128(c, b));
0522     }
0523 
0524     Vc_INTRINSIC Vc_CONST __m128i max_epi8 (__m128i a, __m128i b) {
0525         return blendv_epi8(b, a, _mm_cmpgt_epi8 (a, b));
0526     }
0527     Vc_INTRINSIC Vc_CONST __m128i max_epi32(__m128i a, __m128i b) {
0528         return blendv_epi8(b, a, _mm_cmpgt_epi32(a, b));
0529     }
0530     Vc_INTRINSIC Vc_CONST __m128i max_epu16(__m128i a, __m128i b) {
0531         return blendv_epi8(b, a, cmpgt_epu16(a, b));
0532     }
0533     Vc_INTRINSIC Vc_CONST __m128i max_epu32(__m128i a, __m128i b) {
0534         return blendv_epi8(b, a, cmpgt_epu32(a, b));
0535     }
0536     Vc_INTRINSIC Vc_CONST __m128i min_epu16(__m128i a, __m128i b) {
0537         return blendv_epi8(a, b, cmpgt_epu16(a, b));
0538     }
0539     Vc_INTRINSIC Vc_CONST __m128i min_epu32(__m128i a, __m128i b) {
0540         return blendv_epi8(a, b, cmpgt_epu32(a, b));
0541     }
0542     Vc_INTRINSIC Vc_CONST __m128i min_epi8 (__m128i a, __m128i b) {
0543         return blendv_epi8(a, b, _mm_cmpgt_epi8 (a, b));
0544     }
0545     Vc_INTRINSIC Vc_CONST __m128i min_epi32(__m128i a, __m128i b) {
0546         return blendv_epi8(a, b, _mm_cmpgt_epi32(a, b));
0547     }
0548     Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi16(__m128i epu8) {
0549         return _mm_unpacklo_epi8(epu8, _mm_setzero_si128());
0550     }
0551     Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi16(__m128i epi8) {
0552         return _mm_unpacklo_epi8(epi8, _mm_cmplt_epi8(epi8, _mm_setzero_si128()));
0553     }
0554     Vc_INTRINSIC Vc_CONST __m128i cvtepu16_epi32(__m128i epu16) {
0555         return _mm_unpacklo_epi16(epu16, _mm_setzero_si128());
0556     }
0557     Vc_INTRINSIC Vc_CONST __m128i cvtepi16_epi32(__m128i epu16) {
0558         return _mm_unpacklo_epi16(epu16, _mm_cmplt_epi16(epu16, _mm_setzero_si128()));
0559     }
0560     Vc_INTRINSIC Vc_CONST __m128i cvtepu8_epi32(__m128i epu8) {
0561         return cvtepu16_epi32(cvtepu8_epi16(epu8));
0562     }
0563     Vc_INTRINSIC Vc_CONST __m128i cvtepi8_epi32(__m128i epi8) {
0564         const __m128i neg = _mm_cmplt_epi8(epi8, _mm_setzero_si128());
0565         const __m128i epi16 = _mm_unpacklo_epi8(epi8, neg);
0566         return _mm_unpacklo_epi16(epi16, _mm_unpacklo_epi8(neg, neg));
0567     }
0568 }  // namespace SseIntrinsics
0569 }  // namespace Vc
0570 #endif
0571 
0572 // Streaming loads (SSE4.1) and gathers (AVX2)
0573 namespace Vc_VERSIONED_NAMESPACE
0574 {
0575 namespace SseIntrinsics
0576 {
0577     static Vc_INTRINSIC Vc_PURE __m128  _mm_stream_load(const float *mem) {
0578 #ifdef Vc_IMPL_SSE4_1
0579         return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
0580 #else
0581         return _mm_load_ps(mem);
0582 #endif
0583     }
0584     static Vc_INTRINSIC Vc_PURE __m128d _mm_stream_load(const double *mem) {
0585 #ifdef Vc_IMPL_SSE4_1
0586         return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
0587 #else
0588         return _mm_load_pd(mem);
0589 #endif
0590     }
0591     static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const int *mem) {
0592 #ifdef Vc_IMPL_SSE4_1
0593         return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<int *>(mem)));
0594 #else
0595         return _mm_load_si128(reinterpret_cast<const __m128i *>(mem));
0596 #endif
0597     }
0598     static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned int *mem) {
0599         return _mm_stream_load(reinterpret_cast<const int *>(mem));
0600     }
0601     static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const short *mem) {
0602         return _mm_stream_load(reinterpret_cast<const int *>(mem));
0603     }
0604     static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned short *mem) {
0605         return _mm_stream_load(reinterpret_cast<const int *>(mem));
0606     }
0607     static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const signed char *mem) {
0608         return _mm_stream_load(reinterpret_cast<const int *>(mem));
0609     }
0610     static Vc_INTRINSIC Vc_PURE __m128i _mm_stream_load(const unsigned char *mem) {
0611         return _mm_stream_load(reinterpret_cast<const int *>(mem));
0612     }
0613 
0614 #ifndef __x86_64__
0615     Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
0616         return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
0617     }
0618 #endif
0619 
0620 #ifdef Vc_IMPL_AVX2
0621 template <int Scale> __m128 gather(const float *addr, __m128i idx)
0622 {
0623     return _mm_i32gather_ps(addr, idx, Scale);
0624 }
0625 template <int Scale> __m128d gather(const double *addr, __m128i idx)
0626 {
0627     return _mm_i32gather_pd(addr, idx, Scale);
0628 }
0629 template <int Scale> __m128i gather(const int *addr, __m128i idx)
0630 {
0631     return _mm_i32gather_epi32(addr, idx, Scale);
0632 }
0633 template <int Scale> __m128i gather(const unsigned *addr, __m128i idx)
0634 {
0635     return _mm_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
0636 }
0637 
0638 template <int Scale> __m128 gather(__m128 src, __m128 k, const float *addr, __m128i idx)
0639 {
0640     return _mm_mask_i32gather_ps(src, addr, idx, k, Scale);
0641 }
0642 template <int Scale>
0643 __m128d gather(__m128d src, __m128d k, const double *addr, __m128i idx)
0644 {
0645     return _mm_mask_i32gather_pd(src, addr, idx, k, Scale);
0646 }
0647 template <int Scale> __m128i gather(__m128i src, __m128i k, const int *addr, __m128i idx)
0648 {
0649     return _mm_mask_i32gather_epi32(src, addr, idx, k, Scale);
0650 }
0651 template <int Scale>
0652 __m128i gather(__m128i src, __m128i k, const unsigned *addr, __m128i idx)
0653 {
0654     return _mm_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
0655 }
0656 #endif
0657 
0658 }  // namespace SseIntrinsics
0659 }  // namespace Vc
0660 
0661 namespace Vc_VERSIONED_NAMESPACE
0662 {
0663 namespace SSE
0664 {
0665 using namespace SseIntrinsics;
0666 
0667 template <typename T> struct ParameterHelper
0668 {
0669     typedef T ByValue;
0670     typedef T &Reference;
0671     typedef const T &ConstRef;
0672 };
0673 
0674 template <typename T> struct VectorHelper
0675 {
0676 };
0677 
0678 template <typename T> struct VectorTypeHelper
0679 {
0680     typedef __m128i Type;
0681 };
0682 template <> struct VectorTypeHelper<double>
0683 {
0684     typedef __m128d Type;
0685 };
0686 template <> struct VectorTypeHelper<float>
0687 {
0688     typedef __m128 Type;
0689 };
0690 
0691 template <typename T> struct DetermineGatherMask
0692 {
0693     typedef T Type;
0694 };
0695 
0696 template <typename T> struct VectorTraits
0697 {
0698     typedef typename VectorTypeHelper<T>::Type VectorType;
0699     using EntryType = T;
0700     static constexpr size_t Size = sizeof(VectorType) / sizeof(EntryType);
0701     typedef Mask<T> MaskType;
0702     typedef typename DetermineGatherMask<MaskType>::Type GatherMaskType;
0703     typedef Common::VectorMemoryUnion<VectorType, EntryType> StorageType;
0704 };
0705 
0706 template <typename T> struct VectorHelperSize;
0707 }  // namespace SSE
0708 }  // namespace Vc
0709 
0710 #if defined(Vc_GCC) && !defined(__OPTIMIZE__)
0711 #pragma GCC diagnostic pop
0712 #endif
0713 
0714 #include "shuffle.h"
0715 
0716 #endif // VC_SSE_INTRINSICS_H_