/*  This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_INTRINSICS_H_
#define VC_AVX_INTRINSICS_H_

#include "../global.h"
#include "../traits/type_traits.h"

// see comment in sse/intrinsics.h
extern "C" {
// AVX
#include <immintrin.h>

#if (defined(Vc_IMPL_XOP) || defined(Vc_IMPL_FMA4)) && !defined(Vc_MSVC)
#include <x86intrin.h>
#endif
}

#include "../common/fix_clang_emmintrin.h"

#include "const_data.h"
#include "../common/types.h"
#include "macros.h"
#include <cstdlib>

#if (defined Vc_CLANG && Vc_CLANG >= 0x30900 && Vc_CLANG < 0x70000)
#ifdef _mm256_permute2f128_si256
#undef _mm256_permute2f128_si256
#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
                                           (__v8si)(__m256i)(V2), (char)(M)); })
#endif

#ifdef _mm256_permute2f128_ps
#undef _mm256_permute2f128_ps
#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \
  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
                                          (__v8sf)(__m256)(V2), (char)(M)); })
#endif

#ifdef _mm256_permute2x128_si256
#undef _mm256_permute2x128_si256
#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
  (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (char)(M)); })
#endif
#endif

namespace Vc_VERSIONED_NAMESPACE
{
namespace AvxIntrinsics
{
    using AVX::c_general;
    using AVX::_IndexesFromZero32;
    using AVX::_IndexesFromZero16;
    using AVX::_IndexesFromZero8;

    typedef __m128  m128 ;
    typedef __m128d m128d;
    typedef __m128i m128i;
    typedef __m256  m256 ;
    typedef __m256d m256d;
    typedef __m256i m256i;

#ifdef Vc_GCC
    // Redefine the mul/add/sub intrinsics to use GCC-specific operators instead of builtin
    // functions. This way the fp-contraction optimization step kicks in and creates FMAs! :)
    static Vc_INTRINSIC Vc_CONST m256d _mm256_mul_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) * static_cast<__v4df>(b)); }
    static Vc_INTRINSIC Vc_CONST m256d _mm256_add_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) + static_cast<__v4df>(b)); }
    static Vc_INTRINSIC Vc_CONST m256d _mm256_sub_pd(m256d a, m256d b) { return static_cast<m256d>(static_cast<__v4df>(a) - static_cast<__v4df>(b)); }
    static Vc_INTRINSIC Vc_CONST m256 _mm256_mul_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) * static_cast<__v8sf>(b)); }
    static Vc_INTRINSIC Vc_CONST m256 _mm256_add_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) + static_cast<__v8sf>(b)); }
    static Vc_INTRINSIC Vc_CONST m256 _mm256_sub_ps(m256 a, m256 b) { return static_cast<m256>(static_cast<__v8sf>(a) - static_cast<__v8sf>(b)); }
#endif
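
    // Illustrative sketch (not part of the Vc API): with the operator-based definitions
    // above, GCC's fp-contraction optimization can fuse the separate multiply and add
    // into a single FMA instruction (e.g. vfmadd231ps) when FMA is enabled at compile time.
    static Vc_INTRINSIC Vc_CONST m256 fused_multiply_add_example(m256 a, m256 b, m256 c)
    {
        return _mm256_add_ps(_mm256_mul_ps(a, b), c);  // a * b + c, contractible to an FMA
    }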

    static Vc_INTRINSIC m256d Vc_CONST set1_pd   (double a) { return _mm256_set1_pd   (a); }
    static Vc_INTRINSIC m256i Vc_CONST set1_epi32(int    a) { return _mm256_set1_epi32(a); }

    static Vc_INTRINSIC Vc_CONST m128i _mm_setallone_si128() { return _mm_load_si128(reinterpret_cast<const __m128i *>(Common::AllBitsSet)); }
    static Vc_INTRINSIC Vc_CONST m128  _mm_setallone_ps() { return _mm_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }
    static Vc_INTRINSIC Vc_CONST m128d _mm_setallone_pd() { return _mm_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }

    static Vc_INTRINSIC Vc_CONST m256i setallone_si256() { return _mm256_castps_si256(_mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet))); }
    static Vc_INTRINSIC Vc_CONST m256d setallone_pd() { return _mm256_load_pd(reinterpret_cast<const double *>(Common::AllBitsSet)); }
    static Vc_INTRINSIC Vc_CONST m256  setallone_ps() { return _mm256_load_ps(reinterpret_cast<const float *>(Common::AllBitsSet)); }

    static Vc_INTRINSIC m256i Vc_CONST setone_epi8 ()  { return _mm256_set1_epi8(1); }
    static Vc_INTRINSIC m256i Vc_CONST setone_epu8 ()  { return setone_epi8(); }
    static Vc_INTRINSIC m256i Vc_CONST setone_epi16()  { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::one16))); }
    static Vc_INTRINSIC m256i Vc_CONST setone_epu16()  { return setone_epi16(); }
    static Vc_INTRINSIC m256i Vc_CONST setone_epi32()  { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&_IndexesFromZero32[1]))); }
    static Vc_INTRINSIC m256i Vc_CONST setone_epu32()  { return setone_epi32(); }

    static Vc_INTRINSIC m256  Vc_CONST setone_ps()     { return _mm256_broadcast_ss(&c_general::oneFloat); }
    static Vc_INTRINSIC m256d Vc_CONST setone_pd()     { return _mm256_broadcast_sd(&c_general::oneDouble); }

    static Vc_INTRINSIC m256d Vc_CONST setabsmask_pd() { return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::absMaskFloat[0])); }
    static Vc_INTRINSIC m256  Vc_CONST setabsmask_ps() { return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::absMaskFloat[1])); }
    static Vc_INTRINSIC m256d Vc_CONST setsignmask_pd(){ return _mm256_broadcast_sd(reinterpret_cast<const double *>(&c_general::signMaskFloat[0])); }
    static Vc_INTRINSIC m256  Vc_CONST setsignmask_ps(){ return _mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1])); }
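    // Usage note (assuming the usual bit patterns from const_data.h, i.e. abs mask
    // 0x7fffffff and sign mask 0x80000000 per element): the abs masks clear the sign bit
    // and the sign masks isolate it, so e.g. _mm256_and_ps(x, setabsmask_ps()) yields |x|
    // and _mm256_xor_ps(x, setsignmask_ps()) yields -x.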

    static Vc_INTRINSIC m256  Vc_CONST set2power31_ps()    { return _mm256_broadcast_ss(&c_general::_2power31); }
    static Vc_INTRINSIC m128  Vc_CONST _mm_set2power31_ps()    { return _mm_broadcast_ss(&c_general::_2power31); }
    static Vc_INTRINSIC m256i Vc_CONST set2power31_epu32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
    static Vc_INTRINSIC m128i Vc_CONST _mm_set2power31_epu32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }

    static Vc_INTRINSIC m256i Vc_CONST setmin_epi8 () { return _mm256_set1_epi8(-0x80); }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi16() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
    static Vc_INTRINSIC m128i Vc_CONST _mm_setmin_epi32() { return _mm_castps_si128(_mm_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }
    static Vc_INTRINSIC m256i Vc_CONST setmin_epi16() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(c_general::minShort))); }
    static Vc_INTRINSIC m256i Vc_CONST setmin_epi32() { return _mm256_castps_si256(_mm256_broadcast_ss(reinterpret_cast<const float *>(&c_general::signMaskFloat[1]))); }

    template <int i>
    static Vc_INTRINSIC Vc_CONST unsigned int extract_epu32(__m128i x)
    {
        return _mm_extract_epi32(x, i);
    }

    template <int offset> Vc_INTRINSIC __m256  insert128(__m256  a, __m128  b) { return _mm256_insertf128_ps(a, b, offset); }
    template <int offset> Vc_INTRINSIC __m256d insert128(__m256d a, __m128d b) { return _mm256_insertf128_pd(a, b, offset); }
    template <int offset> Vc_INTRINSIC __m256i insert128(__m256i a, __m128i b) {
#ifdef Vc_IMPL_AVX2
        return _mm256_inserti128_si256(a, b, offset);
#else
        return _mm256_insertf128_si256(a, b, offset);
#endif
    }

    template <int offset> Vc_INTRINSIC __m128  extract128(__m256  a) { return _mm256_extractf128_ps(a, offset); }
    template <int offset> Vc_INTRINSIC __m128d extract128(__m256d a) { return _mm256_extractf128_pd(a, offset); }
    template <int offset> Vc_INTRINSIC __m128i extract128(__m256i a) {
#ifdef Vc_IMPL_AVX2
        return _mm256_extracti128_si256(a, offset);
#else
        return _mm256_extractf128_si256(a, offset);
#endif
    }
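
    // Illustrative sketch (not part of the Vc API): extract128/insert128 are the building
    // blocks used throughout this file to emulate 256-bit integer operations on AVX-only
    // hardware by processing the two 128-bit halves separately and recombining the results.
    static Vc_INTRINSIC __m256i split_and_recombine_example(__m256i a)
    {
        const __m128i lo = _mm256_castsi256_si128(a);  // low half; a cast, no instruction needed
        const __m128i hi = extract128<1>(a);           // high half via vextractf128/vextracti128
        return insert128<1>(_mm256_castsi128_si256(lo), hi);  // reassemble the 256-bit vector
    }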

    /////////////////////// COMPARE OPS ///////////////////////
#ifdef Vc_GCC
    // GCC needs builtin compare operators to enable constant folding
    Vc_INTRINSIC __m256d cmpeq_pd   (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a == b); }
    Vc_INTRINSIC __m256d cmpneq_pd  (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a != b); }
    Vc_INTRINSIC __m256d cmplt_pd   (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a < b); }
    Vc_INTRINSIC __m256d cmpge_pd   (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a >= b); }
    Vc_INTRINSIC __m256d cmple_pd   (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a <= b); }
    Vc_INTRINSIC __m256d cmpgt_pd   (__m256d a, __m256d b) { return reinterpret_cast<__m256d>(a > b); }

    Vc_INTRINSIC __m256  cmpeq_ps   (__m256  a, __m256  b) { return reinterpret_cast<__m256 >(a == b); }
    Vc_INTRINSIC __m256  cmpneq_ps  (__m256  a, __m256  b) { return reinterpret_cast<__m256 >(a != b); }
    Vc_INTRINSIC __m256  cmplt_ps   (__m256  a, __m256  b) { return reinterpret_cast<__m256 >(a < b); }
    Vc_INTRINSIC __m256  cmpge_ps   (__m256  a, __m256  b) { return reinterpret_cast<__m256 >(a >= b); }
    Vc_INTRINSIC __m256  cmple_ps   (__m256  a, __m256  b) { return reinterpret_cast<__m256 >(a <= b); }
    Vc_INTRINSIC __m256  cmpgt_ps   (__m256  a, __m256  b) { return reinterpret_cast<__m256 >(a > b); }
#else
    Vc_INTRINSIC __m256d cmpeq_pd   (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_EQ_OQ); }
    Vc_INTRINSIC __m256d cmpneq_pd  (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); }
    Vc_INTRINSIC __m256d cmplt_pd   (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LT_OS); }
    Vc_INTRINSIC __m256d cmpge_pd   (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLT_US); }
    Vc_INTRINSIC __m256d cmple_pd   (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_LE_OS); }
    Vc_INTRINSIC __m256d cmpgt_pd   (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_NLE_US); }

    Vc_INTRINSIC __m256  cmpeq_ps   (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_EQ_OQ); }
    Vc_INTRINSIC __m256  cmpneq_ps  (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ); }
    Vc_INTRINSIC __m256  cmplt_ps   (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_LT_OS); }
    Vc_INTRINSIC __m256  cmpge_ps   (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_NLT_US); }
    Vc_INTRINSIC __m256  cmple_ps   (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_LE_OS); }
    Vc_INTRINSIC __m256  cmpgt_ps   (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_NLE_US); }
#endif
    Vc_INTRINSIC __m256d cmpnlt_pd  (__m256d a, __m256d b) { return cmpge_pd(a, b); }
    Vc_INTRINSIC __m256d cmpnle_pd  (__m256d a, __m256d b) { return cmpgt_pd(a, b); }
    Vc_INTRINSIC __m256  cmpnlt_ps  (__m256  a, __m256  b) { return cmpge_ps(a, b); }
    Vc_INTRINSIC __m256  cmpnle_ps  (__m256  a, __m256  b) { return cmpgt_ps(a, b); }

    Vc_INTRINSIC __m256d cmpord_pd  (__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_ORD_Q); }
    Vc_INTRINSIC __m256d cmpunord_pd(__m256d a, __m256d b) { return _mm256_cmp_pd(a, b, _CMP_UNORD_Q); }
    Vc_INTRINSIC __m256  cmpord_ps  (__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_ORD_Q); }
    Vc_INTRINSIC __m256  cmpunord_ps(__m256  a, __m256  b) { return _mm256_cmp_ps(a, b, _CMP_UNORD_Q); }

#if defined(Vc_IMPL_XOP)
    static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
        return _mm_comlt_epu16(a, b);
    }
    static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
        return _mm_comgt_epu16(a, b);
    }
#else
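    // Without XOP there is no unsigned 16-bit compare: bias both operands by 0x8000
    // (flip the sign bit via xor with _mm_setmin_epi16) so that the signed compare
    // produces the unsigned ordering.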
    static Vc_INTRINSIC m128i cmplt_epu16(__m128i a, __m128i b) {
        return _mm_cmplt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
    }
    static Vc_INTRINSIC m128i cmpgt_epu16(__m128i a, __m128i b) {
        return _mm_cmpgt_epi16(_mm_xor_si128(a, _mm_setmin_epi16()), _mm_xor_si128(b, _mm_setmin_epi16()));
    }
#endif

#ifdef Vc_IMPL_AVX2
    template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
    {
        return _mm256_alignr_epi8(s1, s2, shift);
    }
#else
    template <int shift> Vc_INTRINSIC Vc_CONST m256i alignr(__m256i s1, __m256i s2)
    {
        return insert128<1>(
            _mm256_castsi128_si256(_mm_alignr_epi8(_mm256_castsi256_si128(s1),
                                                   _mm256_castsi256_si128(s2), shift)),
            _mm_alignr_epi8(extract128<1>(s1), extract128<1>(s2), shift));
    }
#endif
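    // Note: _mm256_alignr_epi8 concatenates and shifts within each 128-bit lane, so the
    // AVX fallback above, which applies _mm_alignr_epi8 to the two halves independently,
    // matches the AVX2 semantics.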

#ifdef Vc_IMPL_AVX2
#define Vc_AVX_TO_SSE_2_NEW(name)                                                        \
    Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0)                             \
    {                                                                                    \
        return _mm256_##name(a0, b0);                                                    \
    }
#define Vc_AVX_TO_SSE_256_128(name)                                                      \
    Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0)                             \
    {                                                                                    \
        return _mm256_##name(a0, b0);                                                    \
    }
#define Vc_AVX_TO_SSE_1i(name)                                                           \
    template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0)                        \
    {                                                                                    \
        return _mm256_##name(a0, i);                                                     \
    }
#define Vc_AVX_TO_SSE_1(name)                                                            \
    Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0) { return _mm256_##name(a0); }
#define Vc_AVX_TO_SSE_1_128(name, shift__)                                               \
    Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0) { return _mm256_##name(a0); }
#else
/**\internal
 * Defines the function \p name, which takes two __m256i arguments and calls `_mm_##name` on the
 * low and high 128-bit halves of the arguments.
 *
 * If the AVX2 intrinsics are enabled, the arguments are instead passed directly to a single
 * `_mm256_##name` call.
 */
#define Vc_AVX_TO_SSE_1(name)                                                            \
    Vc_INTRINSIC Vc_CONST __m256i name(__m256i a0)                                       \
    {                                                                                    \
        __m128i a1 = extract128<1>(a0);                                                  \
        __m128i r0 = _mm_##name(_mm256_castsi256_si128(a0));                             \
        __m128i r1 = _mm_##name(a1);                                                     \
        return insert128<1>(_mm256_castsi128_si256(r0), r1);                             \
    }
#define Vc_AVX_TO_SSE_1_128(name, shift__)                                               \
    Vc_INTRINSIC Vc_CONST __m256i name(__m128i a0)                                       \
    {                                                                                    \
        __m128i r0 = _mm_##name(a0);                                                     \
        __m128i r1 = _mm_##name(_mm_srli_si128(a0, shift__));                            \
        return insert128<1>(_mm256_castsi128_si256(r0), r1);                             \
    }
#define Vc_AVX_TO_SSE_2_NEW(name)                                                        \
    Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m256i b0)                             \
    {                                                                                    \
        m128i a1 = extract128<1>(a0);                                                    \
        m128i b1 = extract128<1>(b0);                                                    \
        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0));   \
        m128i r1 = _mm_##name(a1, b1);                                                   \
        return insert128<1>(_mm256_castsi128_si256(r0), r1);                             \
    }
#define Vc_AVX_TO_SSE_256_128(name)                                                      \
    Vc_INTRINSIC Vc_CONST m256i name(__m256i a0, __m128i b0)                             \
    {                                                                                    \
        m128i a1 = extract128<1>(a0);                                                    \
        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), b0);                           \
        m128i r1 = _mm_##name(a1, b0);                                                   \
        return insert128<1>(_mm256_castsi128_si256(r0), r1);                             \
    }
#define Vc_AVX_TO_SSE_1i(name)                                                           \
    template <int i> Vc_INTRINSIC Vc_CONST m256i name(__m256i a0)                        \
    {                                                                                    \
        m128i a1 = extract128<1>(a0);                                                    \
        m128i r0 = _mm_##name(_mm256_castsi256_si128(a0), i);                            \
        m128i r1 = _mm_##name(a1, i);                                                    \
        return insert128<1>(_mm256_castsi128_si256(r0), r1);                             \
    }
#endif
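    // For example, Vc_AVX_TO_SSE_2_NEW(add_epi16) expands to a function
    //     m256i add_epi16(__m256i a0, __m256i b0)
    // that forwards to _mm256_add_epi16 when AVX2 is available and otherwise applies
    // _mm_add_epi16 to the low and high 128-bit halves and recombines the results.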
    Vc_INTRINSIC Vc_CONST __m128i sll_epi16(__m128i a, __m128i b) { return _mm_sll_epi16(a, b); }
    Vc_INTRINSIC Vc_CONST __m128i sll_epi32(__m128i a, __m128i b) { return _mm_sll_epi32(a, b); }
    Vc_INTRINSIC Vc_CONST __m128i sll_epi64(__m128i a, __m128i b) { return _mm_sll_epi64(a, b); }
    Vc_INTRINSIC Vc_CONST __m128i srl_epi16(__m128i a, __m128i b) { return _mm_srl_epi16(a, b); }
    Vc_INTRINSIC Vc_CONST __m128i srl_epi32(__m128i a, __m128i b) { return _mm_srl_epi32(a, b); }
    Vc_INTRINSIC Vc_CONST __m128i srl_epi64(__m128i a, __m128i b) { return _mm_srl_epi64(a, b); }
    Vc_INTRINSIC Vc_CONST __m128i sra_epi16(__m128i a, __m128i b) { return _mm_sra_epi16(a, b); }
    Vc_INTRINSIC Vc_CONST __m128i sra_epi32(__m128i a, __m128i b) { return _mm_sra_epi32(a, b); }

    Vc_AVX_TO_SSE_1i(slli_epi16)
    Vc_AVX_TO_SSE_1i(slli_epi32)
    Vc_AVX_TO_SSE_1i(slli_epi64)
    Vc_AVX_TO_SSE_1i(srai_epi16)
    Vc_AVX_TO_SSE_1i(srai_epi32)
    Vc_AVX_TO_SSE_1i(srli_epi16)
    Vc_AVX_TO_SSE_1i(srli_epi32)
    Vc_AVX_TO_SSE_1i(srli_epi64)

    Vc_AVX_TO_SSE_256_128(sll_epi16)
    Vc_AVX_TO_SSE_256_128(sll_epi32)
    Vc_AVX_TO_SSE_256_128(sll_epi64)
    Vc_AVX_TO_SSE_256_128(srl_epi16)
    Vc_AVX_TO_SSE_256_128(srl_epi32)
    Vc_AVX_TO_SSE_256_128(srl_epi64)
    Vc_AVX_TO_SSE_256_128(sra_epi16)
    Vc_AVX_TO_SSE_256_128(sra_epi32)

    Vc_AVX_TO_SSE_2_NEW(cmpeq_epi8)
    Vc_AVX_TO_SSE_2_NEW(cmpeq_epi16)
    Vc_AVX_TO_SSE_2_NEW(cmpeq_epi32)
    Vc_AVX_TO_SSE_2_NEW(cmpeq_epi64)
    Vc_AVX_TO_SSE_2_NEW(cmpgt_epi8)
    Vc_AVX_TO_SSE_2_NEW(cmpgt_epi16)
    Vc_AVX_TO_SSE_2_NEW(cmpgt_epi32)
    Vc_AVX_TO_SSE_2_NEW(cmpgt_epi64)
    Vc_AVX_TO_SSE_2_NEW(unpackhi_epi16)
    Vc_AVX_TO_SSE_2_NEW(unpacklo_epi16)
    Vc_AVX_TO_SSE_2_NEW(add_epi16)
    Vc_AVX_TO_SSE_2_NEW(add_epi32)
    Vc_AVX_TO_SSE_2_NEW(add_epi64)
    Vc_AVX_TO_SSE_2_NEW(sub_epi16)
    Vc_AVX_TO_SSE_2_NEW(sub_epi32)
    Vc_AVX_TO_SSE_2_NEW(mullo_epi16)
    Vc_AVX_TO_SSE_2_NEW(sign_epi16)
    Vc_AVX_TO_SSE_2_NEW(sign_epi32)
    Vc_AVX_TO_SSE_2_NEW(min_epi8)
    Vc_AVX_TO_SSE_2_NEW(max_epi8)
    Vc_AVX_TO_SSE_2_NEW(min_epu16)
    Vc_AVX_TO_SSE_2_NEW(max_epu16)
    Vc_AVX_TO_SSE_2_NEW(min_epi32)
    Vc_AVX_TO_SSE_2_NEW(max_epi32)
    Vc_AVX_TO_SSE_2_NEW(min_epu32)
    Vc_AVX_TO_SSE_2_NEW(max_epu32)
    Vc_AVX_TO_SSE_2_NEW(mullo_epi32)

    Vc_AVX_TO_SSE_1(abs_epi8)
    Vc_AVX_TO_SSE_1(abs_epi16)
    Vc_AVX_TO_SSE_1(abs_epi32)
    Vc_AVX_TO_SSE_1_128(cvtepi8_epi16, 8)
    Vc_AVX_TO_SSE_1_128(cvtepi8_epi32, 4)
    Vc_AVX_TO_SSE_1_128(cvtepi8_epi64, 2)
    Vc_AVX_TO_SSE_1_128(cvtepi16_epi32, 8)
    Vc_AVX_TO_SSE_1_128(cvtepi16_epi64, 4)
    Vc_AVX_TO_SSE_1_128(cvtepi32_epi64, 8)
    Vc_AVX_TO_SSE_1_128(cvtepu8_epi16, 8)
    Vc_AVX_TO_SSE_1_128(cvtepu8_epi32, 4)
    Vc_AVX_TO_SSE_1_128(cvtepu8_epi64, 2)
    Vc_AVX_TO_SSE_1_128(cvtepu16_epi32, 8)
    Vc_AVX_TO_SSE_1_128(cvtepu16_epi64, 4)
    Vc_AVX_TO_SSE_1_128(cvtepu32_epi64, 8)
#ifndef Vc_IMPL_AVX2

/////////////////////////////////////////////////////////////////////////
// implementation of the intrinsics missing in AVX
/////////////////////////////////////////////////////////////////////////

    static Vc_INTRINSIC m256i Vc_CONST and_si256(__m256i x, __m256i y) {
        return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
    }
    static Vc_INTRINSIC m256i Vc_CONST andnot_si256(__m256i x, __m256i y) {
        return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
    }
    static Vc_INTRINSIC m256i Vc_CONST or_si256(__m256i x, __m256i y) {
        return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
    }
    static Vc_INTRINSIC m256i Vc_CONST xor_si256(__m256i x, __m256i y) {
        return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
    }

    Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
    {
        m128i a1 = extract128<1>(a0);
        return (_mm_movemask_epi8(a1) << 16) | _mm_movemask_epi8(_mm256_castsi256_si128(a0));
    }
    template <int m> Vc_INTRINSIC Vc_CONST m256i blend_epi16(__m256i a0, __m256i b0)
    {
        m128i a1 = extract128<1>(a0);
        m128i b1 = extract128<1>(b0);
        m128i r0 = _mm_blend_epi16(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), m & 0xff);
        m128i r1 = _mm_blend_epi16(a1, b1, m >> 8);
        return insert128<1>(_mm256_castsi128_si256(r0), r1);
    }
    Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0) {
        m128i a1 = extract128<1>(a0);
        m128i b1 = extract128<1>(b0);
        m128i m1 = extract128<1>(m0);
        m128i r0 = _mm_blendv_epi8(_mm256_castsi256_si128(a0), _mm256_castsi256_si128(b0), _mm256_castsi256_si128(m0));
        m128i r1 = _mm_blendv_epi8(a1, b1, m1);
        return insert128<1>(_mm256_castsi128_si256(r0), r1);
    }
    // mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)

#else // Vc_IMPL_AVX2

static Vc_INTRINSIC Vc_CONST m256i xor_si256(__m256i x, __m256i y) { return _mm256_xor_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i or_si256(__m256i x, __m256i y) { return _mm256_or_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i and_si256(__m256i x, __m256i y) { return _mm256_and_si256(x, y); }
static Vc_INTRINSIC Vc_CONST m256i andnot_si256(__m256i x, __m256i y) { return _mm256_andnot_si256(x, y); }

/////////////////////////////////////////////////////////////////////////
// implementation of the intrinsics missing in AVX2
/////////////////////////////////////////////////////////////////////////
Vc_INTRINSIC Vc_CONST m256i blendv_epi8(__m256i a0, __m256i b0, __m256i m0)
{
    return _mm256_blendv_epi8(a0, b0, m0);
}
Vc_INTRINSIC Vc_CONST int movemask_epi8(__m256i a0)
{
    return _mm256_movemask_epi8(a0);
}

#endif // Vc_IMPL_AVX2

/////////////////////////////////////////////////////////////////////////
// implementation of intrinsics missing in AVX and AVX2
/////////////////////////////////////////////////////////////////////////

static Vc_INTRINSIC m256i cmplt_epi64(__m256i a, __m256i b) {
    return cmpgt_epi64(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi32(__m256i a, __m256i b) {
    return cmpgt_epi32(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi16(__m256i a, __m256i b) {
    return cmpgt_epi16(b, a);
}
static Vc_INTRINSIC m256i cmplt_epi8(__m256i a, __m256i b) {
    return cmpgt_epi8(b, a);
}

static Vc_INTRINSIC m256i cmpgt_epu8(__m256i a, __m256i b) {
    return cmpgt_epi8(xor_si256(a, setmin_epi8()), xor_si256(b, setmin_epi8()));
}
#if defined(Vc_IMPL_XOP)
    Vc_AVX_TO_SSE_2_NEW(comlt_epu32)
    Vc_AVX_TO_SSE_2_NEW(comgt_epu32)
    Vc_AVX_TO_SSE_2_NEW(comlt_epu16)
    Vc_AVX_TO_SSE_2_NEW(comgt_epu16)
    static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i a, __m256i b) { return comlt_epu32(a, b); }
    static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i a, __m256i b) { return comgt_epu32(a, b); }
    static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i a, __m256i b) { return comlt_epu16(a, b); }
    static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i a, __m256i b) { return comgt_epu16(a, b); }
#else
    static Vc_INTRINSIC m256i Vc_CONST cmplt_epu32(__m256i _a, __m256i _b) {
        m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
        m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
        return cmplt_epi32(a, b);
    }
    static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu32(__m256i _a, __m256i _b) {
        m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi32())));
        m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi32())));
        return cmpgt_epi32(a, b);
    }
    static Vc_INTRINSIC m256i Vc_CONST cmplt_epu16(__m256i _a, __m256i _b) {
        m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
        m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
        return cmplt_epi16(a, b);
    }
    static Vc_INTRINSIC m256i Vc_CONST cmpgt_epu16(__m256i _a, __m256i _b) {
        m256i a = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_a), _mm256_castsi256_ps(setmin_epi16())));
        m256i b = _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(_b), _mm256_castsi256_ps(setmin_epi16())));
        return cmpgt_epi16(a, b);
    }
#endif

static Vc_INTRINSIC void _mm256_maskstore(float *mem, const __m256 mask, const __m256 v) {
    _mm256_maskstore_ps(mem, _mm256_castps_si256(mask), v);
}
static Vc_INTRINSIC void _mm256_maskstore(double *mem, const __m256d mask, const __m256d v) {
    _mm256_maskstore_pd(mem, _mm256_castpd_si256(mask), v);
}
static Vc_INTRINSIC void _mm256_maskstore(int *mem, const __m256i mask, const __m256i v) {
#ifdef Vc_IMPL_AVX2
    _mm256_maskstore_epi32(mem, mask, v);
#else
    _mm256_maskstore_ps(reinterpret_cast<float *>(mem), mask, _mm256_castsi256_ps(v));
#endif
}
static Vc_INTRINSIC void _mm256_maskstore(unsigned int *mem, const __m256i mask, const __m256i v) {
    _mm256_maskstore(reinterpret_cast<int *>(mem), mask, v);
}
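// There is no 16-bit maskstore in AVX or AVX2, so the short/unsigned short overloads below
// fall back to the byte-granular _mm_maskmoveu_si128 (MASKMOVDQU) on each 128-bit half.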
static Vc_INTRINSIC void _mm256_maskstore(short *mem, const __m256i mask, const __m256i v) {
    using namespace AVX;
    _mm_maskmoveu_si128(_mm256_castsi256_si128(v), _mm256_castsi256_si128(mask), reinterpret_cast<char *>(&mem[0]));
    _mm_maskmoveu_si128(extract128<1>(v), extract128<1>(mask), reinterpret_cast<char *>(&mem[8]));
}
static Vc_INTRINSIC void _mm256_maskstore(unsigned short *mem, const __m256i mask, const __m256i v) {
    _mm256_maskstore(reinterpret_cast<short *>(mem), mask, v);
}

#undef Vc_AVX_TO_SSE_1
#undef Vc_AVX_TO_SSE_1_128
#undef Vc_AVX_TO_SSE_2_NEW
#undef Vc_AVX_TO_SSE_256_128
#undef Vc_AVX_TO_SSE_1i

template<typename R> Vc_INTRINSIC_L R stream_load(const float *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128 stream_load<m128>(const float *mem)
{
    return _mm_castsi128_ps(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<float *>(mem))));
}
template<> Vc_INTRINSIC m256 stream_load<m256>(const float *mem)
{
    return insert128<1>(_mm256_castps128_ps256(stream_load<m128>(mem)),
                                stream_load<m128>(mem + 4));
}

template<typename R> Vc_INTRINSIC_L R stream_load(const double *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128d stream_load<m128d>(const double *mem)
{
    return _mm_castsi128_pd(_mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<double *>(mem))));
}
template<> Vc_INTRINSIC m256d stream_load<m256d>(const double *mem)
{
    return insert128<1>(_mm256_castpd128_pd256(stream_load<m128d>(mem)),
                                stream_load<m128d>(mem + 2));
}

template<typename R> Vc_INTRINSIC_L R stream_load(const void *mem) Vc_INTRINSIC_R;
template<> Vc_INTRINSIC m128i stream_load<m128i>(const void *mem)
{
    return _mm_stream_load_si128(reinterpret_cast<__m128i *>(const_cast<void *>(mem)));
}
template<> Vc_INTRINSIC m256i stream_load<m256i>(const void *mem)
{
    return insert128<1>(_mm256_castsi128_si256(stream_load<m128i>(mem)),
                                stream_load<m128i>(static_cast<const __m128i *>(mem) + 1));
}
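// Note: _mm_stream_load_si128 (MOVNTDQA, SSE4.1) requires 16-byte aligned memory, and the
// non-temporal hint typically only takes effect on write-combining memory; on ordinary
// write-back memory it behaves like a regular load.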

Vc_INTRINSIC void stream_store(float *mem, __m128 value, __m128 mask)
{
    _mm_maskmoveu_si128(_mm_castps_si128(value), _mm_castps_si128(mask), reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(float *mem, __m256 value, __m256 mask)
{
    stream_store(mem, _mm256_castps256_ps128(value), _mm256_castps256_ps128(mask));
    stream_store(mem + 4, extract128<1>(value), extract128<1>(mask));
}
Vc_INTRINSIC void stream_store(double *mem, __m128d value, __m128d mask)
{
    _mm_maskmoveu_si128(_mm_castpd_si128(value), _mm_castpd_si128(mask), reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(double *mem, __m256d value, __m256d mask)
{
    stream_store(mem, _mm256_castpd256_pd128(value), _mm256_castpd256_pd128(mask));
    stream_store(mem + 2, extract128<1>(value), extract128<1>(mask));
}
Vc_INTRINSIC void stream_store(void *mem, __m128i value, __m128i mask)
{
    _mm_maskmoveu_si128(value, mask, reinterpret_cast<char *>(mem));
}
Vc_INTRINSIC void stream_store(void *mem, __m256i value, __m256i mask)
{
    stream_store(mem, _mm256_castsi256_si128(value), _mm256_castsi256_si128(mask));
    stream_store(static_cast<__m128i *>(mem) + 1, extract128<1>(value), extract128<1>(mask));
}

#ifndef __x86_64__
Vc_INTRINSIC Vc_PURE __m128i _mm_cvtsi64_si128(int64_t x) {
    return _mm_castpd_si128(_mm_load_sd(reinterpret_cast<const double *>(&x)));
}
#endif

#ifdef Vc_IMPL_AVX2
template <int Scale> __m256 gather(const float *addr, __m256i idx)
{
    return _mm256_i32gather_ps(addr, idx, Scale);
}
template <int Scale> __m256d gather(const double *addr, __m128i idx)
{
    return _mm256_i32gather_pd(addr, idx, Scale);
}
template <int Scale> __m256i gather(const int *addr, __m256i idx)
{
    return _mm256_i32gather_epi32(addr, idx, Scale);
}
template <int Scale> __m256i gather(const unsigned *addr, __m256i idx)
{
    return _mm256_i32gather_epi32(aliasing_cast<int>(addr), idx, Scale);
}

template <int Scale> __m256 gather(__m256 src, __m256 k, const float *addr, __m256i idx)
{
    return _mm256_mask_i32gather_ps(src, addr, idx, k, Scale);
}
template <int Scale>
__m256d gather(__m256d src, __m256d k, const double *addr, __m128i idx)
{
    return _mm256_mask_i32gather_pd(src, addr, idx, k, Scale);
}
template <int Scale> __m256i gather(__m256i src, __m256i k, const int *addr, __m256i idx)
{
    return _mm256_mask_i32gather_epi32(src, addr, idx, k, Scale);
}
template <int Scale>
__m256i gather(__m256i src, __m256i k, const unsigned *addr, __m256i idx)
{
    return _mm256_mask_i32gather_epi32(src, aliasing_cast<int>(addr), idx, k, Scale);
}
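
// Scale is the byte scale applied to each 32-bit index and must be 1, 2, 4, or 8; e.g.
// gather<4>(addr, idx) loads addr[idx[i]] for each lane of 32-bit data. The masked
// overloads only gather the lanes whose mask element has its most significant bit set
// and take the remaining lanes from src.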
#endif

}  // namespace AvxIntrinsics
}  // namespace Vc

namespace Vc_VERSIONED_NAMESPACE
{
namespace AVX
{
    using namespace AvxIntrinsics;
}  // namespace AVX
namespace AVX2
{
    using namespace AvxIntrinsics;
}  // namespace AVX2
namespace AVX
{
    template<typename T> struct VectorTypeHelper;
    template<> struct VectorTypeHelper<         char > { typedef __m256i Type; };
    template<> struct VectorTypeHelper<  signed char > { typedef __m256i Type; };
    template<> struct VectorTypeHelper<unsigned char > { typedef __m256i Type; };
    template<> struct VectorTypeHelper<         short> { typedef __m256i Type; };
    template<> struct VectorTypeHelper<unsigned short> { typedef __m256i Type; };
    template<> struct VectorTypeHelper<         int  > { typedef __m256i Type; };
    template<> struct VectorTypeHelper<unsigned int  > { typedef __m256i Type; };
    template<> struct VectorTypeHelper<         long > { typedef __m256i Type; };
    template<> struct VectorTypeHelper<unsigned long > { typedef __m256i Type; };
    template<> struct VectorTypeHelper<         long long> { typedef __m256i Type; };
    template<> struct VectorTypeHelper<unsigned long long> { typedef __m256i Type; };
    template<> struct VectorTypeHelper<         float> { typedef __m256  Type; };
    template<> struct VectorTypeHelper<        double> { typedef __m256d Type; };

    template <typename T>
    using IntegerVectorType =
        typename std::conditional<sizeof(T) == 16, __m128i, __m256i>::type;
    template <typename T>
    using DoubleVectorType =
        typename std::conditional<sizeof(T) == 16, __m128d, __m256d>::type;
    template <typename T>
    using FloatVectorType =
        typename std::conditional<sizeof(T) == 16, __m128, __m256>::type;
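    // These aliases map a 16-byte vector type T (e.g. __m128i) to the corresponding
    // __m128* type and any other size (in practice the 32-byte AVX types) to __m256*.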

    template<typename T> struct VectorHelper {};
    template<typename T> struct VectorHelperSize;
}  // namespace AVX
}  // namespace Vc

#endif // VC_AVX_INTRINSICS_H_