/*  This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_AVX_SHUFFLE_H_
#define VC_AVX_SHUFFLE_H_

#include "../sse/shuffle.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
template <int... Dst> struct Permutation {};
template <uint8_t... Sel> struct Mask {};

#ifdef Vc_IMPL_AVX2
template <uint8_t Sel0, uint8_t Sel1, uint8_t Sel2, uint8_t Sel3, uint8_t Sel4,
          uint8_t Sel5, uint8_t Sel6, uint8_t Sel7, uint8_t Sel8, uint8_t Sel9,
          uint8_t Sel10, uint8_t Sel11, uint8_t Sel12, uint8_t Sel13, uint8_t Sel14,
          uint8_t Sel15>
Vc_INTRINSIC Vc_CONST __m256i
blend(__m256i a, __m256i b, Mask<Sel0, Sel1, Sel2, Sel3, Sel4, Sel5, Sel6, Sel7, Sel8,
                                 Sel9, Sel10, Sel11, Sel12, Sel13, Sel14, Sel15>)
{
    static_assert((Sel0 == 0 || Sel0 == 1) && (Sel1 == 0 || Sel1 == 1) &&
                      (Sel2 == 0 || Sel2 == 1) && (Sel3 == 0 || Sel3 == 1) &&
                      (Sel4 == 0 || Sel4 == 1) && (Sel5 == 0 || Sel5 == 1) &&
                      (Sel6 == 0 || Sel6 == 1) && (Sel7 == 0 || Sel7 == 1) &&
                      (Sel8 == 0 || Sel8 == 1) && (Sel9 == 0 || Sel9 == 1) &&
                      (Sel10 == 0 || Sel10 == 1) && (Sel11 == 0 || Sel11 == 1) &&
                      (Sel12 == 0 || Sel12 == 1) && (Sel13 == 0 || Sel13 == 1) &&
                      (Sel14 == 0 || Sel14 == 1) && (Sel15 == 0 || Sel15 == 1),
                  "Selectors must be 0 or 1 to select the value from a or b");
    // _mm256_blend_epi16 takes only an 8-bit immediate, which it applies to both
    // 128-bit lanes; the two lanes cannot be blended independently. The original
    // shifts of Sel8..Sel15 past bit 7 were silently discarded by the uint8_t cast,
    // so require the high-lane selectors to repeat the low-lane pattern instead.
    static_assert(Sel8 == Sel0 && Sel9 == Sel1 && Sel10 == Sel2 && Sel11 == Sel3 &&
                      Sel12 == Sel4 && Sel13 == Sel5 && Sel14 == Sel6 && Sel15 == Sel7,
                  "_mm256_blend_epi16 applies the same 8-bit mask to both 128-bit lanes");
    constexpr uint8_t mask = static_cast<uint8_t>(
        (Sel0 << 0) | (Sel1 << 1) | (Sel2 << 2) | (Sel3 << 3) |
        (Sel4 << 4) | (Sel5 << 5) | (Sel6 << 6) | (Sel7 << 7));
    return _mm256_blend_epi16(a, b, mask);
}
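
// Usage sketch: Detail::blend(a, b, Mask<0,1,0,1, 0,1,0,1, 0,1,0,1, 0,1,0,1>())
// picks the even 16-bit elements from a and the odd ones from b (selector 0
// selects from a, 1 from b; immediate 0xAA).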
#endif  // Vc_IMPL_AVX2
}  // namespace Detail
namespace Mem
{
#ifdef Vc_IMPL_AVX2
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteLo(__m256i x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm256_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteHi(__m256i x) {
            static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range");
            static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range");
            return _mm256_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
        }
#endif  // Vc_IMPL_AVX2
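
        // E.g. permuteLo<X3, X2, X1, X0>(x) reverses the four low 16-bit words of
        // each 128-bit lane (vpshuflw); the upper 64 bits of each lane pass through
        // unchanged. permuteHi does the same for the four high words (vpshufhw).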

        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x) {
            static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
            static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
            return _mm256_permute2f128_ps(
                x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
        }
        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x) {
            static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
            static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
            return _mm256_permute2f128_pd(
                x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
        }
        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x) {
            static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
            static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
            return _mm256_permute2x128_si256(
                x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
#else
            return _mm256_permute2f128_si256(
                x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
#endif
        }
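        // The vperm2f128 control byte assembled above selects the lower output lane
        // in bits 0-1 and the upper lane in bits 4-5; bit 3 / bit 7 zero the
        // respective lane, which is what Const0 maps to via 0x8 / 0x80. E.g.
        // permute128<X1, X0>(x) swaps the two 128-bit halves of x.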
        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle128(__m256 x, __m256 y) {
            static_assert(L >= X0 && H >= X0, "Incorrect_Range");
            static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
            return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
        }
        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST shuffle128(__m256i x, __m256i y) {
            static_assert(L >= X0 && H >= X0, "Incorrect_Range");
            static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
            return _mm256_permute2x128_si256(
                x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#else
            return _mm256_permute2f128_si256(
                x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#endif
        }
        template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle128(__m256d x, __m256d y) {
            static_assert(L >= X0 && H >= X0, "Incorrect_Range");
            static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
            return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
        }
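        // In shuffle128, X0/X1 name the 128-bit lanes of x and Y0/Y1 those of y
        // (mapped to control values 2 and 3). E.g. shuffle128<X1, Y0>(x, y) returns
        // the high lane of x in the lower half and the low lane of y in the upper half.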
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
            static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
        }
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute(__m256i x) {
            return _mm256_castps_si256(permute<Dst0, Dst1, Dst2, Dst3>(_mm256_castsi256_ps(x)));
        }
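        // These permutes take in-lane selectors in memory order. E.g.
        // permute<X1, X0, X3, X2>(x) swaps each adjacent pair of elements within the
        // 128-bit lanes, for the float as well as the double overload.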
#ifdef Vc_IMPL_AVX2
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute4x64(__m256i x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm256_permute4x64_epi64(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }
#endif  // Vc_IMPL_AVX2
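        // Unlike permute above, permute4x64 may cross the 128-bit lane boundary.
        // E.g. permute4x64<X3, X2, X1, X0>(x) fully reverses the four 64-bit elements.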
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
            static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
            static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
            return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
        }
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
            return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
        }
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m256 Vc_CONST blend(__m256 x, __m256 y) {
            static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
            static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
            static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
            static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
            static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
            static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
            static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
            static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
            return _mm256_blend_ps(x, y,
                    (Dst0 / Y0) *  1 + (Dst1 / Y1) *  2 +
                    (Dst2 / Y2) *  4 + (Dst3 / Y3) *  8 +
                    (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
                    (Dst6 / Y6) * 64 + (Dst7 / Y7) * 128
                    );
        }
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m256i Vc_CONST blend(__m256i x, __m256i y) {
            return _mm256_castps_si256(blend<Dst0, Dst1, Dst2, Dst3, Dst4, Dst5, Dst6, Dst7>(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
        }
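        // In blend, each Dsti must name position i of either x or y; choosing Yi sets
        // bit i of the vblendps immediate. E.g. blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(x, y)
        // takes the even elements from x and the odd ones from y (immediate 0xAA).
        // ScaleForBlend below renames a source position in the high half (X4..X7) to the
        // matching Y position (Y0..Y3), so that the four-element 128-bit blend from
        // ../sse/shuffle.h can be reused on the two halves in the permute overload below.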
        template<VecPos Dst> struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; };
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
            static_assert(Dst0 >= X0 && Dst0 <= X7, "Incorrect_Range");
            static_assert(Dst1 >= X0 && Dst1 <= X7, "Incorrect_Range");
            static_assert(Dst2 >= X0 && Dst2 <= X7, "Incorrect_Range");
            static_assert(Dst3 >= X0 && Dst3 <= X7, "Incorrect_Range");
            static_assert(Dst4 >= X0 && Dst4 <= X7, "Incorrect_Range");
            static_assert(Dst5 >= X0 && Dst5 <= X7, "Incorrect_Range");
            static_assert(Dst6 >= X0 && Dst6 <= X7, "Incorrect_Range");
            static_assert(Dst7 >= X0 && Dst7 <= X7, "Incorrect_Range");
            if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) {
                return permute<Dst0, Dst1, Dst2, Dst3>(x);
            }
            const __m128 loIn = _mm256_castps256_ps128(x);
            const __m128 hiIn = _mm256_extractf128_ps(x, 1);
            __m128 lo, hi;

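            // Build each 128-bit half of the result with the cheapest matching case:
            // in-lane permute, cross-half shuffle, unpack, or blend. A selector
            // combination that matches none of the cases below leaves lo/hi
            // uninitialized, i.e. the result is undefined for unsupported permutations.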
            if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) {
                lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
            } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) {
                // all sources come from the high half, so rebase the selectors to 0..3
                lo = _mm_permute_ps(hiIn, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
            } else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) {
                lo = shuffle<Dst0, Dst1, Dst2 - X4 + Y0, Dst3 - X4 + Y0>(loIn, hiIn);
            } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) {
                lo = shuffle<Dst0 - X4, Dst1 - X4, Dst2 + Y0, Dst3 + Y0>(hiIn, loIn);
            } else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) {
                lo = _mm_unpacklo_ps(loIn, hiIn);
            } else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) {
                lo = _mm_unpacklo_ps(hiIn, loIn);
            } else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) {
                lo = _mm_unpackhi_ps(loIn, hiIn);
            } else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) {
                lo = _mm_unpackhi_ps(hiIn, loIn);
            } else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) {
                lo = blend<ScaleForBlend<Dst0>::Value, ScaleForBlend<Dst1>::Value,
                   ScaleForBlend<Dst2>::Value, ScaleForBlend<Dst3>::Value>(loIn, hiIn);
            }

            if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) {
                hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
            } else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) {
                // all sources come from the low half; the selectors are already 0..3,
                // so no -X4 rebasing here (the original subtraction gave negative values)
                hi = _mm_permute_ps(loIn, Dst4 + Dst5 * 4 + Dst6 * 16 + Dst7 * 64);
            } else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) {
                hi = shuffle<Dst4, Dst5, Dst6 - X4 + Y0, Dst7 - X4 + Y0>(loIn, hiIn);
            } else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) {
                hi = shuffle<Dst4 - X4, Dst5 - X4, Dst6 + Y0, Dst7 + Y0>(hiIn, loIn);
            } else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) {
                hi = _mm_unpacklo_ps(loIn, hiIn);
            } else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) {
                hi = _mm_unpacklo_ps(hiIn, loIn);
            } else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) {
                hi = _mm_unpackhi_ps(loIn, hiIn);
            } else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) {
                hi = _mm_unpackhi_ps(hiIn, loIn);
            } else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) {
                hi = blend<ScaleForBlend<Dst4>::Value, ScaleForBlend<Dst5>::Value,
                   ScaleForBlend<Dst6>::Value, ScaleForBlend<Dst7>::Value>(loIn, hiIn);
            }

            return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
        }
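
        // E.g. Mem::permute<X4, X5, X6, X7, X0, X1, X2, X3>(x) swaps the two 128-bit
        // halves; both halves are handled by the plain _mm_permute_ps cases above.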
}  // namespace Mem
}  // namespace Vc

    // Little endian puts the low bits on the right and the high bits on the left.
    // With vectors this gets confusing: a vector that is {a, b, c, d} in memory
    // reads d, c, b, a in register (most-significant-first) notation:
    // Mem: abcd
    // Reg: dcba
    //
    // The shuffles and permutes above use memory ordering; the ones below use register ordering.
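    //
    // For example, swapping adjacent pairs of floats is Mem::permute<X1, X0, X3, X2>
    // in memory order but Reg::permute<X2, X3, X0, X1> in register order; both
    // produce the same vpermilps immediate (0xB1).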
namespace Vc_VERSIONED_NAMESPACE
{
namespace Reg
{
        template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x, __m256 y) {
            static_assert(L >= X0 && H >= X0, "Incorrect_Range");
            static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
            return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
        }
        template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x, __m256i y) {
            static_assert(L >= X0 && H >= X0, "Incorrect_Range");
            static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
            return _mm256_permute2x128_si256(
                x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#else
            return _mm256_permute2f128_si256(
                x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#endif
        }
        template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x, __m256d y) {
            static_assert(L >= X0 && H >= X0, "Incorrect_Range");
            static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
            return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
        }
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
            static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
        }
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }
        template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST permute(__m128d x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
            return _mm_permute_pd(x, Dst0 + Dst1 * 2);
        }
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
            static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
            static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
            return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
        }
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
            return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
        }
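
        // Reg template arguments are listed most-significant first, so
        // Reg::permute<X3, X2, X1, X0>(x) is the identity permutation, matching
        // Intel's _MM_SHUFFLE(3, 2, 1, 0) convention.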
}  // namespace Reg
}  // namespace Vc

#endif // VC_AVX_SHUFFLE_H_