#ifndef VC_AVX_SHUFFLE_H_
#define VC_AVX_SHUFFLE_H_

#include "../sse/shuffle.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
namespace Detail
{
// Tag types that carry shuffle parameters as template arguments.
template <int... Dst> struct Permutation {};
template <uint8_t... Sel> struct Mask {};

#ifdef Vc_IMPL_AVX2
template <uint8_t Sel0, uint8_t Sel1, uint8_t Sel2, uint8_t Sel3, uint8_t Sel4,
          uint8_t Sel5, uint8_t Sel6, uint8_t Sel7, uint8_t Sel8, uint8_t Sel9,
          uint8_t Sel10, uint8_t Sel11, uint8_t Sel12, uint8_t Sel13, uint8_t Sel14,
          uint8_t Sel15>
Vc_INTRINSIC Vc_CONST __m256i
blend(__m256i a, __m256i b, Mask<Sel0, Sel1, Sel2, Sel3, Sel4, Sel5, Sel6, Sel7, Sel8,
                                 Sel9, Sel10, Sel11, Sel12, Sel13, Sel14, Sel15>)
{
    static_assert((Sel0 == 0 || Sel0 == 1) && (Sel1 == 0 || Sel1 == 1) &&
                      (Sel2 == 0 || Sel2 == 1) && (Sel3 == 0 || Sel3 == 1) &&
                      (Sel4 == 0 || Sel4 == 1) && (Sel5 == 0 || Sel5 == 1) &&
                      (Sel6 == 0 || Sel6 == 1) && (Sel7 == 0 || Sel7 == 1) &&
                      (Sel8 == 0 || Sel8 == 1) && (Sel9 == 0 || Sel9 == 1) &&
                      (Sel10 == 0 || Sel10 == 1) && (Sel11 == 0 || Sel11 == 1) &&
                      (Sel12 == 0 || Sel12 == 1) && (Sel13 == 0 || Sel13 == 1) &&
                      (Sel14 == 0 || Sel14 == 1) && (Sel15 == 0 || Sel15 == 1),
                  "Selectors must be 0 or 1 to select the value from a or b");
    // _mm256_blend_epi16 reuses one 8-bit immediate for both 128-bit lanes, so
    // only Sel0..Sel7 can be encoded. Shifting Sel8..Sel15 into bits 8..15 and
    // truncating to uint8_t would silently drop them; instead require the high
    // lane to repeat the low-lane pattern.
    constexpr uint8_t mask = static_cast<uint8_t>(
        (Sel0 << 0) | (Sel1 << 1) | (Sel2 << 2) | (Sel3 << 3) |
        (Sel4 << 4) | (Sel5 << 5) | (Sel6 << 6) | (Sel7 << 7));
    static_assert(mask == ((Sel8 << 0) | (Sel9 << 1) | (Sel10 << 2) | (Sel11 << 3) |
                           (Sel12 << 4) | (Sel13 << 5) | (Sel14 << 6) | (Sel15 << 7)),
                  "Selectors for the high 128-bit lane must repeat the low-lane pattern");
    return _mm256_blend_epi16(a, b, mask);
}
#endif
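
// Usage sketch (illustrative, assumes Vc_IMPL_AVX2): select 16-bit elements
// from a where the selector is 0 and from b where it is 1; the pattern must be
// identical in both 128-bit lanes:
//
//     __m256i r = Detail::blend(a, b,
//         Detail::Mask<0, 1, 0, 1, 0, 1, 0, 1,     // low lane: even from a, odd from b
//                      0, 1, 0, 1, 0, 1, 0, 1>()); // high lane repeats the pattern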
}  // namespace Detail
namespace Mem
{
#ifdef Vc_IMPL_AVX2
// Shuffles the four 16-bit elements in the low half of each 128-bit lane;
// Dst0..Dst3 name the source position (X0..X3) for each destination slot.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteLo(__m256i x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm256_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}

// Same for the high half of each 128-bit lane; positions are given as X4..X7.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permuteHi(__m256i x) {
    static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range");
    static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range");
    return _mm256_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
}
#endif
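
// Usage sketch (illustrative, assumes Vc_IMPL_AVX2): reverse the four low
// 16-bit elements of each 128-bit lane, leaving the high four untouched:
//
//     __m256i r = Mem::permuteLo<X3, X2, X1, X0>(x);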

// Selects the two 128-bit lanes of the result from the lanes of x: L picks the
// low result lane, H the high one; Const0 zeroes the respective lane.
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x) {
    static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
    static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
    return _mm256_permute2f128_ps(
        x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x) {
    static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
    static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
    return _mm256_permute2f128_pd(
        x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x) {
    static_assert((L >= X0 && L <= X1) || L == Const0, "Incorrect_Range");
    static_assert((H >= X0 && H <= X1) || H == Const0, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
    return _mm256_permute2x128_si256(
        x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
#else
    return _mm256_permute2f128_si256(
        x, x, (L == Const0 ? 0x8 : L) + (H == Const0 ? 0x80 : H * (1 << 4)));
#endif
}
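
// Usage sketch (illustrative): swap the two 128-bit halves, or zero the low
// half while placing lane 0 on top:
//
//     __m256 swapped = Mem::permute128<X1, X0>(x);
//     __m256 zeroLo  = Mem::permute128<Const0, X0>(x);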
// Builds the result from the four 128-bit lanes of x (X0, X1) and y (Y0, Y1):
// L selects the low lane of the result, H the high lane.
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle128(__m256 x, __m256 y) {
    static_assert(L >= X0 && H >= X0, "Incorrect_Range");
    static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
    return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256i Vc_CONST shuffle128(__m256i x, __m256i y) {
    static_assert(L >= X0 && H >= X0, "Incorrect_Range");
    static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
    return _mm256_permute2x128_si256(
        x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#else
    return _mm256_permute2f128_si256(
        x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#endif
}
template<VecPos L, VecPos H> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle128(__m256d x, __m256d y) {
    static_assert(L >= X0 && H >= X0, "Incorrect_Range");
    static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
    return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
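
// Usage sketch (illustrative): concatenate the low lane of x with the low lane
// of y (result = [x lane 0, y lane 0]):
//
//     __m256 r = Mem::shuffle128<X0, Y0>(x, y);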
// Per-lane element permutes. For __m256d, Dst0/Dst1 pick within the low lane
// (X0..X1) and Dst2/Dst3 within the high lane (X2..X3); for __m256 the same
// X0..X3 pattern is applied to both 128-bit lanes.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
    static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute(__m256i x) {
    return _mm256_castps_si256(permute<Dst0, Dst1, Dst2, Dst3>(_mm256_castsi256_ps(x)));
}
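
// Usage sketch (illustrative): rotate each 128-bit lane of a float vector by
// two elements (per lane: [a, b, c, d] -> [c, d, a, b]):
//
//     __m256 r = Mem::permute<X2, X3, X0, X1>(x);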
#ifdef Vc_IMPL_AVX2
// Full cross-lane permute of the four 64-bit elements (AVX2 only).
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute4x64(__m256i x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm256_permute4x64_epi64(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
#endif
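
// Usage sketch (illustrative, assumes Vc_IMPL_AVX2): reverse the four 64-bit
// elements across lanes:
//
//     __m256i r = Mem::permute4x64<X3, X2, X1, X0>(x);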
// Two-vector shuffles. For __m256d the element sources alternate x, y, x, y
// (Dst0 from x's low lane, Dst1 from y's low lane, Dst2/Dst3 likewise for the
// high lane); for __m256, Dst0/Dst1 come from x and Dst2/Dst3 from y, per lane.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
    static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
    static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
    return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
    return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
}
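
// Usage sketch (illustrative): gather the odd-indexed doubles of x and y
// (result = [x1, y1, x3, y3]):
//
//     __m256d r = Mem::shuffle<X1, Y1, X3, Y3>(x, y);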
// Element-wise blend: each destination position i takes either Xi (from x) or
// Yi (from y).
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256 Vc_CONST blend(__m256 x, __m256 y) {
    static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
    static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
    static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
    static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
    static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
    static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
    static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
    static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
    // Dsti / Yi is 1 exactly when element i comes from y, building the blend
    // immediate bit by bit.
    return _mm256_blend_ps(x, y,
                           (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
                           (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
                           (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 +
                           (Dst6 / Y6) * 64 + (Dst7 / Y7) * 128);
}
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256i Vc_CONST blend(__m256i x, __m256i y) {
    return _mm256_castps_si256(blend<Dst0, Dst1, Dst2, Dst3, Dst4, Dst5, Dst6, Dst7>(_mm256_castsi256_ps(x), _mm256_castsi256_ps(y)));
}
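
// Usage sketch (illustrative): keep the even-indexed floats of x and take the
// odd-indexed floats of y:
//
//     __m256 r = Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(x, y);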
// Maps a high-half position (X4..X7) to the corresponding Y position so the
// 128-bit halves can be combined with the four-element blend above.
template<VecPos Dst> struct ScaleForBlend { enum { Value = Dst >= X4 ? Dst - X4 + Y0 : Dst }; };
// Arbitrary permute of the eight floats. Patterns that repeat in both halves
// reduce to a single in-lane permute; otherwise each result half is
// synthesized from the two input halves. Combinations not matched by any
// branch below are unsupported and leave lo/hi uninitialized.
template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
    static_assert(Dst0 >= X0 && Dst0 <= X7, "Incorrect_Range");
    static_assert(Dst1 >= X0 && Dst1 <= X7, "Incorrect_Range");
    static_assert(Dst2 >= X0 && Dst2 <= X7, "Incorrect_Range");
    static_assert(Dst3 >= X0 && Dst3 <= X7, "Incorrect_Range");
    static_assert(Dst4 >= X0 && Dst4 <= X7, "Incorrect_Range");
    static_assert(Dst5 >= X0 && Dst5 <= X7, "Incorrect_Range");
    static_assert(Dst6 >= X0 && Dst6 <= X7, "Incorrect_Range");
    static_assert(Dst7 >= X0 && Dst7 <= X7, "Incorrect_Range");
    if (Dst0 + X4 == Dst4 && Dst1 + X4 == Dst5 && Dst2 + X4 == Dst6 && Dst3 + X4 == Dst7) {
        return permute<Dst0, Dst1, Dst2, Dst3>(x);
    }
    const __m128 loIn = _mm256_castps256_ps128(x);
    const __m128 hiIn = _mm256_extractf128_ps(x, 1);
    __m128 lo, hi;

    if (Dst0 < X4 && Dst1 < X4 && Dst2 < X4 && Dst3 < X4) {
        lo = _mm_permute_ps(loIn, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
    } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4) {
        // All four sources are in the high half; the indices must be rebased
        // to 0..3 relative to hiIn.
        lo = _mm_permute_ps(hiIn, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
    } else if (Dst0 < X4 && Dst1 < X4 && Dst2 >= X4 && Dst3 >= X4) {
        lo = shuffle<Dst0, Dst1, Dst2 - X4 + Y0, Dst3 - X4 + Y0>(loIn, hiIn);
    } else if (Dst0 >= X4 && Dst1 >= X4 && Dst2 < X4 && Dst3 < X4) {
        lo = shuffle<Dst0 - X4, Dst1 - X4, Dst2 + Y0, Dst3 + Y0>(hiIn, loIn);
    } else if (Dst0 == X0 && Dst1 == X4 && Dst2 == X1 && Dst3 == X5) {
        lo = _mm_unpacklo_ps(loIn, hiIn);
    } else if (Dst0 == X4 && Dst1 == X0 && Dst2 == X5 && Dst3 == X1) {
        lo = _mm_unpacklo_ps(hiIn, loIn);
    } else if (Dst0 == X2 && Dst1 == X6 && Dst2 == X3 && Dst3 == X7) {
        lo = _mm_unpackhi_ps(loIn, hiIn);
    } else if (Dst0 == X6 && Dst1 == X2 && Dst2 == X7 && Dst3 == X3) {
        lo = _mm_unpackhi_ps(hiIn, loIn);
    } else if (Dst0 % X4 == 0 && Dst1 % X4 == 1 && Dst2 % X4 == 2 && Dst3 % X4 == 3) {
        lo = blend<ScaleForBlend<Dst0>::Value, ScaleForBlend<Dst1>::Value,
                   ScaleForBlend<Dst2>::Value, ScaleForBlend<Dst3>::Value>(loIn, hiIn);
    }

    if (Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4) {
        hi = _mm_permute_ps(hiIn, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
    } else if (Dst4 < X4 && Dst5 < X4 && Dst6 < X4 && Dst7 < X4) {
        // All four sources are in the low half; X0..X3 are already relative to
        // loIn, so nothing may be subtracted here.
        hi = _mm_permute_ps(loIn, Dst4 + Dst5 * 4 + Dst6 * 16 + Dst7 * 64);
    } else if (Dst4 < X4 && Dst5 < X4 && Dst6 >= X4 && Dst7 >= X4) {
        hi = shuffle<Dst4, Dst5, Dst6 - X4 + Y0, Dst7 - X4 + Y0>(loIn, hiIn);
    } else if (Dst4 >= X4 && Dst5 >= X4 && Dst6 < X4 && Dst7 < X4) {
        hi = shuffle<Dst4 - X4, Dst5 - X4, Dst6 + Y0, Dst7 + Y0>(hiIn, loIn);
    } else if (Dst4 == X0 && Dst5 == X4 && Dst6 == X1 && Dst7 == X5) {
        hi = _mm_unpacklo_ps(loIn, hiIn);
    } else if (Dst4 == X4 && Dst5 == X0 && Dst6 == X5 && Dst7 == X1) {
        hi = _mm_unpacklo_ps(hiIn, loIn);
    } else if (Dst4 == X2 && Dst5 == X6 && Dst6 == X3 && Dst7 == X7) {
        hi = _mm_unpackhi_ps(loIn, hiIn);
    } else if (Dst4 == X6 && Dst5 == X2 && Dst6 == X7 && Dst7 == X3) {
        hi = _mm_unpackhi_ps(hiIn, loIn);
    } else if (Dst4 % X4 == 0 && Dst5 % X4 == 1 && Dst6 % X4 == 2 && Dst7 % X4 == 3) {
        hi = blend<ScaleForBlend<Dst4>::Value, ScaleForBlend<Dst5>::Value,
                   ScaleForBlend<Dst6>::Value, ScaleForBlend<Dst7>::Value>(loIn, hiIn);
    }

    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
}
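
// Usage sketch (illustrative): swap the two 128-bit halves at float
// granularity; the pattern is cross-lane, so it is synthesized from the two
// half permutes above:
//
//     __m256 r = Mem::permute<X4, X5, X6, X7, X0, X1, X2, X3>(x);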
}  // namespace Mem
}  // namespace Vc_VERSIONED_NAMESPACE
namespace Vc_VERSIONED_NAMESPACE
{
// The Reg variants mirror the Mem interface but take their template arguments
// in register order (highest position first) instead of memory order.
namespace Reg
{
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute128(__m256 x, __m256 y) {
    static_assert(L >= X0 && H >= X0, "Incorrect_Range");
    static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
    return _mm256_permute2f128_ps(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256i Vc_CONST permute128(__m256i x, __m256i y) {
    static_assert(L >= X0 && H >= X0, "Incorrect_Range");
    static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
#ifdef Vc_IMPL_AVX2
    return _mm256_permute2x128_si256(
        x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#else
    return _mm256_permute2f128_si256(
        x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
#endif
}
template<VecPos H, VecPos L> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute128(__m256d x, __m256d y) {
    static_assert(L >= X0 && H >= X0, "Incorrect_Range");
    static_assert(L <= Y1 && H <= Y1, "Incorrect_Range");
    return _mm256_permute2f128_pd(x, y, (L < Y0 ? L : L - Y0 + 2) + (H < Y0 ? H : H - Y0 + 2) * (1 << 4));
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST permute(__m256d x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X2 && Dst3 >= X2, "Incorrect_Range");
    static_assert(Dst0 <= X1 && Dst1 <= X1 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm256_permute_pd(x, Dst0 + Dst1 * 2 + (Dst2 - X2) * 4 + (Dst3 - X2) * 8);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST permute(__m256 x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm256_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST permute(__m128d x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
    static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
    return _mm_permute_pd(x, Dst0 + Dst1 * 2);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
    return _mm_permute_ps(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256d Vc_CONST shuffle(__m256d x, __m256d y) {
    static_assert(Dst0 >= X0 && Dst1 >= Y0 && Dst2 >= X2 && Dst3 >= Y2, "Incorrect_Range");
    static_assert(Dst0 <= X1 && Dst1 <= Y1 && Dst2 <= X3 && Dst3 <= Y3, "Incorrect_Range");
    return _mm256_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2 + (Dst2 - X2) * 4 + (Dst3 - Y2) * 8);
}
template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m256 Vc_CONST shuffle(__m256 x, __m256 y) {
    static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
    static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
    return _mm256_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
}
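
// Usage sketch (illustrative): in register order the leftmost argument names
// the highest element, so reversing a __m128 reads naturally:
//
//     __m128 r = Reg::permute<X0, X1, X2, X3>(v);  // r = [v3, v2, v1, v0]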
}  // namespace Reg
}  // namespace Vc_VERSIONED_NAMESPACE

#endif  // VC_AVX_SHUFFLE_H_