File indexing completed on 2025-01-31 10:25:44
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028 #ifndef VC_SSE_SHUFFLE_H_
0029 #define VC_SSE_SHUFFLE_H_
0030
0031 #include "intrinsics.h"
0032 #include "macros.h"
0033
0034 namespace Vc_VERSIONED_NAMESPACE
0035 {
// Lane identifiers used as template arguments by the shuffle/permute/blend
// helpers below. X0..X7 name the lanes of the first operand, Y0..Y7 the lanes
// of the second operand. Y0 == 8, so "comes from the second operand" can be
// tested with >= Y0, and a Y-position maps back to a lane index as (Dst - Y0).
enum VecPos {
    X0, X1, X2, X3, X4, X5, X6, X7,
    Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7,
    Const0
};
0041
0042 namespace Mem
0043 {
0044
0045 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
0046 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
0047 static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
0048 return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
0049 }
0050
0051
0052 template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
0053 static_assert(Dst0 >= X0 && Dst1 >= Y0, "Incorrect_Range");
0054 static_assert(Dst0 <= X1 && Dst1 <= Y1, "Incorrect_Range");
0055 return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2);
0056 }
0057
0058
0059 template <VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3>
0060 Vc_INTRINSIC Vc_CONST __m128i shuffle(__m128i x, __m128i y)
0061 {
0062 return _mm_castps_si128(shuffle<Dst0, Dst1, Dst2, Dst3>(_mm_castsi128_ps(x),
0063 _mm_castsi128_ps(y)));
0064 }
0065
0066
0067 template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
0068 static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
0069 static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
0070 return Vc::SseIntrinsics::blend_pd<(Dst0 / Y0) + (Dst1 / Y0) * 2>(x, y);
0071 }
0072
0073
0074 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
0075 static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
0076 static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
0077 static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
0078 static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
0079 return Vc::SseIntrinsics::blend_ps<(Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
0080 (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8>(x, y);
0081 }
0082
0083 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
0084 static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) {
0085 static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
0086 static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
0087 static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
0088 static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
0089 static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
0090 static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
0091 static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
0092 static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
0093 return Vc::SseIntrinsics::blend_epi16<
0094 (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
0095 (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + (Dst6 / Y6) * 64 +
0096 (Dst7 / Y7) * 128>(x, y);
0097 }
0098
0099
0100 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
0101 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
0102 static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
0103 return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
0104 }
0105
0106 template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE Vc_CONST __m128d permute(__m128d x) {
0107 static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
0108 static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
0109 return _mm_shuffle_pd(x, x, Dst0 + Dst1 * 4);
0110 }
0111
0112 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
0113 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
0114 static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
0115 return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
0116 }
0117
0118 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) {
0119 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
0120 static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
0121 return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
0122 }
0123
0124 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) {
0125 static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range");
0126 static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range");
0127 return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
0128 }
0129
0130 template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
0131 static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
0132 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
0133 static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
0134 static_assert(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, "Incorrect_Range");
0135 static_assert(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, "Incorrect_Range");
0136 if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) {
0137 x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
0138 }
0139 if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) {
0140 x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
0141 }
0142 return x;
0143 }
0144 }
0145
0146
0147 namespace Reg
0148 {
0149
0150 template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
0151 return Mem::shuffle<Dst0, Dst1, Dst2, Dst3>(x, y);
0152 }
0153
0154
0155 template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
0156 return Mem::shuffle<Dst0, Dst1>(x, y);
0157 }
0158
0159
0160 template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
0161 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
0162 static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
0163 return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
0164 }
0165
0166
0167 template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) {
0168 static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
0169 static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
0170 return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64));
0171 }
0172
0173
0174 template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
0175 return Mem::blend<Dst0, Dst1>(x, y);
0176 }
0177
0178 template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
0179 return Mem::blend<Dst0, Dst1, Dst2, Dst3>(x, y);
0180 }
0181 }
0182 }
0183
0184 #endif