
EIC code displayed by LXR

/*  This file is part of the Vc library. {{{
Copyright © 2011-2015 Matthias Kretz <kretz@kde.org>

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the names of contributing organizations nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

}}}*/

#ifndef VC_SSE_SHUFFLE_H_
#define VC_SSE_SHUFFLE_H_

#include "intrinsics.h"
#include "macros.h"

namespace Vc_VERSIONED_NAMESPACE
{
    enum VecPos {
        X0, X1, X2, X3, X4, X5, X6, X7,
        Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7,
        Const0
    };
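
    // The enumerators are consecutive: X0..X7 evaluate to 0..7 and Y0..Y7 to
    // 8..15. The functions below rely on these values when building immediate
    // shuffle masks, e.g. (Dst2 - Y0) recovers a 0..3 lane index from a Y
    // position.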

namespace Mem
{
        // shuffle<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
            return _mm_shuffle_ps(x, y, Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64);
        }
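
        // Example (illustrative): memory order, element 0 listed first.
        //   __m128 a = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);  // [x0 x1 x2 x3]
        //   __m128 b = _mm_setr_ps(4.f, 5.f, 6.f, 7.f);  // [y0 y1 y2 y3]
        //   Mem::shuffle<X1, X2, Y0, Y2>(a, b);          // [1 2 4 6]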

        // shuffle<X1, Y0>([x0 x1], [y0 y1]) = [x1 y0]
        template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
            static_assert(Dst0 >= X0 && Dst1 >= Y0, "Incorrect_Range");
            static_assert(Dst0 <= X1 && Dst1 <= Y1, "Incorrect_Range");
            return _mm_shuffle_pd(x, y, Dst0 + (Dst1 - Y0) * 2);
        }
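
        // Example (illustrative):
        //   __m128d a = _mm_setr_pd(0., 1.);  // [x0 x1]
        //   __m128d b = _mm_setr_pd(2., 3.);  // [y0 y1]
        //   Mem::shuffle<X1, Y0>(a, b);       // [1 2]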

        // shuffle<X1, X2, Y0, Y2>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x1 x2 y0 y2]
        template <VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3>
        Vc_INTRINSIC Vc_CONST __m128i shuffle(__m128i x, __m128i y)
        {
            return _mm_castps_si128(shuffle<Dst0, Dst1, Dst2, Dst3>(_mm_castsi128_ps(x),
                                                                    _mm_castsi128_ps(y)));
        }

        // blend<X0, Y1>([x0 x1], [y0 y1]) = [x0 y1]
        template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
            static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
            static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
            return Vc::SseIntrinsics::blend_pd<(Dst0 / Y0) + (Dst1 / Y0) * 2>(x, y);
        }
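
        // Example (illustrative): blend only selects per lane, it cannot move
        // elements across lanes.
        //   Mem::blend<Y0, X1>(a, b);  // [y0 x1]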

        // blend<X0, Y1, X2, Y3>([x0 x1 x2 x3], [y0 y1 y2 y3]) = [x0 y1 x2 y3]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
            static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
            static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
            static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
            static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
            return Vc::SseIntrinsics::blend_ps<(Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 +
                                               (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8>(x, y);
        }
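
        // Example (illustrative):
        //   Mem::blend<X0, Y1, Y2, X3>(a, b);  // [x0 y1 y2 x3]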

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
        static Vc_ALWAYS_INLINE __m128i Vc_CONST blend(__m128i x, __m128i y) {
            static_assert(Dst0 == X0 || Dst0 == Y0, "Incorrect_Range");
            static_assert(Dst1 == X1 || Dst1 == Y1, "Incorrect_Range");
            static_assert(Dst2 == X2 || Dst2 == Y2, "Incorrect_Range");
            static_assert(Dst3 == X3 || Dst3 == Y3, "Incorrect_Range");
            static_assert(Dst4 == X4 || Dst4 == Y4, "Incorrect_Range");
            static_assert(Dst5 == X5 || Dst5 == Y5, "Incorrect_Range");
            static_assert(Dst6 == X6 || Dst6 == Y6, "Incorrect_Range");
            static_assert(Dst7 == X7 || Dst7 == Y7, "Incorrect_Range");
            return Vc::SseIntrinsics::blend_epi16<
                (Dst0 / Y0) * 1 + (Dst1 / Y1) * 2 + (Dst2 / Y2) * 4 + (Dst3 / Y3) * 8 +
                (Dst4 / Y4) * 16 + (Dst5 / Y5) * 32 + (Dst6 / Y6) * 64 +
                (Dst7 / Y7) * 128>(x, y);
        }
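
        // Example (illustrative): eight 16-bit lanes, alternating sources.
        //   Mem::blend<X0, Y1, X2, Y3, X4, Y5, X6, Y7>(a, b);
        //   // -> [x0 y1 x2 y3 x4 y5 x6 y7]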

        // permute<X1, X2, X0, X3>([x0 x1 x2 x3]) = [x1 x2 x0 x3]
        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128 Vc_CONST permute(__m128 x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm_shuffle_ps(x, x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

        template<VecPos Dst0, VecPos Dst1> static Vc_ALWAYS_INLINE Vc_CONST __m128d permute(__m128d x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X1 && Dst1 <= X1, "Incorrect_Range");
            // _mm_shuffle_pd takes a 2-bit immediate: bit 0 selects the low
            // element, bit 1 the high one, so the second index is scaled by 2.
            return _mm_shuffle_pd(x, x, Dst0 + Dst1 * 2);
        }

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }
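
        // Example (illustrative): reverse the four 32-bit elements.
        //   __m128i v = _mm_setr_epi32(10, 11, 12, 13);  // [x0 x1 x2 x3]
        //   Mem::permute<X3, X2, X1, X0>(v);             // [13 12 11 10]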

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteLo(__m128i x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3> static Vc_ALWAYS_INLINE __m128i Vc_CONST permuteHi(__m128i x) {
            static_assert(Dst0 >= X4 && Dst1 >= X4 && Dst2 >= X4 && Dst3 >= X4, "Incorrect_Range");
            static_assert(Dst0 <= X7 && Dst1 <= X7 && Dst2 <= X7 && Dst3 <= X7, "Incorrect_Range");
            return _mm_shufflehi_epi16(x, (Dst0 - X4) + (Dst1 - X4) * 4 + (Dst2 - X4) * 16 + (Dst3 - X4) * 64);
        }
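
        // Example (illustrative): permuteLo/permuteHi each touch only one
        // 64-bit half; the other half passes through unchanged.
        //   Mem::permuteHi<X7, X6, X5, X4>(v);  // [x0 x1 x2 x3 x7 x6 x5 x4]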

        template<VecPos Dst0, VecPos Dst1, VecPos Dst2, VecPos Dst3, VecPos Dst4, VecPos Dst5, VecPos Dst6, VecPos Dst7>
            static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            static_assert(Dst4 >= X4 && Dst5 >= X4 && Dst6 >= X4 && Dst7 >= X4, "Incorrect_Range");
            static_assert(Dst4 <= X7 && Dst5 <= X7 && Dst6 <= X7 && Dst7 <= X7, "Incorrect_Range");
            if (Dst0 != X0 || Dst1 != X1 || Dst2 != X2 || Dst3 != X3) {
                x = _mm_shufflelo_epi16(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
            }
            if (Dst4 != X4 || Dst5 != X5 || Dst6 != X6 || Dst7 != X7) {
                x = _mm_shufflehi_epi16(x, (Dst4 - X4) + (Dst5 - X4) * 4 + (Dst6 - X4) * 16 + (Dst7 - X4) * 64);
            }
            return x;
        }
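
        // Example (illustrative): words can only move within their own 64-bit
        // half, hence the X0..X3 / X4..X7 range restrictions above.
        //   Mem::permute<X1, X0, X3, X2, X5, X4, X7, X6>(v);
        //   // -> [x1 x0 x3 x2 x5 x4 x7 x6]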
}  // namespace Mem

    // The shuffles and permutes above use memory ordering (element 0 listed
    // first). The ones below use register ordering (highest element first):
namespace Reg
{
        // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST shuffle(__m128 x, __m128 y) {
            return Mem::shuffle<Dst0, Dst1, Dst2, Dst3>(x, y);
        }
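
        // Example (illustrative): Reg lists the high element first, so this is
        // the same operation as Mem::shuffle<X1, X2, Y0, Y2> above.
        //   Reg::shuffle<Y2, Y0, X2, X1>(a, b);  // register view [y2 y0 x2 x1]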

        // shuffle<Y0, X1>([x1 x0], [y1 y0]) = [y0 x1]
        template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST shuffle(__m128d x, __m128d y) {
            return Mem::shuffle<Dst0, Dst1>(x, y);
        }

        // permute<X3, X0, X2, X1>([x3 x2 x1 x0]) = [x3 x0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST permute(__m128i x) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= X0 && Dst3 >= X0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= X3 && Dst3 <= X3, "Incorrect_Range");
            return _mm_shuffle_epi32(x, Dst0 + Dst1 * 4 + Dst2 * 16 + Dst3 * 64);
        }

        // shuffle<Y2, Y0, X2, X1>([x3 x2 x1 x0], [y3 y2 y1 y0]) = [y2 y0 x2 x1]
        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128i Vc_CONST shuffle(__m128i x, __m128i y) {
            static_assert(Dst0 >= X0 && Dst1 >= X0 && Dst2 >= Y0 && Dst3 >= Y0, "Incorrect_Range");
            static_assert(Dst0 <= X3 && Dst1 <= X3 && Dst2 <= Y3 && Dst3 <= Y3, "Incorrect_Range");
            return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(x), _mm_castsi128_ps(y), Dst0 + Dst1 * 4 + (Dst2 - Y0) * 16 + (Dst3 - Y0) * 64));
        }

        // blend<Y1, X0>([x1 x0], [y1 y0]) = [y1 x0]
        template<VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128d Vc_CONST blend(__m128d x, __m128d y) {
            return Mem::blend<Dst0, Dst1>(x, y);
        }

        template<VecPos Dst3, VecPos Dst2, VecPos Dst1, VecPos Dst0> static Vc_ALWAYS_INLINE __m128 Vc_CONST blend(__m128 x, __m128 y) {
            return Mem::blend<Dst0, Dst1, Dst2, Dst3>(x, y);
        }
}  // namespace Reg
}  // namespace Vc

#endif // VC_SSE_SHUFFLE_H_