0001 /* This file is part of the Vc library. {{{
0002 Copyright © 2010-2015 Matthias Kretz <kretz@kde.org>
0003
0004 Redistribution and use in source and binary forms, with or without
0005 modification, are permitted provided that the following conditions are met:
0006 * Redistributions of source code must retain the above copyright
0007 notice, this list of conditions and the following disclaimer.
0008 * Redistributions in binary form must reproduce the above copyright
0009 notice, this list of conditions and the following disclaimer in the
0010 documentation and/or other materials provided with the distribution.
0011 * Neither the names of contributing organizations nor the
0012 names of its contributors may be used to endorse or promote products
0013 derived from this software without specific prior written permission.
0014
0015 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
0016 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
0017 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0018 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
0019 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
0020 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
0021 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0022 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
0023 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
0024 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0025
0026 }}}*/
0027
0028 namespace Vc_VERSIONED_NAMESPACE
0029 {
0030 namespace AVX2
0031 {
0032
// In-register deinterleave of 12 stride-3 interleaved doubles. On entry
// (derived from the shuffle masks below):
//   a = [a0 b0 c0 a1], b = [b1 c1 a2 b2], c = [c2 a3 b3 c3]
// On exit: a = [a0..a3], b = [b0..b3], c = [c0..c3].
inline void deinterleave(double_v &Vc_RESTRICT a, double_v &Vc_RESTRICT b, double_v &Vc_RESTRICT c)
{ // estimated latency (AVX): 4.5 cycles
    // Pair up the 128-bit halves so that each tmp holds two elements of two
    // streams: tmp0 = [a0 b0 | a2 b2], tmp1 = [c0 a1 | c2 a3], tmp2 = [b1 c1 | b3 c3]
    const m256d tmp0 = Mem::shuffle128<X0, Y1>(a.data(), b.data());
    const m256d tmp1 = Mem::shuffle128<X1, Y0>(a.data(), c.data());
    const m256d tmp2 = Mem::shuffle128<X0, Y1>(b.data(), c.data());
    // One in-lane shuffle per output collects each stream in ascending order.
    a.data() = Mem::shuffle<X0, Y1, X2, Y3>(tmp0, tmp1);
    b.data() = Mem::shuffle<X1, Y0, X3, Y2>(tmp0, tmp2);
    c.data() = Mem::shuffle<X0, Y1, X2, Y3>(tmp1, tmp2);
}
0042
// In-register deinterleave of 24 stride-3 interleaved floats spread over
// three registers (layout shown below). On exit a/b/c each hold one stream
// in ascending element order.
inline void deinterleave(float_v &Vc_RESTRICT a, float_v &Vc_RESTRICT b, float_v &Vc_RESTRICT c)
{
    // abc abc abc
    // a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121
    // b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211
    // c = [b5 c5 a6 b6 c6 a7 b7 c7] 233 = 121+112
    const m256 ac0 = Mem::shuffle128<X0, Y0>(a.data(), c.data()); // a0 b0 c0 a1 b5 c5 a6 b6
    const m256 ac1 = Mem::shuffle128<X1, Y1>(a.data(), c.data()); // b1 c1 a2 b2 c6 a7 b7 c7

    // Two blends per output collect the lanes that belong to one stream; the
    // element order inside each output is still permuted at this point.
    m256 tmp0 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>( ac0, b.data());
    tmp0 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(tmp0, ac1); // a0 a3 a2 a1 a4 a7 a6 a5
    m256 tmp1 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>( ac0, b.data());
    tmp1 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(tmp1, ac1); // b1 b0 b3 b2 b5 b4 b7 b6
    m256 tmp2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>( ac0, b.data());
    tmp2 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>(tmp2, ac1); // c2 c1 c0 c3 c6 c5 c4 c7

    // Final in-lane permutes restore ascending order (cf. the annotations above).
    a.data() = Mem::permute<X0, X3, X2, X1>(tmp0);
    b.data() = Mem::permute<X1, X0, X3, X2>(tmp1);
    c.data() = Mem::permute<X2, X1, X0, X3>(tmp2);
}
0063
0064 inline void deinterleave(int_v &Vc_RESTRICT a, int_v &Vc_RESTRICT b, int_v &Vc_RESTRICT c)
0065 {
0066 deinterleave(reinterpret_cast<float_v &>(a), reinterpret_cast<float_v &>(b),
0067 reinterpret_cast<float_v &>(c));
0068 }
0069
0070 inline void deinterleave(uint_v &Vc_RESTRICT a, uint_v &Vc_RESTRICT b, uint_v &Vc_RESTRICT c)
0071 {
0072 deinterleave(reinterpret_cast<float_v &>(a), reinterpret_cast<float_v &>(b),
0073 reinterpret_cast<float_v &>(c));
0074 }
0075
// NOTE(review): NOT IMPLEMENTED — this overload is a silent no-op; the three
// vectors are returned unchanged, i.e. still interleaved. Callers get wrong
// data, not a compile error. The intended algorithm (an 8-element variant of
// the float_v code above, sketched for SSE registers) is kept in the TODO
// block below.
inline void deinterleave(Vector<short> &Vc_RESTRICT , Vector<short> &Vc_RESTRICT ,
                         Vector<short> &Vc_RESTRICT )
{
    return;
    /* TODO:
    // abc abc abc
    // a = [a0 b0 c0 a1 b1 c1 a2 b2] 332 = 211+121
    // b = [c2 a3 b3 c3 a4 b4 c4 a5] 323 = 112+211
    // c = [b5 c5 a6 b6 c6 a7 b7 c7] 233 = 121+112
    m128i ac0 = _mm_unpacklo_epi64(a.data(), c.data()); // a0 b0 c0 a1 b5 c5 a6 b6
    m128i ac1 = _mm_unpackhi_epi64(a.data(), c.data()); // b1 c1 a2 b2 c6 a7 b7 c7

    m128i tmp0 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>( ac0, b.data());
          tmp0 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>(tmp0, ac1); // a0 a3 a2 a1 a4 a7 a6 a5
    m128i tmp1 = Mem::blend<X0, X1, Y2, X3, X4, Y5, X6, X7>( ac0, b.data());
          tmp1 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>(tmp1, ac1); // b1 b0 b3 b2 b5 b4 b7 b6
    m128i tmp2 = Mem::blend<Y0, X1, X2, Y3, X4, X5, Y6, X7>( ac0, b.data());
          tmp2 = Mem::blend<X0, Y1, X2, X3, Y4, X5, X6, Y7>(tmp2, ac1); // c2 c1 c0 c3 c6 c5 c4 c7

    a.data() = Mem::permuteHi<X4, X7, X6, X5>(Mem::permuteLo<X0, X3, X2, X1>(tmp0));
    b.data() = Mem::permuteHi<X5, X4, X7, X6>(Mem::permuteLo<X1, X0, X3, X2>(tmp1));
    c.data() = Mem::permuteHi<X6, X5, X4, X7>(Mem::permuteLo<X2, X1, X0, X3>(tmp2));
    */
}
0100
0101 inline void deinterleave(Vector<unsigned short> &Vc_RESTRICT a, Vector<unsigned short> &Vc_RESTRICT b,
0102 Vector<unsigned short> &Vc_RESTRICT c)
0103 {
0104 deinterleave(reinterpret_cast<Vector<short> &>(a), reinterpret_cast<Vector<short> &>(b),
0105 reinterpret_cast<Vector<short> &>(c));
0106 }
0107
// In-register unzip of pairwise interleaved floats: on entry
// a = [a0 b0 a1 b1 a2 b2 a3 b3], b = [a4 b4 ... a7 b7]; on exit
// a = [a0..a7], b = [b0..b7]. (The trailing comments list elements
// high-to-low, matching Intel register notation.)
inline void deinterleave(Vector<float> &a, Vector<float> &b)
{
    // a7 a6 a5 a4 a3 a2 a1 a0
    // b7 b6 b5 b4 b3 b2 b1 b0
    const m256 tmp0 = Reg::permute128<Y0, X0>(a.data(), b.data()); // b3 b2 b1 b0 a3 a2 a1 a0
    const m256 tmp1 = Reg::permute128<Y1, X1>(a.data(), b.data()); // b7 b6 b5 b4 a7 a6 a5 a4

    // two rounds of per-128-bit-lane unpacking complete the unzip
    const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0
    const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2

    a.data() = _mm256_unpacklo_ps(tmp2, tmp3); // b6 b4 b2 b0 a6 a4 a2 a0
    b.data() = _mm256_unpackhi_ps(tmp2, tmp3); // b7 b5 b3 b1 a7 a5 a3 a1
}
0121
// In-register unzip of pairwise interleaved 16-bit elements: on entry
// a = [a0 b0 a1 b1 ... a7 b7], b = [a8 b8 ... a15 b15]; on exit
// a = [a0..a15], b = [b0..b15].
inline void deinterleave(Vector<short> &a, // a0 b0 a1 b1 a2 b2 a3 b3 | a4 b4 a5 ...
                         Vector<short> &b) // a8 b8 a9 ...
{
    // pair matching 128-bit halves so the epi16 unpacks (which work per
    // 128-bit lane on AVX2) see a consistent element order
    auto v0 = Mem::shuffle128<X0, Y0>(a.data(), b.data());
    auto v1 = Mem::shuffle128<X1, Y1>(a.data(), b.data());
    // three unpack rounds perform the 16-way unzip
    auto v2 = AVX::unpacklo_epi16(v0, v1); // a0 a4 ...
    auto v3 = AVX::unpackhi_epi16(v0, v1); // a2 a6 ...
    v0 = AVX::unpacklo_epi16(v2, v3); // a0 a2 ...
    v1 = AVX::unpackhi_epi16(v2, v3); // a1 a3 ...
    a.data() = AVX::unpacklo_epi16(v0, v1); // a0 a1 ...
    b.data() = AVX::unpackhi_epi16(v0, v1); // b0 b1 ...
}
0134
0135 inline void deinterleave(Vector<ushort> &a, Vector<ushort> &b)
0136 {
0137 auto v0 = Mem::shuffle128<X0, Y0>(a.data(), b.data());
0138 auto v1 = Mem::shuffle128<X1, Y1>(a.data(), b.data());
0139 auto v2 = AVX::unpacklo_epi16(v0, v1); // a0 a4 ...
0140 auto v3 = AVX::unpackhi_epi16(v0, v1); // a2 a6 ...
0141 v0 = AVX::unpacklo_epi16(v2, v3); // a0 a2 ...
0142 v1 = AVX::unpackhi_epi16(v2, v3); // a1 a3 ...
0143 a.data() = AVX::unpacklo_epi16(v0, v1); // a0 a1 ...
0144 b.data() = AVX::unpackhi_epi16(v0, v1); // b0 b1 ...
0145 }
0146
0147 } // namespace AVX2
0148 namespace Detail
0149 {
0150 template <typename Flags>
0151 inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const float *m, Flags align)
0152 {
0153 a.load(m, align);
0154 b.load(m + AVX2::float_v::Size, align);
0155 Vc::AVX2::deinterleave(a, b);
0156 }
0157
// Loads 2*float_v::Size interleaved signed shorts [a0 b0 a1 b1 ...] from m,
// deinterleaves them and converts to float. Each 32-bit lane of the loaded
// register holds one (a, b) pair: a in the low 16 bits, b in the high 16 bits.
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const short *m, Flags f)
{
    using namespace Vc::AVX2;
    const auto tmp = Detail::load32(m, f);
    // shift left then arithmetic-shift right: sign-extends the low 16 bits
    a.data() =
        _mm256_cvtepi32_ps(concat(_mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
                                  _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16)));
    // arithmetic-shift right: sign-extends the high 16 bits
    b.data() = _mm256_cvtepi32_ps(
        concat(_mm_srai_epi32(lo128(tmp), 16), _mm_srai_epi32(hi128(tmp), 16)));
}
0169
// Loads 2*float_v::Size interleaved unsigned shorts [a0 b0 a1 b1 ...] from m,
// deinterleaves them and converts to float. Unlike the signed overload above,
// the 16-bit values must be zero-extended, hence blend-with-zero / logical
// shifts instead of arithmetic shifts.
template <typename Flags>
inline void deinterleave(AVX2::float_v &a, AVX2::float_v &b, const unsigned short *m, Flags f)
{
    using namespace Vc::AVX2;
    const auto tmp = Detail::load32(m, f);
    // mask 0xaa zeroes every odd 16-bit position -> zero-extended a elements
    a.data() = _mm256_cvtepi32_ps(
        concat(_mm_blend_epi16(lo128(tmp), _mm_setzero_si128(), 0xaa),
               _mm_blend_epi16(hi128(tmp), _mm_setzero_si128(), 0xaa)));
    // logical right shift -> zero-extended b elements
    b.data() = _mm256_cvtepi32_ps(
        concat(_mm_srli_epi32(lo128(tmp), 16), _mm_srli_epi32(hi128(tmp), 16)));
}
0181
// Loads 2*double_v::Size interleaved doubles [a0 b0 a1 b1 ...] from m and
// splits them into a = [a0..a3], b = [b0..b3].
template <typename Flags>
inline void deinterleave(AVX2::double_v &a, AVX2::double_v &b, const double *m, Flags align)
{
    using namespace Vc::AVX2;

    a.load(m, align);
    b.load(m + AVX2::double_v::Size, align);

    // pair matching 128-bit halves, then unpack (operates per 128-bit lane)
    m256d tmp0 = Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()); // b1 b0 a1 a0
    m256d tmp1 = Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()); // b3 b2 a3 a2

    a.data() = _mm256_unpacklo_pd(tmp0, tmp1); // b2 b0 a2 a0
    b.data() = _mm256_unpackhi_pd(tmp0, tmp1); // b3 b1 a3 a1
}
0196
// Loads 2*int_v::Size interleaved ints [a0 b0 a1 b1 ...] from m and splits
// them into a and b. Uses float (ps) unpacks on the integer bits: they only
// move 32-bit lanes, so the reinterpretation via avx_cast is safe.
template <typename Flags>
inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const int *m, Flags align)
{
    using namespace AVX;
    a.load(m, align);
    b.load(m + AVX2::int_v::Size, align);

    // pair matching 128-bit halves first (unpacks work per 128-bit lane)
    const m256 tmp0 = avx_cast<m256>(Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()));
    const m256 tmp1 = avx_cast<m256>(Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()));

    const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0
    const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2

    a.data() = avx_cast<m256i>(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0
    b.data() = avx_cast<m256i>(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1
}
0213
// Loads 2*int_v::Size interleaved signed shorts [a0 b0 a1 b1 ...] from m,
// deinterleaves and sign-extends them to 32-bit ints. After the short_v load,
// each 32-bit lane holds one (a, b) pair: a in the low 16 bits, b in the high.
template <typename Flags>
inline void deinterleave(AVX2::int_v &a, AVX2::int_v &b, const short *m, Flags f)
{
    using namespace Vc::AVX;
    const AVX2::short_v tmp0(m, f);
    const m256i tmp = tmp0.data();
    // shift left then arithmetic-shift right: sign-extends the low 16 bits
    a.data() = concat(
        _mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
        _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16));
    // arithmetic-shift right: sign-extends the high 16 bits
    b.data() = concat(
        _mm_srai_epi32(lo128(tmp), 16),
        _mm_srai_epi32(hi128(tmp), 16));
}
0227
// Loads 2*uint_v::Size interleaved unsigned ints [a0 b0 a1 b1 ...] from m and
// splits them into a and b. Identical lane movement to the int_v overload:
// ps unpacks only move 32-bit lanes, so signedness does not matter here.
template <typename Flags>
inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned int *m, Flags align)
{
    using namespace AVX;
    a.load(m, align);
    b.load(m + AVX2::uint_v::Size, align);

    // pair matching 128-bit halves first (unpacks work per 128-bit lane)
    const m256 tmp0 = avx_cast<m256>(Mem::shuffle128<Vc::X0, Vc::Y0>(a.data(), b.data()));
    const m256 tmp1 = avx_cast<m256>(Mem::shuffle128<Vc::X1, Vc::Y1>(a.data(), b.data()));

    const m256 tmp2 = _mm256_unpacklo_ps(tmp0, tmp1); // b5 b1 b4 b0 a5 a1 a4 a0
    const m256 tmp3 = _mm256_unpackhi_ps(tmp0, tmp1); // b7 b3 b6 b2 a7 a3 a6 a2

    a.data() = avx_cast<m256i>(_mm256_unpacklo_ps(tmp2, tmp3)); // b6 b4 b2 b0 a6 a4 a2 a0
    b.data() = avx_cast<m256i>(_mm256_unpackhi_ps(tmp2, tmp3)); // b7 b5 b3 b1 a7 a5 a3 a1
}
0244
0245 template <typename Flags>
0246 inline void deinterleave(AVX2::uint_v &a, AVX2::uint_v &b, const unsigned short *m, Flags f)
0247 {
0248 using namespace Vc::AVX;
0249 const AVX2::ushort_v tmp0(m, f);
0250 const m256i tmp = tmp0.data();
0251 a.data() = concat(
0252 _mm_srai_epi32(_mm_slli_epi32(lo128(tmp), 16), 16),
0253 _mm_srai_epi32(_mm_slli_epi32(hi128(tmp), 16), 16));
0254 b.data() = concat(
0255 _mm_srai_epi32(lo128(tmp), 16),
0256 _mm_srai_epi32(hi128(tmp), 16));
0257 }
0258
0259 template <typename Flags>
0260 inline void deinterleave(AVX2::short_v &a, AVX2::short_v &b, const short *m, Flags align)
0261 {
0262 a.load(m, align);
0263 b.load(m + AVX2::short_v::Size, align);
0264 Vc::AVX2::deinterleave(a, b);
0265 }
0266
0267 template <typename Flags>
0268 inline void deinterleave(AVX2::ushort_v &a, AVX2::ushort_v &b, const unsigned short *m, Flags align)
0269 {
0270 a.load(m, align);
0271 b.load(m + AVX2::ushort_v::Size, align);
0272 Vc::AVX2::deinterleave(a, b);
0273 }
0274
0275 // only support M == V::EntryType -> no specialization
0276 template <typename T, typename M, typename Flags>
0277 Vc_ALWAYS_INLINE void deinterleave(AVX2::Vector<T> &Vc_RESTRICT a,
0278 AVX2::Vector<T> &Vc_RESTRICT b,
0279 AVX2::Vector<T> &Vc_RESTRICT c,
0280 const M *Vc_RESTRICT memory, Flags align)
0281 {
0282 using V = AVX2::Vector<T>;
0283 a.load(&memory[0 * V::Size], align);
0284 b.load(&memory[1 * V::Size], align);
0285 c.load(&memory[2 * V::Size], align);
0286 Vc::AVX2::deinterleave(a, b, c);
0287 }
0288
0289 } // namespace Detail
0290 } // namespace Vc