Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-31 10:25:31

0001 /*  This file is part of the Vc library. {{{
0002 Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
0003 
0004 Redistribution and use in source and binary forms, with or without
0005 modification, are permitted provided that the following conditions are met:
0006     * Redistributions of source code must retain the above copyright
0007       notice, this list of conditions and the following disclaimer.
0008     * Redistributions in binary form must reproduce the above copyright
0009       notice, this list of conditions and the following disclaimer in the
0010       documentation and/or other materials provided with the distribution.
0011     * Neither the names of contributing organizations nor the
0012       names of its contributors may be used to endorse or promote products
0013       derived from this software without specific prior written permission.
0014 
0015 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
0016 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
0017 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
0018 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
0019 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
0020 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
0021 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
0022 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
0023 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
0024 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
0025 
0026 }}}*/
0027 
0028 #ifndef VC_AVX_CASTS_H_
0029 #define VC_AVX_CASTS_H_
0030 
0031 #include "intrinsics.h"
0032 #include "types.h"
0033 #include "../sse/casts.h"
0034 #include "shuffle.h"
0035 #include "macros.h"
0036 
0037 namespace Vc_VERSIONED_NAMESPACE
0038 {
0039 namespace AVX
0040 {
0041 namespace Casts
0042 {
    // avx_cast<T>(v): zero-cost reinterpretation (bit cast) between SIMD
    // register types. Declared here for all six source register types; every
    // (source, destination) pair is implemented by a specialization below.
    template<typename T> Vc_INTRINSIC_L T avx_cast(__m128  v) Vc_INTRINSIC_R;
    template<typename T> Vc_INTRINSIC_L T avx_cast(__m128i v) Vc_INTRINSIC_R;
    template<typename T> Vc_INTRINSIC_L T avx_cast(__m128d v) Vc_INTRINSIC_R;
    template<typename T> Vc_INTRINSIC_L T avx_cast(__m256  v) Vc_INTRINSIC_R;
    template<typename T> Vc_INTRINSIC_L T avx_cast(__m256i v) Vc_INTRINSIC_R;
    template<typename T> Vc_INTRINSIC_L T avx_cast(__m256d v) Vc_INTRINSIC_R;
0049 
    // 128 -> 128: same register width, only the type changes; the _mm_cast*
    // intrinsics are type-level reinterpretations and emit no instructions.
    template<> Vc_INTRINSIC __m128  avx_cast(__m128  v) { return v; }
    template<> Vc_INTRINSIC __m128  avx_cast(__m128i v) { return _mm_castsi128_ps(v); }
    template<> Vc_INTRINSIC __m128  avx_cast(__m128d v) { return _mm_castpd_ps(v); }
    template<> Vc_INTRINSIC __m128i avx_cast(__m128  v) { return _mm_castps_si128(v); }
    template<> Vc_INTRINSIC __m128i avx_cast(__m128i v) { return v; }
    template<> Vc_INTRINSIC __m128i avx_cast(__m128d v) { return _mm_castpd_si128(v); }
    template<> Vc_INTRINSIC __m128d avx_cast(__m128  v) { return _mm_castps_pd(v); }
    template<> Vc_INTRINSIC __m128d avx_cast(__m128i v) { return _mm_castsi128_pd(v); }
    template<> Vc_INTRINSIC __m128d avx_cast(__m128d v) { return v; }
0060 
    // 128 -> 256: widen; the low 128 bits hold v, the upper 128 bits are
    // formally undefined (use zeroExtend() below when zeros are required).
    // FIXME: the following casts leave the upper 128bits undefined. With GCC and ICC I've never
    // seen the cast not do what I want though: after a VEX-coded SSE instruction the register's
    // upper 128bits are zero. Thus using the same register as AVX register will have the upper
    // 128bits zeroed. MSVC, though, implements _mm256_castxx128_xx256 with a 128bit move to memory
    // + 256bit load. Thus the upper 128bits are really undefined. But there is no intrinsic to do
    // what I want (i.e. alias the register, disallowing the move to memory in-between). I'm stuck,
    // do we really want to rely on specific compiler behavior here?
    template<> Vc_INTRINSIC __m256  avx_cast(__m128  v) { return _mm256_castps128_ps256(v); }
    template<> Vc_INTRINSIC __m256  avx_cast(__m128i v) { return _mm256_castps128_ps256(_mm_castsi128_ps(v)); }
    template<> Vc_INTRINSIC __m256  avx_cast(__m128d v) { return _mm256_castps128_ps256(_mm_castpd_ps(v)); }
    template<> Vc_INTRINSIC __m256i avx_cast(__m128  v) { return _mm256_castsi128_si256(_mm_castps_si128(v)); }
    template<> Vc_INTRINSIC __m256i avx_cast(__m128i v) { return _mm256_castsi128_si256(v); }
    template<> Vc_INTRINSIC __m256i avx_cast(__m128d v) { return _mm256_castsi128_si256(_mm_castpd_si128(v)); }
    template<> Vc_INTRINSIC __m256d avx_cast(__m128  v) { return _mm256_castpd128_pd256(_mm_castps_pd(v)); }
    template<> Vc_INTRINSIC __m256d avx_cast(__m128i v) { return _mm256_castpd128_pd256(_mm_castsi128_pd(v)); }
    template<> Vc_INTRINSIC __m256d avx_cast(__m128d v) { return _mm256_castpd128_pd256(v); }
0078 
#if defined Vc_MSVC || defined Vc_CLANG || defined Vc_APPLECLANG
    // zeroExtend: 128 -> 256 widening with the upper 128 bits guaranteed to be
    // zero. MSVC (and, apparently, clang) may implement the plain cast
    // intrinsics with a memory round trip that leaves the upper half undefined
    // (see the FIXME on the 128 -> 256 casts), so permute2f128 with control
    // 0x80 is used: bit 7 of the imm8 zeroes the upper lane, bits [1:0] = 0
    // select v's low lane for the lower half.
    static Vc_INTRINSIC Vc_CONST __m256  zeroExtend(__m128  v) { return _mm256_permute2f128_ps   (_mm256_castps128_ps256(v), _mm256_castps128_ps256(v), 0x80); }
    static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_permute2f128_si256(_mm256_castsi128_si256(v), _mm256_castsi128_si256(v), 0x80); }
    static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_permute2f128_pd   (_mm256_castpd128_pd256(v), _mm256_castpd128_pd256(v), 0x80); }
#else
    // GCC/ICC: after a VEX-encoded 128-bit instruction the upper half of the
    // ymm register is already zero, so the no-op cast suffices here.
    static Vc_INTRINSIC Vc_CONST __m256  zeroExtend(__m128  v) { return _mm256_castps128_ps256(v); }
    static Vc_INTRINSIC Vc_CONST __m256i zeroExtend(__m128i v) { return _mm256_castsi128_si256(v); }
    static Vc_INTRINSIC Vc_CONST __m256d zeroExtend(__m128d v) { return _mm256_castpd128_pd256(v); }
#endif
0088 
    // 256 -> 128: keep only the lower 128 bits; the cast intrinsics drop the
    // upper half at the type level only (the xmm register aliases the low half
    // of the ymm register).
    template<> Vc_INTRINSIC __m128  avx_cast(__m256  v) { return _mm256_castps256_ps128(v); }
    template<> Vc_INTRINSIC __m128  avx_cast(__m256i v) { return _mm256_castps256_ps128(_mm256_castsi256_ps(v)); }
    template<> Vc_INTRINSIC __m128  avx_cast(__m256d v) { return _mm256_castps256_ps128(_mm256_castpd_ps(v)); }
    template<> Vc_INTRINSIC __m128i avx_cast(__m256  v) { return _mm256_castsi256_si128(_mm256_castps_si256(v)); }
    template<> Vc_INTRINSIC __m128i avx_cast(__m256i v) { return _mm256_castsi256_si128(v); }
    template<> Vc_INTRINSIC __m128i avx_cast(__m256d v) { return _mm256_castsi256_si128(_mm256_castpd_si256(v)); }
    template<> Vc_INTRINSIC __m128d avx_cast(__m256  v) { return _mm256_castpd256_pd128(_mm256_castps_pd(v)); }
    template<> Vc_INTRINSIC __m128d avx_cast(__m256i v) { return _mm256_castpd256_pd128(_mm256_castsi256_pd(v)); }
    template<> Vc_INTRINSIC __m128d avx_cast(__m256d v) { return _mm256_castpd256_pd128(v); }
0099 
    // 256 -> 256: pure reinterpretations between float/int/double register
    // types; no instructions are emitted.
    template<> Vc_INTRINSIC __m256  avx_cast(__m256  v) { return v; }
    template<> Vc_INTRINSIC __m256  avx_cast(__m256i v) { return _mm256_castsi256_ps(v); }
    template<> Vc_INTRINSIC __m256  avx_cast(__m256d v) { return _mm256_castpd_ps(v); }
    template<> Vc_INTRINSIC __m256i avx_cast(__m256  v) { return _mm256_castps_si256(v); }
    template<> Vc_INTRINSIC __m256i avx_cast(__m256i v) { return v; }
    template<> Vc_INTRINSIC __m256i avx_cast(__m256d v) { return _mm256_castpd_si256(v); }
    template<> Vc_INTRINSIC __m256d avx_cast(__m256  v) { return _mm256_castps_pd(v); }
    template<> Vc_INTRINSIC __m256d avx_cast(__m256i v) { return _mm256_castsi256_pd(v); }
    template<> Vc_INTRINSIC __m256d avx_cast(__m256d v) { return v; }
0110 
    // simplify splitting 256-bit registers in 128-bit registers
    // lo128: type-level cast to the lower half; hi128: extract the upper
    // 128-bit half via the project's extract128<1> helper.
    Vc_INTRINSIC Vc_CONST __m128  lo128(__m256  v) { return avx_cast<__m128>(v); }
    Vc_INTRINSIC Vc_CONST __m128d lo128(__m256d v) { return avx_cast<__m128d>(v); }
    Vc_INTRINSIC Vc_CONST __m128i lo128(__m256i v) { return avx_cast<__m128i>(v); }
    Vc_INTRINSIC Vc_CONST __m128  hi128(__m256  v) { return extract128<1>(v); }
    Vc_INTRINSIC Vc_CONST __m128d hi128(__m256d v) { return extract128<1>(v); }
    Vc_INTRINSIC Vc_CONST __m128i hi128(__m256i v) { return extract128<1>(v); }
0118 
    // simplify combining 128-bit registers in 256-bit registers
    // concat(a, b): a becomes the lower 128 bits, b is inserted as the upper
    // 128 bits (the upper half left undefined by avx_cast is overwritten).
    Vc_INTRINSIC Vc_CONST __m256  concat(__m128  a, __m128  b) { return insert128<1>(avx_cast<__m256 >(a), b); }
    Vc_INTRINSIC Vc_CONST __m256d concat(__m128d a, __m128d b) { return insert128<1>(avx_cast<__m256d>(a), b); }
    Vc_INTRINSIC Vc_CONST __m256i concat(__m128i a, __m128i b) { return insert128<1>(avx_cast<__m256i>(a), b); }
0123 
0124 }  // namespace Casts
0125 using namespace Casts;
0126 }  // namespace AVX
0127 
0128 namespace AVX2
0129 {
0130 using namespace AVX::Casts;
0131 }  // namespace AVX2
0132 
0133 namespace AVX
0134 {
// Tag type used to select the convert() overload for a (From, To) scalar-type
// pair; carries no data.
template <typename From, typename To> struct ConvertTag {};
0136 
// convert(v, ConvertTag<From, To>()): value conversion (as opposed to the bit
// casts above) from a vector of From to a vector of To.
Vc_INTRINSIC __m256i convert(__m256  v, ConvertTag<float , int>) { return _mm256_cvttps_epi32(v); }  // truncating float -> int
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, int>) { return _mm256_cvttpd_epi32(v); }  // 4 doubles -> 4 ints
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int   , int>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint  , int>) { return v; }  // same bit pattern (modular)
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , int>) {
#ifdef Vc_IMPL_AVX2
    return _mm256_cvtepi16_epi32(v);  // hardware sign extension
#else
    // Unpacking v with itself duplicates each 16-bit element into a 32-bit
    // slot; the arithmetic shift right by 16 then sign-extends.
    return AVX::srai_epi32<16>(
        concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, int>) {
#ifdef Vc_IMPL_AVX2
    return _mm256_cvtepu16_epi32(v);  // hardware zero extension
#else
    // Same unpack trick as above, but the logical shift right zero-extends.
    return AVX::srli_epi32<16>(
        concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
#endif
}
0157 
Vc_INTRINSIC __m256i convert(__m256  v, ConvertTag<float , uint>) {
    using namespace AVX;
    // cvttps_epi32 only produces signed results. For lanes with v >= 2^31
    // (cmpge mask) convert v - 2^31 instead and add 2^31 back in the integer
    // domain; blendv selects the corrected value for those lanes.
    return _mm256_castps_si256(_mm256_blendv_ps(
        _mm256_castsi256_ps(_mm256_cvttps_epi32(v)),
        _mm256_castsi256_ps(add_epi32(_mm256_cvttps_epi32(_mm256_sub_ps(v, set2power31_ps())),
                                      set2power31_epu32())),
        cmpge_ps(v, set2power31_ps())));
}
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, uint>) {
    using namespace AVX;
    // Shift the range down by 2^31 (after flooring, so the truncating
    // conversion is unaffected) to fit signed int32, then xor the sign bit
    // (adds 2^31 mod 2^32) to undo the shift in the unsigned result.
    return _mm_xor_si128(
        _mm256_cvttpd_epi32(_mm256_sub_pd(_mm256_floor_pd(v), set1_pd(0x80000000u))),
        _mm_set2power31_epu32());
}
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<int   , uint>) { return v; }  // same bit pattern (modular)
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<uint  , uint>) { return v; }
0174 Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<short , uint>) {
0175 #ifdef Vc_IMPL_AVX2
0176     return _mm256_cvtepi16_epi32(v);
0177 #else
0178     return AVX::srai_epi32<16>(
0179         concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
0180 #endif
0181 }
0182 Vc_INTRINSIC __m256i convert(__m128i v, ConvertTag<ushort, uint>) {
0183 #ifdef Vc_IMPL_AVX2
0184     return _mm256_cvtepu16_epi32(v);
0185 #else
0186     return AVX::srli_epi32<16>(
0187         concat(_mm_unpacklo_epi16(v, v), _mm_unpackhi_epi16(v, v)));
0188 #endif
0189 }
0190 
Vc_INTRINSIC __m256  convert(__m256  v, ConvertTag<float , float>) { return v; }
Vc_INTRINSIC __m128  convert(__m256d v, ConvertTag<double, float>) { return _mm256_cvtpd_ps(v); }  // 4 doubles -> 4 floats
Vc_INTRINSIC __m256  convert(__m256i v, ConvertTag<int   , float>) { return _mm256_cvtepi32_ps(v); }
Vc_INTRINSIC __m256  convert(__m256i v, ConvertTag<uint  , float>) {
    // this is complicated because cvtepi32_ps only supports signed input. Thus, all
    // input values with the MSB set would produce a negative result. We can reuse the
    // cvtepi32_ps instruction if we unset the MSB. But then the rounding results can be
    // different. Since float uses 24 bits for the mantissa (effectively), the 9-bit LSB
    // determines the rounding direction. (Consider the bits ...8'7654'3210. The bits [0:7]
    // need to be dropped and if > 0x80 round up, if < 0x80 round down. If [0:7] == 0x80
    // then the rounding direction is determined by bit [8] for round to even. That's why
    // the 9th bit is relevant for the rounding decision.)
    // If the MSB of the input is set to 0, the cvtepi32_ps instruction makes its rounding
    // decision on the lowest 8 bits instead. A second rounding decision is made when
    // float(0x8000'0000) is added. This will rarely fix the rounding issue.
    //
    // Here's what the standard rounding mode expects:
    // 0xc0000080 should cvt to 0xc0000000
    // 0xc0000081 should cvt to 0xc0000100
    //     --     should cvt to 0xc0000100
    // 0xc000017f should cvt to 0xc0000100
    // 0xc0000180 should cvt to 0xc0000200
    //
    // However: using float(input ^ 0x8000'0000) + float(0x8000'0000) we get:
    // 0xc0000081 would cvt to 0xc0000000
    // 0xc00000c0 would cvt to 0xc0000000
    // 0xc00000c1 would cvt to 0xc0000100
    // 0xc000013f would cvt to 0xc0000100
    // 0xc0000140 would cvt to 0xc0000200
    //
    // Solution: float(input & 0x7fff'fe00) + (float(0x8000'0000) + float(input & 0x1ff))
    // This ensures the rounding decision is made on the 9-bit LSB when 0x8000'0000 is
    // added to the float value of the low 8 bits of the input.
    using namespace AVX;
    return _mm256_blendv_ps(
        _mm256_cvtepi32_ps(v),  // exact/correctly rounded when MSB is clear
        _mm256_add_ps(_mm256_cvtepi32_ps(and_si256(v, set1_epi32(0x7ffffe00))),
                      _mm256_add_ps(set2power31_ps(), _mm256_cvtepi32_ps(and_si256(
                                                          v, set1_epi32(0x000001ff))))),
        _mm256_castsi256_ps(cmplt_epi32(v, _mm256_setzero_si256())));  // lanes with MSB set
}
Vc_INTRINSIC __m256  convert(__m128i v, ConvertTag<short , float>) { return _mm256_cvtepi32_ps(convert(v, ConvertTag< short, int>())); }  // via int (exact)
Vc_INTRINSIC __m256  convert(__m128i v, ConvertTag<ushort, float>) { return _mm256_cvtepi32_ps(convert(v, ConvertTag<ushort, int>())); }  // via int (exact)
0234 
Vc_INTRINSIC __m256d convert(__m128  v, ConvertTag<float , double>) { return _mm256_cvtps_pd(v); }  // 4 floats -> 4 doubles
Vc_INTRINSIC __m256d convert(__m256d v, ConvertTag<double, double>) { return v; }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<int   , double>) { return _mm256_cvtepi32_pd(v); }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<uint  , double>) {
    using namespace AVX;
    // cvtepi32_pd is signed only: xor with setmin (the sign bit) maps the
    // input range down by 2^31 (mod 2^32); convert, then add 2^31.0 to
    // restore the unsigned value. Exact, since double has > 32 mantissa bits.
    return _mm256_add_pd(
        _mm256_cvtepi32_pd(_mm_xor_si128(v, _mm_setmin_epi32())),
        set1_pd(1u << 31)); }
// short/ushort -> double go via the SSE 128-bit short -> int conversion.
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<short , double>) { return convert(convert(v, SSE::ConvertTag< short, int>()), ConvertTag<int, double>()); }
Vc_INTRINSIC __m256d convert(__m128i v, ConvertTag<ushort, double>) { return convert(convert(v, SSE::ConvertTag<ushort, int>()), ConvertTag<int, double>()); }
0245 
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int   , short>) {
#ifdef Vc_IMPL_AVX2
    // Within each 128-bit lane, gather the low 16 bits of every 32-bit
    // element into the lane's lower 64 bits (the -0x80 shuffle indices zero
    // the rest), then permute qwords so the two valid qwords are adjacent.
    auto a = _mm256_shuffle_epi8(
        v, _mm256_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80, -0x80, -0x80, -0x80,
                            -0x80, -0x80, -0x80, 0, 1, 4, 5, 8, 9, 12, 13, -0x80, -0x80,
                            -0x80, -0x80, -0x80, -0x80, -0x80, -0x80));
    return lo128(_mm256_permute4x64_epi64(a, 0xf8));  // a[0] a[2] | a[3] a[3]
#else
    // Unpack network that ends with the low 16 bits of the eight 32-bit
    // elements in order: [lo(v0) lo(v1) ... lo(v7)] (truncation).
    const auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
    const auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
    const auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
    const auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
    return _mm_unpacklo_epi16(tmp2, tmp3);
#endif
}
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint  , short>) { return convert(v, ConvertTag<int, short>()); }  // same truncation
Vc_INTRINSIC __m128i convert(__m256  v, ConvertTag<float , short>) { return convert(convert(v, ConvertTag<float, int>()), ConvertTag<int, short>()); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, short>) { return convert(convert(v, ConvertTag<double, int>()), SSE::ConvertTag<int, short>()); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , short>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, short>) { return v; }  // same bit pattern (modular)
0266 
Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<int   , ushort>) {
    // Truncate each 32-bit element to its low 16 bits; the unpack network
    // yields them in order: [lo(v0) lo(v1) ... lo(v7)].
    auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
    auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
    auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
    auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
    return _mm_unpacklo_epi16(tmp2, tmp3);
}
0274 Vc_INTRINSIC __m128i convert(__m256i v, ConvertTag<uint  , ushort>) {
0275     auto tmp0 = _mm_unpacklo_epi16(lo128(v), hi128(v));
0276     auto tmp1 = _mm_unpackhi_epi16(lo128(v), hi128(v));
0277     auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
0278     auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
0279     return _mm_unpacklo_epi16(tmp2, tmp3);
0280 }
// float/double -> ushort go through the unsigned integer conversion so values
// >= 2^31 in float are handled; short/ushort -> ushort keep the bit pattern.
Vc_INTRINSIC __m128i convert(__m256  v, ConvertTag<float , ushort>) { return convert(convert(v, ConvertTag<float, uint>()), ConvertTag<uint, ushort>()); }
Vc_INTRINSIC __m128i convert(__m256d v, ConvertTag<double, ushort>) { return convert(convert(v, ConvertTag<double, uint>()), SSE::ConvertTag<uint, ushort>()); }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<short , ushort>) { return v; }
Vc_INTRINSIC __m256i convert(__m256i v, ConvertTag<ushort, ushort>) { return v; }
0285 
// Generic entry point: convert<From, To>(v). The parameter register type is
// chosen from the element sizes: a widening conversion (sizeof(From) <
// sizeof(To)) takes an SSE (128-bit) vector of From, everything else takes
// the full AVX vector of From. Dispatches to the tagged overloads above.
template <typename From, typename To>
Vc_INTRINSIC auto convert(
    typename std::conditional<(sizeof(From) < sizeof(To)),
                              typename SSE::VectorTraits<From>::VectorType,
                              typename AVX::VectorTypeHelper<From>::Type>::type v)
    -> decltype(convert(v, ConvertTag<From, To>()))
{
    return convert(v, ConvertTag<From, To>());
}

// Overload for widening conversions that are nevertheless handed a full AVX
// register: only the lower 128 bits are converted.
template <typename From, typename To, typename = enable_if<(sizeof(From) < sizeof(To))>>
Vc_INTRINSIC auto convert(typename AVX::VectorTypeHelper<From>::Type v)
    -> decltype(convert(lo128(v), ConvertTag<From, To>()))
{
    return convert(lo128(v), ConvertTag<From, To>());
}
0302 }  // namespace AVX
0303 }  // namespace Vc
0304 
0305 #endif // VC_AVX_CASTS_H_