File indexing completed on 2025-02-28 10:25:31
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028 #ifndef VC_SSE_CASTS_H_
0029 #define VC_SSE_CASTS_H_
0030
0031 #include "intrinsics.h"
0032 #include "types.h"
0033 #include "macros.h"
0034
0035 namespace Vc_VERSIONED_NAMESPACE
0036 {
0037 namespace SSE
0038 {
0039 using uint = unsigned int;
0040 using ushort = unsigned short;
0041 using uchar = unsigned char;
0042 using schar = signed char;
0043
0044
0045 template <typename To, typename From> Vc_ALWAYS_INLINE Vc_CONST To sse_cast(From v)
0046 {
0047 return v;
0048 }
0049 template<> Vc_ALWAYS_INLINE Vc_CONST __m128i sse_cast<__m128i, __m128 >(__m128 v) { return _mm_castps_si128(v); }
0050 template<> Vc_ALWAYS_INLINE Vc_CONST __m128i sse_cast<__m128i, __m128d>(__m128d v) { return _mm_castpd_si128(v); }
0051 template<> Vc_ALWAYS_INLINE Vc_CONST __m128 sse_cast<__m128 , __m128d>(__m128d v) { return _mm_castpd_ps(v); }
0052 template<> Vc_ALWAYS_INLINE Vc_CONST __m128 sse_cast<__m128 , __m128i>(__m128i v) { return _mm_castsi128_ps(v); }
0053 template<> Vc_ALWAYS_INLINE Vc_CONST __m128d sse_cast<__m128d, __m128i>(__m128i v) { return _mm_castsi128_pd(v); }
0054 template<> Vc_ALWAYS_INLINE Vc_CONST __m128d sse_cast<__m128d, __m128 >(__m128 v) { return _mm_castps_pd(v); }
0055
0056
0057 template <typename From, typename To> struct ConvertTag
0058 {
0059 };
0060 template <typename From, typename To>
0061 Vc_INTRINSIC typename VectorTraits<To>::VectorType convert(
0062 typename VectorTraits<From>::VectorType v)
0063 {
0064 return convert(v, ConvertTag<From, To>());
0065 }
0066
0067 Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , int >) { return _mm_cvttps_epi32(v); }
0068 Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, int >) { return _mm_cvttpd_epi32(v); }
0069 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , int >) { return v; }
0070 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , int >) { return v; }
0071 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , int >) {
0072 #ifdef Vc_IMPL_SSE4_1
0073 return _mm_cvtepi16_epi32(v);
0074 #else
0075 return _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16);
0076 #endif
0077 }
0078 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, int >) {
0079 #ifdef Vc_IMPL_SSE4_1
0080 return _mm_cvtepu16_epi32(v);
0081 #else
0082 return _mm_srli_epi32(_mm_unpacklo_epi16(v, v), 16);
0083 #endif
0084 }
0085 Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , uint >) {
0086 return _mm_castps_si128(
0087 blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(v)),
0088 _mm_castsi128_ps(_mm_xor_si128(
0089 _mm_cvttps_epi32(_mm_sub_ps(v, _mm_set1_ps(1u << 31))),
0090 _mm_set1_epi32(1 << 31))),
0091 _mm_cmpge_ps(v, _mm_set1_ps(1u << 31))));
0092 }
0093 Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, uint >) {
0094 #ifdef Vc_IMPL_SSE4_1
0095 return _mm_xor_si128(_mm_cvttpd_epi32(_mm_sub_pd(_mm_floor_pd(v), _mm_set1_pd(0x80000000u))),
0096 _mm_cvtsi64_si128(0x8000000080000000ull));
0097 #else
0098 return blendv_epi8(_mm_cvttpd_epi32(v),
0099 _mm_xor_si128(_mm_cvttpd_epi32(_mm_sub_pd(v, _mm_set1_pd(0x80000000u))),
0100 _mm_cvtsi64_si128(0x8000000080000000ull)),
0101 _mm_castpd_si128(_mm_cmpge_pd(v, _mm_set1_pd(0x80000000u))));
0102 #endif
0103 }
0104 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , uint >) { return v; }
0105 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , uint >) { return v; }
0106 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , uint >) { return convert(v, ConvertTag<short, int>()); }
0107 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, uint >) { return convert(v, ConvertTag<ushort, int>()); }
0108 Vc_INTRINSIC __m128 convert(__m128 v, ConvertTag<float , float >) { return v; }
0109 Vc_INTRINSIC __m128 convert(__m128d v, ConvertTag<double, float >) { return _mm_cvtpd_ps(v); }
0110 Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<int , float >) { return _mm_cvtepi32_ps(v); }
0111 Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<uint , float >) {
0112
0113
0114 using namespace SSE;
0115 return blendv_ps(_mm_cvtepi32_ps(v),
0116 _mm_add_ps(_mm_cvtepi32_ps(_mm_and_si128(v, _mm_set1_epi32(0x7ffffe00))),
0117 _mm_add_ps(_mm_set1_ps(1u << 31), _mm_cvtepi32_ps(_mm_and_si128(
0118 v, _mm_set1_epi32(0x000001ff))))),
0119 _mm_castsi128_ps(_mm_cmplt_epi32(v, _mm_setzero_si128())));
0120 }
0121 Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<short , float >) { return convert(convert(v, ConvertTag<short, int>()), ConvertTag<int, float>()); }
0122 Vc_INTRINSIC __m128 convert(__m128i v, ConvertTag<ushort, float >) { return convert(convert(v, ConvertTag<ushort, int>()), ConvertTag<int, float>()); }
0123 Vc_INTRINSIC __m128d convert(__m128 v, ConvertTag<float , double>) { return _mm_cvtps_pd(v); }
0124 Vc_INTRINSIC __m128d convert(__m128d v, ConvertTag<double, double>) { return v; }
0125 Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<int , double>) { return _mm_cvtepi32_pd(v); }
0126 Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<uint , double>) { return _mm_add_pd(_mm_cvtepi32_pd(_mm_xor_si128(v, setmin_epi32())), _mm_set1_pd(1u << 31)); }
0127 Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<short , double>) { return convert(convert(v, ConvertTag<short, int>()), ConvertTag<int, double>()); }
0128 Vc_INTRINSIC __m128d convert(__m128i v, ConvertTag<ushort, double>) { return convert(convert(v, ConvertTag<ushort, int>()), ConvertTag<int, double>()); }
0129 Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , short >) { return _mm_packs_epi32(_mm_cvttps_epi32(v), _mm_setzero_si128()); }
0130 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , short >) { return _mm_packs_epi32(v, _mm_setzero_si128()); }
0131 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , short >) { return _mm_packs_epi32(v, _mm_setzero_si128()); }
0132 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , short >) { return v; }
0133 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, short >) { return v; }
0134 Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, short >) { return convert(convert(v, ConvertTag<double, int>()), ConvertTag<int, short>()); }
0135 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<int , ushort>) {
0136 auto tmp0 = _mm_unpacklo_epi16(v, _mm_setzero_si128());
0137 auto tmp1 = _mm_unpackhi_epi16(v, _mm_setzero_si128());
0138 auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
0139 auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
0140 return _mm_unpacklo_epi16(tmp2, tmp3);
0141 }
0142 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<uint , ushort>) {
0143 auto tmp0 = _mm_unpacklo_epi16(v, _mm_setzero_si128());
0144 auto tmp1 = _mm_unpackhi_epi16(v, _mm_setzero_si128());
0145 auto tmp2 = _mm_unpacklo_epi16(tmp0, tmp1);
0146 auto tmp3 = _mm_unpackhi_epi16(tmp0, tmp1);
0147 return _mm_unpacklo_epi16(tmp2, tmp3);
0148 }
0149 Vc_INTRINSIC __m128i convert(__m128 v, ConvertTag<float , ushort>) { return convert(_mm_cvttps_epi32(v), ConvertTag<int, ushort>()); }
0150 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<short , ushort>) { return v; }
0151 Vc_INTRINSIC __m128i convert(__m128i v, ConvertTag<ushort, ushort>) { return v; }
0152 Vc_INTRINSIC __m128i convert(__m128d v, ConvertTag<double, ushort>) { return convert(convert(v, ConvertTag<double, int>()), ConvertTag<int, ushort>()); }
0153
0154
0155 }
0156 }
0157
0158 #endif
0159
0160