0001
0002
0003
0004
0005
0006
0007
0008
0009
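// SSE packet math for Eigen: defines the Packet4f, Packet2d, Packet4i and Packet16b
// packet types and their primitive (element-wise, load/store, reduction) operations.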
0010 #ifndef EIGEN_PACKET_MATH_SSE_H
0011 #define EIGEN_PACKET_MATH_SSE_H
0012
0013 namespace Eigen {
0014
0015 namespace internal {
0016
0017 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
0018 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
0019 #endif
0020
0021 #if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
0022
0023
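// 2*sizeof(void*) gives 8 XMM registers on 32-bit targets and 16 on 64-bit targets.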
0024 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
0025 #endif
0026
0027 #ifdef EIGEN_VECTORIZE_FMA
0028 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
0029 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
0030 #endif
0031 #endif
0032
0033 #if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
0034
0035
0036
0037
0038
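// With some GCC/MinGW ABI versions (and on QNX), __m128 and __m128d cannot be used
// reliably as distinct template argument types, so they are wrapped in
// eigen_packet_wrapper. Packet4i and Packet16b are always wrapped so that int and
// bool packets, which both live in an __m128i, get distinct C++ types.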
0039 typedef eigen_packet_wrapper<__m128> Packet4f;
0040 typedef eigen_packet_wrapper<__m128d> Packet2d;
0041 #else
0042 typedef __m128 Packet4f;
0043 typedef __m128d Packet2d;
0044 #endif
0045
0046 typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
0047 typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
0048
0049 template<> struct is_arithmetic<__m128> { enum { value = true }; };
0050 template<> struct is_arithmetic<__m128i> { enum { value = true }; };
0051 template<> struct is_arithmetic<__m128d> { enum { value = true }; };
0052 template<> struct is_arithmetic<Packet4i> { enum { value = true }; };
0053 template<> struct is_arithmetic<Packet16b> { enum { value = true }; };
0054
0055 template<int p, int q, int r, int s>
0056 struct shuffle_mask{
0057 enum { mask = (s)<<6|(r)<<4|(q)<<2|(p) };
0058 };
0059
0060
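// shuffle_mask packs four 2-bit lane indices into the immediate expected by
// _mm_shuffle_* (equivalent to _MM_SHUFFLE(s,r,q,p)).
// vec4f_swizzle1(v,p,q,r,s) returns {v[p], v[q], v[r], v[s]}; for example
// vec4f_swizzle1(v,0,0,3,3) yields {v0, v0, v3, v3}.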
0061 #define vec4f_swizzle1(v,p,q,r,s) \
0062 Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), (shuffle_mask<p,q,r,s>::mask))))
0063
0064 #define vec4i_swizzle1(v,p,q,r,s) \
0065 Packet4i(_mm_shuffle_epi32( v, (shuffle_mask<p,q,r,s>::mask)))
0066
0067 #define vec2d_swizzle1(v,p,q) \
0068 Packet2d(_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (shuffle_mask<2*p,2*p+1,2*q,2*q+1>::mask))))
0069
0070 #define vec4f_swizzle2(a,b,p,q,r,s) \
0071 Packet4f(_mm_shuffle_ps( (a), (b), (shuffle_mask<p,q,r,s>::mask)))
0072
0073 #define vec4i_swizzle2(a,b,p,q,r,s) \
0074 Packet4i(_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p,q,r,s>::mask)))))
0075
0076 EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b)
0077 {
0078 return Packet4f(_mm_movelh_ps(a,b));
0079 }
0080 EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b)
0081 {
0082 return Packet4f(_mm_movehl_ps(a,b));
0083 }
0084 EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b)
0085 {
0086 return Packet4f(_mm_unpacklo_ps(a,b));
0087 }
0088 EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
0089 {
0090 return Packet4f(_mm_unpackhi_ps(a,b));
0091 }
0092 #define vec4f_duplane(a,p) \
0093 vec4f_swizzle2(a,a,p,p,p,p)
0094
0095 #define vec2d_swizzle2(a,b,mask) \
0096 Packet2d(_mm_shuffle_pd(a,b,mask))
0097
0098 EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b)
0099 {
0100 return Packet2d(_mm_unpacklo_pd(a,b));
0101 }
0102 EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b)
0103 {
0104 return Packet2d(_mm_unpackhi_pd(a,b));
0105 }
0106 #define vec2d_duplane(a,p) \
0107 vec2d_swizzle2(a,a,(p<<1)|p)
0108
0109 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
0110 const Packet4f p4f_##NAME = pset1<Packet4f>(X)
0111
0112 #define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
0113 const Packet2d p2d_##NAME = pset1<Packet2d>(X)
0114
0115 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
0116 const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
0117
0118 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
0119 const Packet4i p4i_##NAME = pset1<Packet4i>(X)
0120
0121
0122
0123
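// The float/double packet_traits below are only used when AVX is disabled;
// with AVX enabled, the wider packets defined in the AVX backend take over.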
0124 #ifndef EIGEN_VECTORIZE_AVX
0125 template <>
0126 struct packet_traits<float> : default_packet_traits {
0127 typedef Packet4f type;
0128 typedef Packet4f half;
0129 enum {
0130 Vectorizable = 1,
0131 AlignedOnScalar = 1,
0132 size = 4,
0133 HasHalfPacket = 0,
0134
0135 HasCmp = 1,
0136 HasDiv = 1,
0137 HasSin = EIGEN_FAST_MATH,
0138 HasCos = EIGEN_FAST_MATH,
0139 HasLog = 1,
0140 HasLog1p = 1,
0141 HasExpm1 = 1,
0142 HasNdtri = 1,
0143 HasExp = 1,
0144 HasBessel = 1,
0145 HasSqrt = 1,
0146 HasRsqrt = 1,
0147 HasTanh = EIGEN_FAST_MATH,
0148 HasErf = EIGEN_FAST_MATH,
0149 HasBlend = 1,
0150 HasCeil = 1,
0151 HasFloor = 1,
0152 #ifdef EIGEN_VECTORIZE_SSE4_1
0153 HasRound = 1,
0154 #endif
0155 HasRint = 1
0156 };
0157 };
0158 template <>
0159 struct packet_traits<double> : default_packet_traits {
0160 typedef Packet2d type;
0161 typedef Packet2d half;
0162 enum {
0163 Vectorizable = 1,
0164 AlignedOnScalar = 1,
0165 size=2,
0166 HasHalfPacket = 0,
0167
0168 HasCmp = 1,
0169 HasDiv = 1,
0170 HasLog = 1,
0171 HasExp = 1,
0172 HasSqrt = 1,
0173 HasRsqrt = 1,
0174 HasBlend = 1,
0175 HasFloor = 1,
0176 HasCeil = 1,
0177 #ifdef EIGEN_VECTORIZE_SSE4_1
0178 HasRound = 1,
0179 #endif
0180 HasRint = 1
0181 };
0182 };
0183 #endif
0184 template<> struct packet_traits<int> : default_packet_traits
0185 {
0186 typedef Packet4i type;
0187 typedef Packet4i half;
0188 enum {
0189 Vectorizable = 1,
0190 AlignedOnScalar = 1,
0191 size=4,
0192
0193 HasShift = 1,
0194 HasBlend = 1
0195 };
0196 };
0197
0198 template<> struct packet_traits<bool> : default_packet_traits
0199 {
0200 typedef Packet16b type;
0201 typedef Packet16b half;
0202 enum {
0203 Vectorizable = 1,
0204 AlignedOnScalar = 1,
0205 HasHalfPacket = 0,
0206 size=16,
0207
0208 HasAdd = 1,
0209 HasSub = 1,
0210 HasShift = 0,
0211 HasMul = 1,
0212 HasNegate = 1,
0213 HasAbs = 0,
0214 HasAbs2 = 0,
0215 HasMin = 0,
0216 HasMax = 0,
0217 HasConj = 0,
0218 HasSqrt = 1
0219 };
0220 };
0221
0222 template<> struct unpacket_traits<Packet4f> {
0223 typedef float type;
0224 typedef Packet4f half;
0225 typedef Packet4i integer_packet;
0226 enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
0227 };
0228 template<> struct unpacket_traits<Packet2d> {
0229 typedef double type;
0230 typedef Packet2d half;
0231 enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
0232 };
0233 template<> struct unpacket_traits<Packet4i> {
0234 typedef int type;
0235 typedef Packet4i half;
0236 enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false};
0237 };
0238 template<> struct unpacket_traits<Packet16b> {
0239 typedef bool type;
0240 typedef Packet16b half;
0241 enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
0242 };
0243
0244 #ifndef EIGEN_VECTORIZE_AVX
0245 template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
0246 template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
0247 #endif
0248
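// MSVC 2008 (cl 15.00) appears to miscompile _mm_set_ps1 and friends, hence the
// explicit multi-argument _mm_set_* forms in this branch.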
0249 #if EIGEN_COMP_MSVC==1500
0250
0251
0252
0253 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps(from,from,from,from); }
0254 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
0255 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); }
0256 #else
0257 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
0258 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
0259 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
0260 #endif
0261 template<> EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool& from) { return _mm_set1_epi8(static_cast<char>(from)); }
0262
0263 template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
0264 template<> EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) { return _mm_castsi128_pd(_mm_set1_epi64x(from)); }
0265
0266 template<> EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& ) { return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1)); }
0267 template<> EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& ) { return _mm_set_epi32(0, -1, 0, -1); }
0268 template<> EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& ) { return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1)); }
0269
0270 template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& ) { return _mm_setzero_ps(); }
0271 template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& ) { return _mm_setzero_pd(); }
0272 template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& ) { return _mm_setzero_si128(); }
0273
0274
0275
0276
0277
0278
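// For strict GCC without AVX, pload1 is done as a scalar load followed by a swizzle,
// which tends to produce better code here than the generic pset1-based path.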
0279 #if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
0280 template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
0281 return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
0282 }
0283 #endif
0284
0285 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
0286 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
0287 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
0288
0289 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
0290 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
0291 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
0292
0293 template<> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
0294
0295 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
0296 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
0297 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
0298 template<> EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
0299
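// paddsub computes {a0-b0, a1+b1, a2-b2, a3+b3}. Without SSE3's addsub, the sign of
// the even lanes of b is flipped (xor with -0.0) before a plain add; the pxor
// forward declaration lets the fallback use it before its definition below.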
0300 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
0301 template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b)
0302 {
0303 #ifdef EIGEN_VECTORIZE_SSE3
0304 return _mm_addsub_ps(a,b);
0305 #else
0306 const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x0,0x80000000,0x0));
0307 return padd(a, pxor(mask, b));
0308 #endif
0309 }
0310
0311 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
0312 template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b)
0313 {
0314 #ifdef EIGEN_VECTORIZE_SSE3
0315 return _mm_addsub_pd(a,b);
0316 #else
0317 const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x0));
0318 return padd(a, pxor(mask, b));
0319 #endif
0320 }
0321
0322 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
0323 {
0324 const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
0325 return _mm_xor_ps(a,mask);
0326 }
0327 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
0328 {
0329 const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
0330 return _mm_xor_pd(a,mask);
0331 }
0332 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
0333 {
0334 return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
0335 }
0336
0337 template<> EIGEN_STRONG_INLINE Packet16b pnegate(const Packet16b& a)
0338 {
0339 return psub(pset1<Packet16b>(false), a);
0340 }
0341
0342 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
0343 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
0344 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
0345
0346 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
0347 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
0348 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
0349 {
0350 #ifdef EIGEN_VECTORIZE_SSE4_1
0351 return _mm_mullo_epi32(a,b);
0352 #else
0353
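// SSE2 fallback: _mm_mul_epu32 multiplies lanes 0 and 2; swizzling a and b by
// (1,0,3,2) and multiplying again yields the products of lanes 1 and 3; the two
// results are then interleaved back into {a0*b0, a1*b1, a2*b2, a3*b3}.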
0354 return vec4i_swizzle1(
0355 vec4i_swizzle2(
0356 _mm_mul_epu32(a,b),
0357 _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
0358 vec4i_swizzle1(b,1,0,3,2)),
0359 0,2,0,2),
0360 0,2,1,3);
0361 #endif
0362 }
0363
0364 template<> EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
0365
0366 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
0367 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
0368
0369
0370 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
0371 #ifdef EIGEN_VECTORIZE_FMA
0372 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
0373 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
0374 #endif
0375
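// pselect(mask, a, b) does a bitwise blend: a where mask bits are set, b elsewhere.
// With SSE4.1, blendv decides on the top bit of each lane, which is sufficient
// because Eigen masks are all-ones / all-zeros per lane.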
0376 #ifdef EIGEN_VECTORIZE_SSE4_1
0377 template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
0378 return _mm_blendv_ps(b,a,mask);
0379 }
0380
0381 template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
0382 return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b),_mm_castsi128_ps(a),_mm_castsi128_ps(mask)));
0383 }
0384
0385 template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) { return _mm_blendv_pd(b,a,mask); }
0386
0387 template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
0388 return _mm_blendv_epi8(b,a,mask);
0389 }
0390 #else
0391 template<> EIGEN_DEVICE_FUNC inline Packet16b pselect(const Packet16b& mask, const Packet16b& a, const Packet16b& b) {
0392 Packet16b a_part = _mm_and_si128(mask, a);
0393 Packet16b b_part = _mm_andnot_si128(mask, b);
0394 return _mm_or_si128(a_part, b_part);
0395 }
0396 #endif
0397
0398 template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
0399 template<> EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); }
0400 template<> EIGEN_STRONG_INLINE Packet4f
0401 ptrue<Packet4f>(const Packet4f& a) {
0402 Packet4i b = _mm_castps_si128(a);
0403 return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
0404 }
0405 template<> EIGEN_STRONG_INLINE Packet2d
0406 ptrue<Packet2d>(const Packet2d& a) {
0407 Packet4i b = _mm_castpd_si128(a);
0408 return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
0409 }
0410
0411
0412 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
0413 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
0414 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
0415 template<> EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
0416
0417 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
0418 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
0419 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
0420 template<> EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
0421
0422 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
0423 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
0424 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
0425 template<> EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
0426
0427 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
0428 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
0429 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
0430
0431 template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
0432 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
0433 template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
0434 template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
0435
0436 template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); }
0437 template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }
0438 template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); }
0439 template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
0440
0441 template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); }
0442 template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
0443 template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); }
0444 template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); }
0445
0446 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
0447 #if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
0448
0449
0450
0451
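// Some GCC versions before 6.3 may reorder the operands of _mm_min_ps, which would
// change the NaN behavior (minps returns its second operand when either input is
// NaN), so the operand order is pinned with inline asm here. In the intrinsic path
// below, the arguments are swapped (_mm_min_ps(b, a)) so that, like std::min,
// pmin(a,b) returns a when either input is NaN.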
0452 #ifdef EIGEN_VECTORIZE_AVX
0453 Packet4f res;
0454 asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
0455 #else
0456 Packet4f res = b;
0457 asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
0458 #endif
0459 return res;
0460 #else
0461
0462 return _mm_min_ps(b, a);
0463 #endif
0464 }
0465 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
0466 #if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
0467
0468
0469
0470
0471 #ifdef EIGEN_VECTORIZE_AVX
0472 Packet2d res;
0473 asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
0474 #else
0475 Packet2d res = b;
0476 asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
0477 #endif
0478 return res;
0479 #else
0480
0481 return _mm_min_pd(b, a);
0482 #endif
0483 }
0484 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
0485 {
0486 #ifdef EIGEN_VECTORIZE_SSE4_1
0487 return _mm_min_epi32(a,b);
0488 #else
0489
0490 Packet4i mask = _mm_cmplt_epi32(a,b);
0491 return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
0492 #endif
0493 }
0494
0495
0496 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
0497 #if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
0498
0499
0500
0501
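// Same operand-order workaround and NaN convention as pmin above.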
0502 #ifdef EIGEN_VECTORIZE_AVX
0503 Packet4f res;
0504 asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
0505 #else
0506 Packet4f res = b;
0507 asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
0508 #endif
0509 return res;
0510 #else
0511
0512 return _mm_max_ps(b, a);
0513 #endif
0514 }
0515 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
0516 #if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
0517
0518
0519
0520
0521 #ifdef EIGEN_VECTORIZE_AVX
0522 Packet2d res;
0523 asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
0524 #else
0525 Packet2d res = b;
0526 asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
0527 #endif
0528 return res;
0529 #else
0530
0531 return _mm_max_pd(b, a);
0532 #endif
0533 }
0534 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
0535 {
0536 #ifdef EIGEN_VECTORIZE_SSE4_1
0537 return _mm_max_epi32(a,b);
0538 #else
0539
0540 Packet4i mask = _mm_cmpgt_epi32(a,b);
0541 return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
0542 #endif
0543 }
0544
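// Min/max helpers with an explicit NaN policy: propagate_numbers returns the
// non-NaN operand when one input is NaN, propagate_nan returns NaN as soon as
// either input is NaN. Both rely on the pmin/pmax above returning their first
// argument when either input is NaN.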
0545 template <typename Packet, typename Op>
0546 EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) {
0547
0548
0549 Packet not_nan_mask_a = pcmp_eq(a, a);
0550 Packet m = op(a, b);
0551 return pselect<Packet>(not_nan_mask_a, m, b);
0552 }
0553
0554 template <typename Packet, typename Op>
0555 EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) {
0556
0557
0558 Packet not_nan_mask_a = pcmp_eq(a, a);
0559 Packet m = op(b, a);
0560 return pselect<Packet>(not_nan_mask_a, m, a);
0561 }
0562
0563
0564 template<>
0565 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
0566 return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
0567 }
0568 template<>
0569 EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
0570 return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
0571 }
0572 template<>
0573 EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
0574 return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
0575 }
0576 template<>
0577 EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
0578 return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
0579 }
0580 template<>
0581 EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
0582 return pminmax_propagate_nan(a, b, pmin<Packet4f>);
0583 }
0584 template<>
0585 EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
0586 return pminmax_propagate_nan(a, b, pmin<Packet2d>);
0587 }
0588 template<>
0589 EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
0590 return pminmax_propagate_nan(a, b, pmax<Packet4f>);
0591 }
0592 template<>
0593 EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
0594 return pminmax_propagate_nan(a, b, pmax<Packet2d>);
0595 }
0596
0597 template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) { return _mm_srai_epi32(a,N); }
0598 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right (const Packet4i& a) { return _mm_srli_epi32(a,N); }
0599 template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left (const Packet4i& a) { return _mm_slli_epi32(a,N); }
0600
0601 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
0602 {
0603 const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
0604 return _mm_and_ps(a,mask);
0605 }
0606 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
0607 {
0608 const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
0609 return _mm_and_pd(a,mask);
0610 }
0611 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
0612 {
0613 #ifdef EIGEN_VECTORIZE_SSSE3
0614 return _mm_abs_epi32(a);
0615 #else
0616 Packet4i aux = _mm_srai_epi32(a,31);
0617 return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
0618 #endif
0619 }
0620
0621 #ifdef EIGEN_VECTORIZE_SSE4_1
0622 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
0623 {
0624
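// _mm_round_ps has no "round half away from zero" mode, so emulate it: copy the
// sign of a onto the largest float smaller than 0.5, add it to a, then truncate
// toward zero.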
0625 const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
0626 const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
0627 return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
0628 }
0629
0630 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
0631 {
0632 const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
0633 const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
0634 return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
0635 }
0636
0637 template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
0638 template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
0639
0640 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
0641 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
0642
0643 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
0644 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
0645 #else
0646 template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
0647
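// Without SSE4.1, round-to-nearest-integer is emulated by adding and subtracting
// 2^23 (2^52 for double): the addition forces rounding in the current rounding
// mode, and the optimization barrier keeps the compiler from folding the two ops.
// Inputs with |a| >= 2^23 are already integers and are passed through; the sign is
// reapplied at the end.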
0648 const Packet4f limit = pset1<Packet4f>(static_cast<float>(1<<23));
0649 const Packet4f abs_a = pabs(a);
0650 Packet4f r = padd(abs_a, limit);
0651
0652 EIGEN_OPTIMIZATION_BARRIER(r);
0653 r = psub(r, limit);
0654
0655 r = pselect(pcmp_lt(abs_a, limit),
0656 pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
0657 return r;
0658 }
0659
0660 template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
0661
0662 const Packet2d limit = pset1<Packet2d>(static_cast<double>(1ull<<52));
0663 const Packet2d abs_a = pabs(a);
0664 Packet2d r = padd(abs_a, limit);
0665
0666 EIGEN_OPTIMIZATION_BARRIER(r);
0667 r = psub(r, limit);
0668
0669 r = pselect(pcmp_lt(abs_a, limit),
0670 pselect(pcmp_lt(a, pzero(a)), pnegate(r), r), a);
0671 return r;
0672 }
0673
0674 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
0675 {
0676 const Packet4f cst_1 = pset1<Packet4f>(1.0f);
0677 Packet4f tmp = print<Packet4f>(a);
0678
0679 Packet4f mask = _mm_cmpgt_ps(tmp, a);
0680 mask = pand(mask, cst_1);
0681 return psub(tmp, mask);
0682 }
0683
0684 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
0685 {
0686 const Packet2d cst_1 = pset1<Packet2d>(1.0);
0687 Packet2d tmp = print<Packet2d>(a);
0688
0689 Packet2d mask = _mm_cmpgt_pd(tmp, a);
0690 mask = pand(mask, cst_1);
0691 return psub(tmp, mask);
0692 }
0693
0694 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
0695 {
0696 const Packet4f cst_1 = pset1<Packet4f>(1.0f);
0697 Packet4f tmp = print<Packet4f>(a);
0698
0699 Packet4f mask = _mm_cmplt_ps(tmp, a);
0700 mask = pand(mask, cst_1);
0701 return padd(tmp, mask);
0702 }
0703
0704 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
0705 {
0706 const Packet2d cst_1 = pset1<Packet2d>(1.0);
0707 Packet2d tmp = print<Packet2d>(a);
0708
0709 Packet2d mask = _mm_cmplt_pd(tmp, a);
0710 mask = pand(mask, cst_1);
0711 return padd(tmp, mask);
0712 }
0713 #endif
0714
0715 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
0716 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
0717 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
0718 template<> EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
0719
0720 #if EIGEN_COMP_MSVC
0721 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
0722 EIGEN_DEBUG_UNALIGNED_LOAD
0723 #if (EIGEN_COMP_MSVC==1600)
0724
0725
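// Some MSVC 2010 builds reportedly emit a bad (aligned) load for _mm_loadu_ps,
// so the packet is assembled from two 64-bit halves instead.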
0726 __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
0727 res = _mm_loadh_pi(res, (const __m64*)(from+2));
0728 return res;
0729 #else
0730 return _mm_loadu_ps(from);
0731 #endif
0732 }
0733 #else
0734
0735
0736 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
0737 {
0738 EIGEN_DEBUG_UNALIGNED_LOAD
0739 return _mm_loadu_ps(from);
0740 }
0741 #endif
0742
0743 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
0744 {
0745 EIGEN_DEBUG_UNALIGNED_LOAD
0746 return _mm_loadu_pd(from);
0747 }
0748 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
0749 {
0750 EIGEN_DEBUG_UNALIGNED_LOAD
0751 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
0752 }
0753 template<> EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool* from) {
0754 EIGEN_DEBUG_UNALIGNED_LOAD
0755 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
0756 }
0757
0758
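// ploaddup loads size/2 elements and duplicates each one:
// e.g. for Packet4f it returns {from[0], from[0], from[1], from[1]}.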
0759 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
0760 {
0761 return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
0762 }
0763 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
0764 { return pset1<Packet2d>(from[0]); }
0765 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
0766 {
0767 Packet4i tmp;
0768 tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
0769 return vec4i_swizzle1(tmp, 0, 0, 1, 1);
0770 }
0771
0772
0773
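// Loads 8 bools and duplicates each of them: {b0,b0, b1,b1, ..., b7,b7}.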
0774 template<> EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from)
0775 {
0776 __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
0777 return _mm_unpacklo_epi8(tmp, tmp);
0778 }
0779
0780
0781
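// Loads 4 bools and repeats each one four times: {b0,b0,b0,b0, ..., b3,b3,b3,b3}.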
0782 template<> EIGEN_STRONG_INLINE Packet16b
0783 ploadquad<Packet16b>(const bool* from) {
0784 __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
0785 tmp = _mm_unpacklo_epi8(tmp, tmp);
0786 return _mm_unpacklo_epi16(tmp, tmp);
0787 }
0788
0789 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
0790 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
0791 template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
0792 template<> EIGEN_STRONG_INLINE void pstore<bool>(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
0793
0794 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
0795 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
0796 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
0797 template<> EIGEN_STRONG_INLINE void pstoreu<bool>(bool* to, const Packet16b& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
0798
0799 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
0800 {
0801 return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
0802 }
0803 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
0804 {
0805 return _mm_set_pd(from[1*stride], from[0*stride]);
0806 }
0807 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
0808 {
0809 return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
0810 }
0811
0812 template<> EIGEN_DEVICE_FUNC inline Packet16b pgather<bool, Packet16b>(const bool* from, Index stride)
0813 {
0814 return _mm_set_epi8(from[15*stride], from[14*stride], from[13*stride], from[12*stride],
0815 from[11*stride], from[10*stride], from[9*stride], from[8*stride],
0816 from[7*stride], from[6*stride], from[5*stride], from[4*stride],
0817 from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
0818 }
0819
0820 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
0821 {
0822 to[stride*0] = _mm_cvtss_f32(from);
0823 to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
0824 to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
0825 to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
0826 }
0827 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
0828 {
0829 to[stride*0] = _mm_cvtsd_f64(from);
0830 to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
0831 }
0832 template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
0833 {
0834 to[stride*0] = _mm_cvtsi128_si32(from);
0835 to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
0836 to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
0837 to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
0838 }
0839 template<> EIGEN_DEVICE_FUNC inline void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride)
0840 {
0841 to[4*stride*0] = _mm_cvtsi128_si32(from);
0842 to[4*stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
0843 to[4*stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
0844 to[4*stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
0845 }
0846
0847
0848
0849 template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
0850 {
0851 Packet4f pa = _mm_set_ss(a);
0852 pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
0853 }
0854
0855 template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
0856 {
0857 Packet2d pa = _mm_set_sd(a);
0858 pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
0859 }
0860
0861 #if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
0862 typedef const void * SsePrefetchPtrType;
0863 #else
0864 typedef const char * SsePrefetchPtrType;
0865 #endif
0866
0867 #ifndef EIGEN_VECTORIZE_AVX
0868 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
0869 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
0870 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
0871 #endif
0872
0873 #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
0874
0875
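// Workarounds for old MSVC: on Win64 the value is read through MSVC's vector union
// members, and in the strict-MSVC branch below a temporary is used; the direct
// intrinsic forms reportedly triggered compiler bugs there.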
0876 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
0877 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
0878 template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
0879 #elif EIGEN_COMP_MSVC_STRICT
0880
0881 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
0882 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
0883 template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
0884 #else
0885 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
0886 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
0887 template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
0888 #endif
0889 template<> EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast<bool>(x & 1); }
0890
0891 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return _mm_shuffle_ps(a,a,0x1B); }
0892 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return _mm_shuffle_pd(a,a,0x1); }
0893 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return _mm_shuffle_epi32(a,0x1B); }
0894 template<> EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
0895 #ifdef EIGEN_VECTORIZE_SSSE3
0896 __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
0897 return _mm_shuffle_epi8(a, mask);
0898 #else
0899 Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
0900 tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
0901 return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
0902 #endif
0903 }
0904
0905 template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
0906 return pfrexp_generic(a,exponent);
0907 }
0908
0909
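// Extracts the biased exponent of each double as a Packet2d: mask the exponent
// bits, shift them down to bit 0 of each 64-bit lane, gather the two low 32-bit
// words into lanes 0 and 1, and convert to double.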
0910 template<>
0911 EIGEN_STRONG_INLINE
0912 Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
0913 const Packet2d cst_exp_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
0914 __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
0915 return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
0916 }
0917
0918 template<> EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
0919 return pfrexp_generic(a, exponent);
0920 }
0921
0922 template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
0923 return pldexp_generic(a,exponent);
0924 }
0925
0926
0927
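// pldexp for doubles is specialized here since the generic path would need 64-bit
// integer packets, which SSE does not provide. The exponent is clamped to +/-2099
// (large enough for any finite double result), split as e = 3*b + (e - 3*b) with
// b = e/4, and applied as four multiplications by powers of two built by writing
// the biased exponent directly into the exponent field.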
0928 template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
0929
0930 const Packet2d max_exponent = pset1<Packet2d>(2099.0);
0931 const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);
0932
0933
0934 const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
0935
0936
0937 const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
0938 Packet4i b = parithmetic_shift_right<2>(ei);
0939 Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));
0940 Packet2d out = pmul(pmul(pmul(a, c), c), c);
0941 b = psub(psub(psub(ei, b), b), b);
0942 c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));
0943 out = pmul(out, c);
0944 return out;
0945 }
0946
0947
0948 #ifndef __AVX__
0949 template<> EIGEN_STRONG_INLINE void
0950 pbroadcast4<Packet4f>(const float *a,
0951 Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
0952 {
0953 a3 = pload<Packet4f>(a);
0954 a0 = vec4f_swizzle1(a3, 0,0,0,0);
0955 a1 = vec4f_swizzle1(a3, 1,1,1,1);
0956 a2 = vec4f_swizzle1(a3, 2,2,2,2);
0957 a3 = vec4f_swizzle1(a3, 3,3,3,3);
0958 }
0959 template<> EIGEN_STRONG_INLINE void
0960 pbroadcast4<Packet2d>(const double *a,
0961 Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
0962 {
0963 #ifdef EIGEN_VECTORIZE_SSE3
0964 a0 = _mm_loaddup_pd(a+0);
0965 a1 = _mm_loaddup_pd(a+1);
0966 a2 = _mm_loaddup_pd(a+2);
0967 a3 = _mm_loaddup_pd(a+3);
0968 #else
0969 a1 = pload<Packet2d>(a);
0970 a0 = vec2d_swizzle1(a1, 0,0);
0971 a1 = vec2d_swizzle1(a1, 1,1);
0972 a3 = pload<Packet2d>(a+2);
0973 a2 = vec2d_swizzle1(a3, 0,0);
0974 a3 = vec2d_swizzle1(a3, 1,1);
0975 #endif
0976 }
0977 #endif
0978
0979 EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
0980 {
0981 vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
0982 vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
0983 vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
0984 vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
0985 }
0986
0987 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
0988 {
0989
0990
0991
0992
0993
0994
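// Horizontal sum via movehl + shuffle. An SSE3 haddps variant is intentionally not
// used: horizontal adds are slow on most x86 microarchitectures.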
0995 Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
0996 return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
0997
0998 }
0999
1000 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
1001 {
1002
1003
1004
1005
1006
1007 return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
1008
1009 }
1010
1011 #ifdef EIGEN_VECTORIZE_SSSE3
1012 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
1013 {
1014 Packet4i tmp0 = _mm_hadd_epi32(a,a);
1015 return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
1016 }
1017
1018 #else
1019 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
1020 {
1021 Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
1022 return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
1023 }
1024 #endif
1025
1026 template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
1027 Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
1028 return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
1029 }
1030
1031
1032
1033
1034
1035 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
1036 {
1037 Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
1038 return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
1039 }
1040 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
1041 {
1042 return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
1043 }
1044 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
1045 {
1046
1047
1048
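// There is no efficient 32-bit horizontal multiply in plain SSE, so store to an
// aligned buffer and multiply in scalar code; this appears to be faster in practice
// than shuffles plus vector multiplies.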
1049 EIGEN_ALIGN16 int aux[4];
1050 pstore(aux, a);
1051 return (aux[0] * aux[1]) * (aux[2] * aux[3]);
1052 }
1053
1054 template<> EIGEN_STRONG_INLINE bool predux_mul<Packet16b>(const Packet16b& a) {
1055 Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a,a));
1056 return ((pfirst<Packet4i>(tmp) == 0x01010101) &&
1057 (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
1058 }
1059
1060
1061 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
1062 {
1063 Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
1064 return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
1065 }
1066 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
1067 {
1068 return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
1069 }
1070 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
1071 {
1072 #ifdef EIGEN_VECTORIZE_SSE4_1
1073 Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
1074 return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
1075 #else
1076
1077
1078 EIGEN_ALIGN16 int aux[4];
1079 pstore(aux, a);
1080 int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
1081 int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
1082 return aux0<aux2 ? aux0 : aux2;
1083 #endif
1084 }
1085
1086
1087 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
1088 {
1089 Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
1090 return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
1091 }
1092 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
1093 {
1094 return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
1095 }
1096 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
1097 {
1098 #ifdef EIGEN_VECTORIZE_SSE4_1
1099 Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
1100 return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
1101 #else
1102
1103
1104 EIGEN_ALIGN16 int aux[4];
1105 pstore(aux, a);
1106 int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
1107 int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
1108 return aux0>aux2 ? aux0 : aux2;
1109 #endif
1110 }
1111
1112
1113
1114
1115
1116
1117
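// predux_any: true if any lane of the mask has its sign bit set; Eigen encodes
// boolean masks as all-ones per lane, so movmskps is enough.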
1118 template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
1119 {
1120 return _mm_movemask_ps(x) != 0x0;
1121 }
1122
1123 EIGEN_DEVICE_FUNC inline void
1124 ptranspose(PacketBlock<Packet4f,4>& kernel) {
1125 _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
1126 }
1127
1128 EIGEN_DEVICE_FUNC inline void
1129 ptranspose(PacketBlock<Packet2d,2>& kernel) {
1130 __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
1131 kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
1132 kernel.packet[1] = tmp;
1133 }
1134
1135 EIGEN_DEVICE_FUNC inline void
1136 ptranspose(PacketBlock<Packet4i,4>& kernel) {
1137 __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
1138 __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
1139 __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
1140 __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
1141
1142 kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
1143 kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
1144 kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
1145 kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
1146 }
1147
1148 EIGEN_DEVICE_FUNC inline void
1149 ptranspose(PacketBlock<Packet16b,4>& kernel) {
1150 __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
1151 __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
1152 __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
1153 __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
1154 kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
1155 kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
1156 kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
1157 kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
1158 }
1159
1160 EIGEN_DEVICE_FUNC inline void
1161 ptranspose(PacketBlock<Packet16b,16>& kernel) {
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
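// Full 16x16 transpose of boolean bytes: four rounds of unpacks, interleaving at
// 8-, 16-, 32- and finally 64-bit granularity.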
1173 __m128i t0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
1174 __m128i t1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
1175 __m128i t2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
1176 __m128i t3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
1177 __m128i t4 = _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]);
1178 __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]);
1179 __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
1180 __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
1181 __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
1182 __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
1183 __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
1184 __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
1185 __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
1186 __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
1187 __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
1188 __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
1189
1190 __m128i s0 = _mm_unpacklo_epi16(t0, t2);
1191 __m128i s1 = _mm_unpackhi_epi16(t0, t2);
1192 __m128i s2 = _mm_unpacklo_epi16(t1, t3);
1193 __m128i s3 = _mm_unpackhi_epi16(t1, t3);
1194 __m128i s4 = _mm_unpacklo_epi16(t4, t6);
1195 __m128i s5 = _mm_unpackhi_epi16(t4, t6);
1196 __m128i s6 = _mm_unpacklo_epi16(t5, t7);
1197 __m128i s7 = _mm_unpackhi_epi16(t5, t7);
1198 __m128i s8 = _mm_unpacklo_epi16(t8, ta);
1199 __m128i s9 = _mm_unpackhi_epi16(t8, ta);
1200 __m128i sa = _mm_unpacklo_epi16(t9, tb);
1201 __m128i sb = _mm_unpackhi_epi16(t9, tb);
1202 __m128i sc = _mm_unpacklo_epi16(tc, te);
1203 __m128i sd = _mm_unpackhi_epi16(tc, te);
1204 __m128i se = _mm_unpacklo_epi16(td, tf);
1205 __m128i sf = _mm_unpackhi_epi16(td, tf);
1206
1207 __m128i u0 = _mm_unpacklo_epi32(s0, s4);
1208 __m128i u1 = _mm_unpackhi_epi32(s0, s4);
1209 __m128i u2 = _mm_unpacklo_epi32(s1, s5);
1210 __m128i u3 = _mm_unpackhi_epi32(s1, s5);
1211 __m128i u4 = _mm_unpacklo_epi32(s2, s6);
1212 __m128i u5 = _mm_unpackhi_epi32(s2, s6);
1213 __m128i u6 = _mm_unpacklo_epi32(s3, s7);
1214 __m128i u7 = _mm_unpackhi_epi32(s3, s7);
1215 __m128i u8 = _mm_unpacklo_epi32(s8, sc);
1216 __m128i u9 = _mm_unpackhi_epi32(s8, sc);
1217 __m128i ua = _mm_unpacklo_epi32(s9, sd);
1218 __m128i ub = _mm_unpackhi_epi32(s9, sd);
1219 __m128i uc = _mm_unpacklo_epi32(sa, se);
1220 __m128i ud = _mm_unpackhi_epi32(sa, se);
1221 __m128i ue = _mm_unpacklo_epi32(sb, sf);
1222 __m128i uf = _mm_unpackhi_epi32(sb, sf);
1223
1224 kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
1225 kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
1226 kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
1227 kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
1228 kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
1229 kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
1230 kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
1231 kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
1232 kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
1233 kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
1234 kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
1235 kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
1236 kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
1237 kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
1238 kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
1239 kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
1240 }
1241
1242 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
1243 const __m128i zero = _mm_setzero_si128();
1244 const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
1245 __m128i false_mask = _mm_cmpeq_epi32(select, zero);
1246 #ifdef EIGEN_VECTORIZE_SSE4_1
1247 return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
1248 #else
1249 return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
1250 #endif
1251 }
1252 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
1253 const __m128 zero = _mm_setzero_ps();
1254 const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
1255 __m128 false_mask = _mm_cmpeq_ps(select, zero);
1256 #ifdef EIGEN_VECTORIZE_SSE4_1
1257 return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
1258 #else
1259 return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
1260 #endif
1261 }
1262 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
1263 const __m128d zero = _mm_setzero_pd();
1264 const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
1265 __m128d false_mask = _mm_cmpeq_pd(select, zero);
1266 #ifdef EIGEN_VECTORIZE_SSE4_1
1267 return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
1268 #else
1269 return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
1270 #endif
1271 }
1272
1273
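// Scalar pmadd overloads so that scalar remainders of vectorized loops also use a
// fused multiply-add, keeping results consistent with the packet path.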
1274 #ifdef EIGEN_VECTORIZE_FMA
1275 template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
1276 return ::fmaf(a,b,c);
1277 }
1278 template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
1279 return ::fma(a,b,c);
1280 }
1281 #endif
1282
1283
1284
1285
1286
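// Legacy MMX-based packet type for Eigen::half (Packet4h). The whole block is
// compiled out with "#if 0" and apparently kept for reference only.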
1287 #if 0
1288
1289 typedef struct {
1290 __m64 x;
1291 } Packet4h;
1292
1293
1294 template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
1295
1296 template <>
1297 struct packet_traits<Eigen::half> : default_packet_traits {
1298 typedef Packet4h type;
1299
1300 typedef Packet4h half;
1301 enum {
1302 Vectorizable = 1,
1303 AlignedOnScalar = 1,
1304 size = 4,
1305 HasHalfPacket = 0,
1306 HasAdd = 1,
1307 HasSub = 1,
1308 HasMul = 1,
1309 HasDiv = 1,
1310 HasNegate = 0,
1311 HasAbs = 0,
1312 HasAbs2 = 0,
1313 HasMin = 0,
1314 HasMax = 0,
1315 HasConj = 0,
1316 HasSetLinear = 0,
1317 HasSqrt = 0,
1318 HasRsqrt = 0,
1319 HasExp = 0,
1320 HasLog = 0,
1321 HasBlend = 0
1322 };
1323 };
1324
1325
1326 template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; };
1327
1328 template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
1329 Packet4h result;
1330 result.x = _mm_set1_pi16(from.x);
1331 return result;
1332 }
1333
1334 template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
1335 return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
1336 }
1337
1338 template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
1339
1340 template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
1341 __int64_t a64 = _mm_cvtm64_si64(a.x);
1342 __int64_t b64 = _mm_cvtm64_si64(b.x);
1343
1344 Eigen::half h[4];
1345
1346 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1347 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1348 h[0] = ha + hb;
1349 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1350 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1351 h[1] = ha + hb;
1352 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1353 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1354 h[2] = ha + hb;
1355 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1356 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1357 h[3] = ha + hb;
1358 Packet4h result;
1359 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1360 return result;
1361 }
1362
1363 template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
1364 __int64_t a64 = _mm_cvtm64_si64(a.x);
1365 __int64_t b64 = _mm_cvtm64_si64(b.x);
1366
1367 Eigen::half h[4];
1368
1369 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1370 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1371 h[0] = ha - hb;
1372 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1373 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1374 h[1] = ha - hb;
1375 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1376 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1377 h[2] = ha - hb;
1378 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1379 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1380 h[3] = ha - hb;
1381 Packet4h result;
1382 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1383 return result;
1384 }
1385
1386 template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
1387 __int64_t a64 = _mm_cvtm64_si64(a.x);
1388 __int64_t b64 = _mm_cvtm64_si64(b.x);
1389
1390 Eigen::half h[4];
1391
1392 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1393 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1394 h[0] = ha * hb;
1395 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1396 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1397 h[1] = ha * hb;
1398 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1399 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1400 h[2] = ha * hb;
1401 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1402 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1403 h[3] = ha * hb;
1404 Packet4h result;
1405 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1406 return result;
1407 }
1408
1409 template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
1410 __int64_t a64 = _mm_cvtm64_si64(a.x);
1411 __int64_t b64 = _mm_cvtm64_si64(b.x);
1412
1413 Eigen::half h[4];
1414
1415 Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1416 Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1417 h[0] = ha / hb;
1418 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
1419 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
1420 h[1] = ha / hb;
1421 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
1422 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
1423 h[2] = ha / hb;
1424 ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
1425 hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
1426 h[3] = ha / hb;
1427 Packet4h result;
1428 result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
1429 return result;
1430 }
1431
1432 template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
1433 Packet4h result;
1434 result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
1435 return result;
1436 }
1437
1438 template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
1439 Packet4h result;
1440 result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
1441 return result;
1442 }
1443
1444 template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
1445 __int64_t r = _mm_cvtm64_si64(from.x);
1446 *(reinterpret_cast<__int64_t*>(to)) = r;
1447 }
1448
1449 template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
1450 __int64_t r = _mm_cvtm64_si64(from.x);
1451 *(reinterpret_cast<__int64_t*>(to)) = r;
1452 }
1453
1454 template<> EIGEN_STRONG_INLINE Packet4h
1455 ploadquad<Packet4h>(const Eigen::half* from) {
1456 return pset1<Packet4h>(*from);
1457 }
1458
1459 template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
1460 {
1461 Packet4h result;
1462 result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
1463 return result;
1464 }
1465
1466 template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
1467 {
1468 __int64_t a = _mm_cvtm64_si64(from.x);
1469 to[stride*0].x = static_cast<unsigned short>(a);
1470 to[stride*1].x = static_cast<unsigned short>(a >> 16);
1471 to[stride*2].x = static_cast<unsigned short>(a >> 32);
1472 to[stride*3].x = static_cast<unsigned short>(a >> 48);
1473 }
1474
1475 EIGEN_STRONG_INLINE void
1476 ptranspose(PacketBlock<Packet4h,4>& kernel) {
1477 __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
1478 __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
1479 __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
1480 __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
1481
1482 kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
1483 kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
1484 kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
1485 kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
1486 }
1487
1488 #endif
1489
1490
1491 } // end namespace internal
1492
1493 } // end namespace Eigen
1494
1495 #if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
1496
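// Older PGI compilers appear to lack these _mm_cast* intrinsics in C++ mode, so
// equivalent definitions are provided via reinterpret_cast.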
1497 static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); }
1498 static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
1499 static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); }
1500 static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
1501 static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
1502 static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
1503 #endif
1504
1505 #endif // EIGEN_PACKET_MATH_SSE_H