Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-19 09:51:41

0001 // This file is part of Eigen, a lightweight C++ template library
0002 // for linear algebra.
0003 //
0004 // Copyright (C) 2018 Wave Computing, Inc.
0005 // Written by:
0006 //   Chris Larsen
0007 //   Alexey Frunze (afrunze@wavecomp.com)
0008 //
0009 // This Source Code Form is subject to the terms of the Mozilla
0010 // Public License v. 2.0. If a copy of the MPL was not distributed
0011 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
0012 
0013 #ifndef EIGEN_PACKET_MATH_MSA_H
0014 #define EIGEN_PACKET_MATH_MSA_H
0015 
0016 #include <iostream>
0017 #include <string>
0018 
0019 namespace Eigen {
0020 
0021 namespace internal {
0022 
0023 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD  // Only set if not already defined.
0024 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
0025 #endif
0026 
0027 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD  // Presence flag; defined without a value.
0028 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
0029 #endif
0030 
0031 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  // Only set if not already defined.
0032 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
0033 #endif
0034 
0035 #if 0  // Change to 1 to have EIGEN_MSA_DEBUG print file:line:function on a function's first call.
0036 #define EIGEN_MSA_DEBUG                                                             \
0037   static bool firstTime = true;                                                     \
0038   do {                                                                              \
0039     if (firstTime) {                                                                \
0040       std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
0041       firstTime = false;                                                            \
0042     }                                                                               \
0043   } while (0)
0044 #else  // Default branch: the hook expands to nothing.
0045 #define EIGEN_MSA_DEBUG
0046 #endif
0047 
0048 #define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))  // Packs four 2-bit fields into one byte, a in the lowest bits.
0049 
0050 typedef v4f32 Packet4f;  // Packet of four floats.
0051 typedef v4i32 Packet4i;  // Packet of four signed 32-bit integers.
0052 typedef v4u32 Packet4ui;  // Packet of four unsigned 32-bit integers.
0053 
0054 #define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X }  // Declares p4f_<NAME> with X in every lane.
0055 #define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X }  // Declares p4i_<NAME> with X in every lane.
0056 #define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X }  // Declares p4ui_<NAME> with X in every lane.
0057 
0058 inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) {
0059   // Debug printer: writes the four lanes as "[ a, b, c, d ]" and hands the stream back.
0060   return os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
0061 }
0062 
0063 inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) {
0064   // Debug printer: writes the four lanes as "[ a, b, c, d ]" and hands the stream back.
0065   return os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
0066 }
0067 
0068 inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) {
0069   // Debug printer: writes the four lanes as "[ a, b, c, d ]" and hands the stream back.
0070   return os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
0071 }
0072 
0073 template <>
0074 struct packet_traits<float> : default_packet_traits {  // Packet capabilities declared for float.
0075   typedef Packet4f type;
0076   typedef Packet4f half;  // Packet2f intrinsics not implemented yet
0077   enum {
0078     Vectorizable = 1,
0079     AlignedOnScalar = 1,
0080     size = 4,  // Lanes in a Packet4f.
0081     HasHalfPacket = 0,  // Packet2f intrinsics not implemented yet
0082     // FIXME check the Has*
0083     HasDiv = 1,
0084     HasSin = EIGEN_FAST_MATH,  // This and the next three flags take the value of EIGEN_FAST_MATH.
0085     HasCos = EIGEN_FAST_MATH,
0086     HasTanh = EIGEN_FAST_MATH,
0087     HasErf = EIGEN_FAST_MATH,
0088     HasLog = 1,
0089     HasExp = 1,
0090     HasSqrt = 1,
0091     HasRsqrt = 1,
0092     HasRound = 1,
0093     HasFloor = 1,
0094     HasCeil = 1,
0095     HasBlend = 1
0096   };
0097 };
0098 
0099 template <>
0100 struct packet_traits<int32_t> : default_packet_traits {  // Packet capabilities declared for int32_t.
0101   typedef Packet4i type;
0102   typedef Packet4i half;  // Packet2i intrinsics not implemented yet
0103   enum {
0104     Vectorizable = 1,
0105     AlignedOnScalar = 1,
0106     size = 4,  // Lanes in a Packet4i.
0107     HasHalfPacket = 0,  // Packet2i intrinsics not implemented yet
0108     // FIXME check the Has*
0109     HasDiv = 1,
0110     HasBlend = 1
0111   };
0112 };
0113 
0114 template <>
0115 struct unpacket_traits<Packet4f> {  // Reverse mapping: packet type -> scalar type and layout facts.
0116   typedef float type;
0117   enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
0118   typedef Packet4f half;  // Declared as the full packet type itself.
0119 };
0120 
0121 template <>
0122 struct unpacket_traits<Packet4i> {  // Reverse mapping: packet type -> scalar type and layout facts.
0123   typedef int32_t type;
0124   enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
0125   typedef Packet4i half;  // Declared as the full packet type itself.
0126 };
0127 
0128 template <>
0129 EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
0130   EIGEN_MSA_DEBUG;
0131   // Replicate the scalar into all four lanes.
0132   const Packet4f broadcast = { from, from, from, from };
0133   return broadcast;
0134 }
0135 
0136 template <>
0137 EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
0138   EIGEN_MSA_DEBUG;
0139   const Packet4i broadcast = __builtin_msa_fill_w(from);  // Same value in every lane.
0140   return broadcast;
0141 }
0142 
0143 template <>
0144 EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
0145   EIGEN_MSA_DEBUG;
0146   // Read the pointee once, then replicate it into all four lanes.
0147   const float scalar = from[0];
0148   const Packet4f broadcast = { scalar, scalar, scalar, scalar };
0149   return broadcast;
0150 }
0151 
0152 template <>
0153 EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(const int32_t* from) {
0154   EIGEN_MSA_DEBUG;
0155   const int32_t scalar = from[0];  // Single read of the pointee.
0156   return __builtin_msa_fill_w(scalar);
0157 }
0158 
0159 template <>
0160 EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
0161   EIGEN_MSA_DEBUG;
0162   const Packet4f sum = __builtin_msa_fadd_w(a, b);  // Lane-wise float addition.
0163   return sum;
0164 }
0165 
0166 template <>
0167 EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
0168   EIGEN_MSA_DEBUG;
0169   const Packet4i sum = __builtin_msa_addv_w(a, b);  // Lane-wise 32-bit integer addition.
0170   return sum;
0171 }
0172 
0173 template <>
0174 EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
0175   EIGEN_MSA_DEBUG;
0176   static const Packet4f lane_offsets = { 0.0f, 1.0f, 2.0f, 3.0f };  // Added to the broadcast base.
0177   const Packet4f base = pset1<Packet4f>(a);
0178   return padd(base, lane_offsets);  // { a, a + 1, a + 2, a + 3 }
0179 }
0180 
0181 template <>
0182 EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
0183   EIGEN_MSA_DEBUG;
0184   static const Packet4i lane_offsets = { 0, 1, 2, 3 };  // Added to the broadcast base.
0185   const Packet4i base = pset1<Packet4i>(a);
0186   return padd(base, lane_offsets);  // { a, a + 1, a + 2, a + 3 }
0187 }
0188 
0189 template <>
0190 EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
0191   EIGEN_MSA_DEBUG;
0192   const Packet4f difference = __builtin_msa_fsub_w(a, b);  // Lane-wise a - b.
0193   return difference;
0194 }
0195 
0196 template <>
0197 EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
0198   EIGEN_MSA_DEBUG;
0199   const Packet4i difference = __builtin_msa_subv_w(a, b);  // Lane-wise a - b.
0200   return difference;
0201 }
0202 
0203 template <>
0204 EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
0205   EIGEN_MSA_DEBUG;
0206   const Packet4f sign_flipped = (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);  // Toggle bit 31 per lane.
0207   return sign_flipped;
0208 }
0209 
0210 template <>
0211 EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
0212   EIGEN_MSA_DEBUG;
0213   const v4i32 inverted = (v4i32)__builtin_msa_nori_b((v16u8)a, 0);  // NOR with 0 yields ~a.
0214   return __builtin_msa_addvi_w(inverted, 1);  // ~a + 1
0215 }
0216 
0217 template <>
0218 EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
0219   EIGEN_MSA_DEBUG;
0220   // The packet is returned unchanged.
0221   return a;
0222 }
0223 
0224 template <>
0225 EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
0226   EIGEN_MSA_DEBUG;
0227   // The packet is returned unchanged.
0228   return a;
0229 }
0230 
0231 template <>
0232 EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
0233   EIGEN_MSA_DEBUG;
0234   const Packet4f product = __builtin_msa_fmul_w(a, b);  // Lane-wise float multiplication.
0235   return product;
0236 }
0237 
0238 template <>
0239 EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
0240   EIGEN_MSA_DEBUG;
0241   const Packet4i product = __builtin_msa_mulv_w(a, b);  // Lane-wise 32-bit integer multiplication.
0242   return product;
0243 }
0244 
0245 template <>
0246 EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
0247   EIGEN_MSA_DEBUG;
0248   const Packet4f quotient = __builtin_msa_fdiv_w(a, b);  // Lane-wise a / b.
0249   return quotient;
0250 }
0251 
0252 template <>
0253 EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
0254   EIGEN_MSA_DEBUG;
0255   const Packet4i quotient = __builtin_msa_div_s_w(a, b);  // Lane-wise signed a / b.
0256   return quotient;
0257 }
0258 
0259 template <>
0260 EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
0261   EIGEN_MSA_DEBUG;
0262   const Packet4f fused = __builtin_msa_fmadd_w(c, a, b);  // Addend c is passed first, then a and b.
0263   return fused;
0264 }
0265 
0266 template <>
0267 EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
0268   EIGEN_MSA_DEBUG;
0269   // Integer multiply-add: `value` starts as c and maddv.w updates it in place from a and b.
0270   // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug.
0271   Packet4i value = c;
0272   __asm__("maddv.w %w[value], %w[a], %w[b]\n"
0273           // Outputs
0274           : [value] "+f"(value)  // Read-write operand: accumulator in, result out.
0275           // Inputs
0276           : [a] "f"(a), [b] "f"(b));
0277   return value;
0278 }
0279 
0280 template <>
0281 EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
0282   EIGEN_MSA_DEBUG;
0283   const v16u8 bits = __builtin_msa_and_v((v16u8)a, (v16u8)b);  // AND on the raw 128 bits.
0284   return (Packet4f)bits;
0285 }
0286 
0287 template <>
0288 EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
0289   EIGEN_MSA_DEBUG;
0290   const v16u8 bits = __builtin_msa_and_v((v16u8)a, (v16u8)b);  // AND on the raw 128 bits.
0291   return (Packet4i)bits;
0292 }
0293 
0294 template <>
0295 EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
0296   EIGEN_MSA_DEBUG;
0297   const v16u8 bits = __builtin_msa_or_v((v16u8)a, (v16u8)b);  // OR on the raw 128 bits.
0298   return (Packet4f)bits;
0299 }
0300 
0301 template <>
0302 EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
0303   EIGEN_MSA_DEBUG;
0304   const v16u8 bits = __builtin_msa_or_v((v16u8)a, (v16u8)b);  // OR on the raw 128 bits.
0305   return (Packet4i)bits;
0306 }
0307 
0308 template <>
0309 EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
0310   EIGEN_MSA_DEBUG;
0311   const v16u8 bits = __builtin_msa_xor_v((v16u8)a, (v16u8)b);  // XOR on the raw 128 bits.
0312   return (Packet4f)bits;
0313 }
0314 
0315 template <>
0316 EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
0317   EIGEN_MSA_DEBUG;
0318   const v16u8 bits = __builtin_msa_xor_v((v16u8)a, (v16u8)b);  // XOR on the raw 128 bits.
0319   return (Packet4i)bits;
0320 }
0321 
0322 template <>
0323 EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
0324   EIGEN_MSA_DEBUG;
0325   const Packet4f not_b = (Packet4f)__builtin_msa_xori_b((v16u8)b, 255);  // XOR each byte with 0xFF: ~b.
0326   return pand(a, not_b);  // a & ~b
0327 }
0328 
0329 template <>
0330 EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
0331   EIGEN_MSA_DEBUG;
0332   const Packet4i not_b = (Packet4i)__builtin_msa_xori_b((v16u8)b, 255);  // XOR each byte with 0xFF: ~b.
0333   return pand(a, not_b);  // a & ~b
0334 }
0335 
0336 template <>
0337 EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
0338   EIGEN_MSA_DEBUG;
0339   // Two variants, selected at compile time by EIGEN_FAST_MATH.
0340 #if EIGEN_FAST_MATH
0341   const Packet4f lowest = __builtin_msa_fmin_w(a, b);  // Numbers win over NaNs.
0342   return lowest;
0343 #else
0344   // NaNs win over numbers: keep a where a < b or a is NaN, otherwise take b.
0345   const Packet4i a_is_nan = __builtin_msa_fcun_w(a, a);
0346   const Packet4i keep_a = por(__builtin_msa_fclt_w(a, b), a_is_nan);
0347   return (Packet4f)__builtin_msa_bsel_v((v16u8)keep_a, (v16u8)b, (v16u8)a);
0348 #endif
0349 }
0350 
0351 template <>
0352 EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
0353   EIGEN_MSA_DEBUG;
0354   const Packet4i lowest = __builtin_msa_min_s_w(a, b);  // Lane-wise signed minimum.
0355   return lowest;
0356 }
0357 
0358 template <>
0359 EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
0360   EIGEN_MSA_DEBUG;
0361   // Two variants, selected at compile time by EIGEN_FAST_MATH.
0362 #if EIGEN_FAST_MATH
0363   const Packet4f highest = __builtin_msa_fmax_w(a, b);  // Numbers win over NaNs.
0364   return highest;
0365 #else
0366   // NaNs win over numbers: keep a where b < a or a is NaN, otherwise take b.
0367   const Packet4i a_is_nan = __builtin_msa_fcun_w(a, a);
0368   const Packet4i keep_a = por(__builtin_msa_fclt_w(b, a), a_is_nan);
0369   return (Packet4f)__builtin_msa_bsel_v((v16u8)keep_a, (v16u8)b, (v16u8)a);
0370 #endif
0371 }
0372 
0373 template <>
0374 EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
0375   EIGEN_MSA_DEBUG;
0376   const Packet4i highest = __builtin_msa_max_s_w(a, b);  // Lane-wise signed maximum.
0377   return highest;
0378 }
0379 
0380 template <>
0381 EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
0382   EIGEN_MSA_DEBUG;
0383   EIGEN_DEBUG_ALIGNED_LOAD const Packet4i raw = __builtin_msa_ld_w(const_cast<float*>(from), 0);
0384   return (Packet4f)raw;  // Reinterpret the loaded 128 bits as four floats.
0385 }
0386 
0387 template <>
0388 EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
0389   EIGEN_MSA_DEBUG;
0390   EIGEN_DEBUG_ALIGNED_LOAD const Packet4i loaded = __builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
0391   return loaded;  // Four consecutive int32 values starting at from.
0392 }
0393 
0394 template <>
0395 EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
0396   EIGEN_MSA_DEBUG;
0397   EIGEN_DEBUG_UNALIGNED_LOAD const Packet4i raw = __builtin_msa_ld_w(const_cast<float*>(from), 0);
0398   return (Packet4f)raw;  // Reinterpret the loaded 128 bits as four floats.
0399 }
0400 
0401 template <>
0402 EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
0403   EIGEN_MSA_DEBUG;
0404   EIGEN_DEBUG_UNALIGNED_LOAD const Packet4i loaded = (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
0405   return loaded;  // Four consecutive int32 values starting at from.
0406 }
0407 
0408 template <>
0409 EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
0410   EIGEN_MSA_DEBUG;
0411   // Broadcast from[0] and from[1] separately, then join the low 64-bit halves of the two.
0412   const float first = from[0], second = from[1];
0413   const Packet4f lo = { first, first, first, first };
0414   const Packet4f hi = { second, second, second, second };
0415   return (Packet4f)__builtin_msa_ilvr_d((v2i64)hi, (v2i64)lo);
0416 }
0417 
0418 template <>
0419 EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
0420   EIGEN_MSA_DEBUG;
0421   // Broadcast from[0] and from[1] separately, then join the low 64-bit halves of the two.
0422   const int32_t first = from[0], second = from[1];
0423   const Packet4i lo = { first, first, first, first };
0424   const Packet4i hi = { second, second, second, second };
0425   return (Packet4i)__builtin_msa_ilvr_d((v2i64)hi, (v2i64)lo);
0426 }
0427 
0428 template <>
0429 EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
0430   EIGEN_MSA_DEBUG;
0431   const Packet4i bits = (Packet4i)from;  // st_w takes an integer vector; same 128 bits.
0432   EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(bits, to, 0);
0433 }
0434 
0435 template <>
0436 EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
0437   EIGEN_MSA_DEBUG;
0438   EIGEN_DEBUG_ALIGNED_STORE  // Debug hook placed immediately before the store.
0439   __builtin_msa_st_w(from, to, 0);
0440 }
0441 
0442 template <>
0443 EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
0444   EIGEN_MSA_DEBUG;
0445   const Packet4i bits = (Packet4i)from;  // st_w takes an integer vector; same 128 bits.
0446   EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(bits, to, 0);
0447 }
0448 
0449 template <>
0450 EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
0451   EIGEN_MSA_DEBUG;
0452   EIGEN_DEBUG_UNALIGNED_STORE  // Debug hook placed immediately before the store.
0453   __builtin_msa_st_w(from, to, 0);
0454 }
0455 
0456 template <>
0457 EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
0458   EIGEN_MSA_DEBUG;
0459   // Strided load: lane k is read from from[k * stride]. All four lanes are
0460   // supplied directly in the initializer.
0461   const Packet4f gathered = { from[0],
0462                               from[stride],
0463                               from[2 * stride],
0464                               from[3 * stride] };
0465   return gathered;
0466 }
0467 
0468 template <>
0469 EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
0470   EIGEN_MSA_DEBUG;
0471   // Strided load: lane k is read from from[k * stride]. All four lanes are
0472   // supplied directly in the initializer.
0473   const Packet4i gathered = { from[0],
0474                               from[stride],
0475                               from[2 * stride],
0476                               from[3 * stride] };
0477   return gathered;
0478 }
0479 
0480 template <>
0481 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from,
0482                                                         Index stride) {
0483   EIGEN_MSA_DEBUG;
0484   // Strided store: lane k of the packet is written k * stride elements past
0485   // `to`. Each destination is addressed from the base pointer rather than by
0486   // advancing it.
0487   float* const base = to;
0488   base[0] = from[0];
0489   base[stride] = from[1];
0490   base[2 * stride] = from[2];
0491   base[3 * stride] = from[3];
0492 }
0493 
0494 template <>
0495 EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
0496                                                           Index stride) {
0497   EIGEN_MSA_DEBUG;
0498   // Strided store: lane k of the packet is written k * stride elements past
0499   // `to`. Each destination is addressed from the base pointer rather than by
0500   // advancing it.
0501   int32_t* const base = to;
0502   base[0] = from[0];
0503   base[stride] = from[1];
0504   base[2 * stride] = from[2];
0505   base[3 * stride] = from[3];
0506 }
0507 
0508 template <>
0509 EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
0510   EIGEN_MSA_DEBUG;
0511   // Forwards the address to the compiler's prefetch builtin with its default arguments.
0512   __builtin_prefetch(addr);
0513 }
0514 
0515 template <>
0516 EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
0517   EIGEN_MSA_DEBUG;
0518   // Forwards the address to the compiler's prefetch builtin with its default arguments.
0519   __builtin_prefetch(addr);
0520 }
0521 
0522 template <>
0523 EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
0524   EIGEN_MSA_DEBUG;
0525   const float lane0 = a[0];  // Extract the element at index 0.
0526   return lane0;
0527 }
0528 
0529 template <>
0530 EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
0531   EIGEN_MSA_DEBUG;
0532   const int32_t lane0 = a[0];  // Extract the element at index 0.
0533   return lane0;
0534 }
0535 
0536 template <>
0537 EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
0538   EIGEN_MSA_DEBUG;
0539   const Packet4i reversed = __builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));  // Lanes 3,2,1,0.
0540   return (Packet4f)reversed;
0541 }
0542 
0543 template <>
0544 EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
0545   EIGEN_MSA_DEBUG;
0546   const Packet4i reversed = __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));  // Lanes 3,2,1,0.
0547   return reversed;
0548 }
0549 
0550 template <>
0551 EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
0552   EIGEN_MSA_DEBUG;
0553   const Packet4f magnitude = (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);  // Clear bit 31 per lane.
0554   return magnitude;
0555 }
0556 
0557 template <>
0558 EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
0559   EIGEN_MSA_DEBUG;
0560   // add_a sums the absolute values of its operands, so |0| + |a| gives |a|.
0561   const Packet4i zeros = __builtin_msa_ldi_w(0);
0562   return __builtin_msa_add_a_w(zeros, a);
0563 }
0564 
0565 template <>
0566 EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
0567   EIGEN_MSA_DEBUG;
0568   // Add the copy with 64-bit halves swapped, then the copy with neighbours swapped; read lane 0.
0569   const Packet4f pair_sums = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
0570   const Packet4f total = padd(pair_sums, (Packet4f)__builtin_msa_shf_w((v4i32)pair_sums, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
0571   return total[0];
0572 }
0573 
0574 
0575 template <>
0576 EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
0577   EIGEN_MSA_DEBUG;
0578   // Add the copy with 64-bit halves swapped, then the copy with neighbours swapped; read lane 0.
0579   const Packet4i pair_sums = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
0580   const Packet4i total = padd(pair_sums, __builtin_msa_shf_w(pair_sums, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
0581   return total[0];
0582 }
0583 
0584 // Other reduction functions:
0585 // mul
0586 template <>
0587 EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
0588   EIGEN_MSA_DEBUG;
0589   // Multiply by the copy with 64-bit halves swapped, then by the neighbour-swapped copy; read lane 0.
0590   const Packet4f pair_products = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
0591   const Packet4f total = pmul(pair_products, (Packet4f)__builtin_msa_shf_w((v4i32)pair_products, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
0592   return total[0];
0593 }
0594 
0595 template <>
0596 EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
0597   EIGEN_MSA_DEBUG;
0598   // Multiply by the copy with 64-bit halves swapped, then by the neighbour-swapped copy; read lane 0.
0599   const Packet4i pair_products = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
0600   const Packet4i total = pmul(pair_products, __builtin_msa_shf_w(pair_products, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
0601   return total[0];
0602 }
0603 
0604 // min
0605 template <>
0606 EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
0607   EIGEN_MSA_DEBUG;
0608   // Horizontal minimum; unless EIGEN_FAST_MATH is set, a NaN in any lane makes the result NaN.
0609   // Copy of a with its two 64-bit halves exchanged.
0610   const Packet4f halves_swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
0611 #if !EIGEN_FAST_MATH
0612   // The low 64 bits hold two 32-bit masks (all ones = NaN present) for the
0613   // pairs a[0]-a[2] and a[1]-a[3].
0614   const v16u8 pair_has_nan = (v16u8)__builtin_msa_fcun_w(a, halves_swapped);
0615   // Merge the two masks: 64 ones when no NaN was flagged, 64 zeroes otherwise.
0616   const v16u8 nan_free = (v16u8)__builtin_msa_ceqi_d((v2i64)pair_has_nan, 0);
0617 #endif
0618   // Reduce four lanes to two candidates, then to a single value in lane 0.
0619   const Packet4f pair_min = __builtin_msa_fmin_w(a, halves_swapped);
0620   Packet4f lowest = __builtin_msa_fmin_w(
0621       pair_min, (Packet4f)__builtin_msa_shf_w((Packet4i)pair_min, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
0622 #if !EIGEN_FAST_MATH
0623   // Where nan_free is zero, substitute the quiet-NaN bit pattern for the minimum.
0624   const v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
0625   lowest = (Packet4f)__builtin_msa_bsel_v(nan_free, qnans, (v16u8)lowest);
0626 #endif
0627   return lowest[0];
0628 }
0629 
0630 template <>
0631 EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
0632   EIGEN_MSA_DEBUG;
0633   // Take the minimum against the half-swapped copy, then against the neighbour-swapped copy.
0634   const Packet4i pair_min = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
0635   const Packet4i lowest = pmin(pair_min, __builtin_msa_shf_w(pair_min, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
0636   return lowest[0];
0637 }
0638 
0639 // max
0640 template <>
0641 EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
0642   EIGEN_MSA_DEBUG;
0643   // Horizontal maximum; unless EIGEN_FAST_MATH is set, a NaN in any lane makes the result NaN.
0644   // Copy of a with its two 64-bit halves exchanged.
0645   const Packet4f halves_swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
0646 #if !EIGEN_FAST_MATH
0647   // The low 64 bits hold two 32-bit masks (all ones = NaN present) for the
0648   // pairs a[0]-a[2] and a[1]-a[3].
0649   const v16u8 pair_has_nan = (v16u8)__builtin_msa_fcun_w(a, halves_swapped);
0650   // Merge the two masks: 64 ones when no NaN was flagged, 64 zeroes otherwise.
0651   const v16u8 nan_free = (v16u8)__builtin_msa_ceqi_d((v2i64)pair_has_nan, 0);
0652 #endif
0653   // Reduce four lanes to two candidates, then to a single value in lane 0.
0654   const Packet4f pair_max = __builtin_msa_fmax_w(a, halves_swapped);
0655   Packet4f highest = __builtin_msa_fmax_w(
0656       pair_max, (Packet4f)__builtin_msa_shf_w((Packet4i)pair_max, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
0657 #if !EIGEN_FAST_MATH
0658   // Where nan_free is zero, substitute the quiet-NaN bit pattern for the maximum.
0659   const v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
0660   highest = (Packet4f)__builtin_msa_bsel_v(nan_free, qnans, (v16u8)highest);
0661 #endif
0662   return highest[0];
0663 }
0664 
0665 template <>
0666 EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
0667   EIGEN_MSA_DEBUG;
0668   // Take the maximum against the half-swapped copy, then against the neighbour-swapped copy.
0669   const Packet4i pair_max = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
0670   const Packet4i highest = pmax(pair_max, __builtin_msa_shf_w(pair_max, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
0671   return highest[0];
0672 }
0673 
0674 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
0675   // Prints one packet per row. Rows are separated with a plain '\n' rather than std::endl so
0676   // that formatting a block does not force a stream flush after every row.
0677   os << "[ " << value.packet[0] << ",\n  " << value.packet[1] << ",\n  " << value.packet[2]
0678      << ",\n  " << value.packet[3] << " ]";
0679   return os;
0680 }
0681 
0682 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
0683   EIGEN_MSA_DEBUG;
0684   // In-place 4x4 transpose in two stages. Stage 1 interleaves the 32-bit lanes of rows 0/1
0685   // and rows 2/3, once with ilvr_w and once with ilvl_w.
0686   const v4i32 r01 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
0687   const v4i32 r23 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
0688   const v4i32 l01 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
0689   const v4i32 l23 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
0690   // Stage 2 combines 64-bit elements of the stage-1 vectors (ilvr_d / ilvod_d) and writes
0691   // the results back as the four output rows.
0692   kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)r23, (v2i64)r01);
0693   kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)r23, (v2i64)r01);
0694   kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)l23, (v2i64)l01);
0695   kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)l23, (v2i64)l01);
0696 }
0697 
0698 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) {
0699   // Prints one packet per row. Rows are separated with a plain '\n' rather than std::endl so
0700   // that formatting a block does not force a stream flush after every row.
0701   os << "[ " << value.packet[0] << ",\n  " << value.packet[1] << ",\n  " << value.packet[2]
0702      << ",\n  " << value.packet[3] << " ]";
0703   return os;
0704 }
0705 
0706 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
0707   EIGEN_MSA_DEBUG;
0708   // In-place 4x4 transpose in two stages. Stage 1 interleaves the 32-bit lanes of rows 0/1
0709   // and rows 2/3, once with ilvr_w and once with ilvl_w.
0710   const v4i32 r01 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
0711   const v4i32 r23 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
0712   const v4i32 l01 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
0713   const v4i32 l23 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
0714   // Stage 2 combines 64-bit elements of the stage-1 vectors (ilvr_d / ilvod_d) and writes
0715   // the results back as the four output rows.
0716   kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)r23, (v2i64)r01);
0717   kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)r23, (v2i64)r01);
0718   kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)l23, (v2i64)l01);
0719   kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)l23, (v2i64)l01);
0720 }
0721 
0722 template <>
0723 EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
0724   EIGEN_MSA_DEBUG;
0725   const Packet4f root = __builtin_msa_fsqrt_w(a);  // Lane-wise square root.
0726   return root;
0727 }
0728 
0729 template <>
0730 EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
0731   EIGEN_MSA_DEBUG;
0732   // EIGEN_FAST_MATH selects the frsqrt intrinsic; otherwise compute 1 / sqrt(a) explicitly.
0733 #if EIGEN_FAST_MATH
0734   return __builtin_msa_frsqrt_w(a);
0735 #else
0736   const Packet4f unity = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));  // Integer 1 converted to float per lane.
0737   return pdiv(unity, psqrt(a));
0738 #endif
0739 }
0740 
0741 template <>
0742 EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
0743   Packet4f v = a;  // frint.w below rounds this copy in place.
0744   int32_t old_mode, new_mode;  // Original and temporary values of MSA control register $1.
0745   asm volatile(
0746       "cfcmsa  %[old_mode], $1\n"  // Save the current control word.
0747       "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
0748       "ctcmsa  $1, %[new_mode]\n"  // Install the temporary rounding mode.
0749       "frint.w %w[v], %w[v]\n"  // Round each lane to an integral value under that mode.
0750       "ctcmsa  $1, %[old_mode]\n"  // Restore the saved control word.
0751       :  // outputs
0752       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
0753       [v] "+f"(v)
0754       :  // inputs
0755       :  // clobbers
0756   );
0757   return v;
0758 }
0759 
0760 template <>
0761 EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
0762   Packet4f v = a;  // frint.w below rounds this copy in place.
0763   int32_t old_mode, new_mode;  // Original and temporary values of MSA control register $1.
0764   asm volatile(
0765       "cfcmsa  %[old_mode], $1\n"  // Save the current control word.
0766       "ori     %[new_mode], %[old_mode], 3\n"  // Force both low bits to 1 ...
0767       "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
0768       "ctcmsa  $1, %[new_mode]\n"  // Install the temporary rounding mode.
0769       "frint.w %w[v], %w[v]\n"  // Round each lane to an integral value under that mode.
0770       "ctcmsa  $1, %[old_mode]\n"  // Restore the saved control word.
0771       :  // outputs
0772       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
0773       [v] "+f"(v)
0774       :  // inputs
0775       :  // clobbers
0776   );
0777   return v;
0778 }
0779 
0780 template <>
0781 EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
0782   Packet4f v = a;  // frint.w below rounds this copy in place.
0783   int32_t old_mode, new_mode;  // Original and temporary values of MSA control register $1.
0784   asm volatile(
0785       "cfcmsa  %[old_mode], $1\n"  // Save the current control word.
0786       "ori     %[new_mode], %[old_mode], 3\n"  // Force both low bits to 1 ...
0787       "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
0788       "ctcmsa  $1, %[new_mode]\n"  // Install the temporary rounding mode.
0789       "frint.w %w[v], %w[v]\n"  // NOTE(review): ties go to even here, unlike std::round -- confirm intended.
0790       "ctcmsa  $1, %[old_mode]\n"  // Restore the saved control word.
0791       :  // outputs
0792       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
0793       [v] "+f"(v)
0794       :  // inputs
0795       :  // clobbers
0796   );
0797   return v;
0798 }
0799 
0800 template <>
0801 EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
0802                                     const Packet4f& elsePacket) {
0803   const Packet4ui flags = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
0804                             ifPacket.select[3] };
0805   const Packet4i take_else = __builtin_msa_ceqi_w((Packet4i)flags, 0);  // All ones where the flag is 0.
0806   return (Packet4f)__builtin_msa_bsel_v((v16u8)take_else, (v16u8)thenPacket, (v16u8)elsePacket);
0807 }
0808 
0809 template <>
0810 EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
0811                                     const Packet4i& elsePacket) {
0812   const Packet4ui flags = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
0813                             ifPacket.select[3] };
0814   const Packet4i take_else = __builtin_msa_ceqi_w((Packet4i)flags, 0);  // All ones where the flag is 0.
0815   return (Packet4i)__builtin_msa_bsel_v((v16u8)take_else, (v16u8)thenPacket, (v16u8)elsePacket);
0816 }
0817 
0818 //---------- double ----------
0819 
0820 typedef v2f64 Packet2d;  // Packet of two doubles.
0821 typedef v2i64 Packet2l;  // Packet of two signed 64-bit integers.
0822 typedef v2u64 Packet2ul;  // Packet of two unsigned 64-bit integers.
0823 
0824 #define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X }  // Declares p2d_<NAME> with X in both lanes.
0825 #define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X }  // Declares p2l_<NAME> with X in both lanes.
0826 #define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X }  // Declares p2ul_<NAME> with X in both lanes.
0827 
0828 inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) {
0829   // Debug printer: writes the two lanes as "[ a, b ]" and hands the stream back.
0830   return os << "[ " << value[0] << ", " << value[1] << " ]";
0831 }
0832 
0833 inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) {
0834   // Debug printer: writes the two lanes as "[ a, b ]" and hands the stream back.
0835   return os << "[ " << value[0] << ", " << value[1] << " ]";
0836 }
0837 
0838 inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) {
0839   // Debug printer: writes the two lanes as "[ a, b ]" and hands the stream back.
0840   return os << "[ " << value[0] << ", " << value[1] << " ]";
0841 }
0842 
0843 template <>
0844 struct packet_traits<double> : default_packet_traits {  // Packet capabilities declared for double.
0845   typedef Packet2d type;
0846   typedef Packet2d half;  // Declared as the full packet type; HasHalfPacket is 0 below.
0847   enum {
0848     Vectorizable = 1,
0849     AlignedOnScalar = 1,
0850     size = 2,  // Lanes in a Packet2d.
0851     HasHalfPacket = 0,
0852     // FIXME check the Has*
0853     HasDiv = 1,
0854     HasExp = 1,
0855     HasSqrt = 1,
0856     HasRsqrt = 1,
0857     HasRound = 1,
0858     HasFloor = 1,
0859     HasCeil = 1,
0860     HasBlend = 1
0861   };
0862 };
0863 
0864 template <>
0865 struct unpacket_traits<Packet2d> {  // Reverse mapping: packet type -> scalar type and layout facts.
0866   typedef double type;
0867   enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
0868   typedef Packet2d half;  // Declared as the full packet type itself.
0869 };
0870 
0871 template <>
0872 EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
0873   EIGEN_MSA_DEBUG;
0874   // Replicate the scalar into both lanes.
0875   const Packet2d broadcast = { from, from };
0876   return broadcast;
0877 }
0878 
0879 template <>
0880 EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
0881   EIGEN_MSA_DEBUG;
0882   const Packet2d sum = __builtin_msa_fadd_d(a, b);  // Lane-wise double addition.
0883   return sum;
0884 }
0885 
0886 template <>
0887 EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
0888   EIGEN_MSA_DEBUG;
0889   static const Packet2d lane_offsets = { 0.0, 1.0 };  // Added to the broadcast base.
0890   const Packet2d base = pset1<Packet2d>(a);
0891   return padd(base, lane_offsets);  // { a, a + 1 }
0892 }
0893 
0894 template <>
0895 EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
0896   EIGEN_MSA_DEBUG;
0897   const Packet2d difference = __builtin_msa_fsub_d(a, b);  // Lane-wise a - b.
0898   return difference;
0899 }
0900 
0901 template <>
0902 EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
0903   EIGEN_MSA_DEBUG;
0904   const Packet2d sign_flipped = (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);  // Toggle bit 63 per lane.
0905   return sign_flipped;
0906 }
0907 
0908 template <>
0909 EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
0910   EIGEN_MSA_DEBUG;
0911   // The packet is returned unchanged.
0912   return a;
0913 }
0914 
0915 template <>
0916 EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
0917   EIGEN_MSA_DEBUG;
0918   const Packet2d product = __builtin_msa_fmul_d(a, b);  // Lane-wise double multiplication.
0919   return product;
0920 }
0921 
0922 template <>
0923 EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
0924   EIGEN_MSA_DEBUG;
0925   const Packet2d quotient = __builtin_msa_fdiv_d(a, b);  // Lane-wise a / b.
0926   return quotient;
0927 }
0928 
0929 template <>
0930 EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
0931   EIGEN_MSA_DEBUG;
0932   const Packet2d fused = __builtin_msa_fmadd_d(c, a, b);  // Addend c is passed first, then a and b.
0933   return fused;
0934 }
0935 
0936 // Bitwise logical operations are not defined for floating-point vectors, so the operands are
0937 // reinterpreted as byte vectors (v16u8) and passed to the MSA integer intrinsics.
0938 template <>
0939 EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
0940   EIGEN_MSA_DEBUG;
0941   const v16u8 bits = __builtin_msa_and_v((v16u8)a, (v16u8)b);  // AND on the raw 128 bits.
0942   return (Packet2d)bits;
0943 }
0944 
0945 template <>
0946 EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
0947   EIGEN_MSA_DEBUG;
0948   const v16u8 bits = __builtin_msa_or_v((v16u8)a, (v16u8)b);  // OR on the raw 128 bits.
0949   return (Packet2d)bits;
0950 }
0951 
0952 template <>
0953 EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
0954   EIGEN_MSA_DEBUG;
0955   const v16u8 bits = __builtin_msa_xor_v((v16u8)a, (v16u8)b);  // XOR on the raw 128 bits.
0956   return (Packet2d)bits;
0957 }
0958 
0959 template <>
0960 EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
0961   EIGEN_MSA_DEBUG;
0962   const Packet2d not_b = (Packet2d)__builtin_msa_xori_b((v16u8)b, 255);  // XOR each byte with 0xFF: ~b.
0963   return pand(a, not_b);  // a & ~b
0964 }
0965 
0966 template <>
0967 EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
0968   EIGEN_MSA_DEBUG;
0969   // Aligned-load entry point, so it fires the ALIGNED debug hook (ploadu fires the unaligned one).
0970   EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
0971 }
0972 
0973 template <>
0974 EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
0975   EIGEN_MSA_DEBUG;
0976   // Two variants, selected at compile time by EIGEN_FAST_MATH.
0977 #if EIGEN_FAST_MATH
0978   const Packet2d lowest = __builtin_msa_fmin_d(a, b);  // Numbers win over NaNs.
0979   return lowest;
0980 #else
0981   // NaNs win over numbers: keep a where a < b or a is NaN, otherwise take b.
0982   const v2i64 a_is_nan = __builtin_msa_fcun_d(a, a);
0983   const v2i64 keep_a = por(__builtin_msa_fclt_d(a, b), a_is_nan);
0984   return (Packet2d)__builtin_msa_bsel_v((v16u8)keep_a, (v16u8)b, (v16u8)a);
0985 #endif
0986 }
0987 
0988 template <>
0989 EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
0990   EIGEN_MSA_DEBUG;
0991 
0992 #if EIGEN_FAST_MATH
0993   // This prefers numbers to NaNs.
0994   return __builtin_msa_fmax_d(a, b);
0995 #else
0996   // This prefers NaNs to numbers.
0997   v2i64 aNaN = __builtin_msa_fcun_d(a, a);
0998   v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
0999   return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
1000 #endif
1001 }
1002 
1003 template <>
1004 EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
1005   EIGEN_MSA_DEBUG;
1006 
1007   EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
1008 }
1009 
1010 template <>
1011 EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
1012   EIGEN_MSA_DEBUG;
1013 
1014   Packet2d value = { *from, *from };
1015   return value;
1016 }
1017 
1018 template <>
1019 EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
1020   EIGEN_MSA_DEBUG;
1021 
1022   EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
1023 }
1024 
1025 template <>
1026 EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
1027   EIGEN_MSA_DEBUG;
1028 
1029   EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
1030 }
1031 
1032 template <>
1033 EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
1034   EIGEN_MSA_DEBUG;
1035 
1036   Packet2d value;
1037   value[0] = *from;
1038   from += stride;
1039   value[1] = *from;
1040   return value;
1041 }
1042 
1043 template <>
1044 EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from,
1045                                                          Index stride) {
1046   EIGEN_MSA_DEBUG;
1047 
1048   *to = from[0];
1049   to += stride;
1050   *to = from[1];
1051 }
1052 
1053 template <>
1054 EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
1055   EIGEN_MSA_DEBUG;
1056 
1057   __builtin_prefetch(addr);
1058 }
1059 
1060 template <>
1061 EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
1062   EIGEN_MSA_DEBUG;
1063 
1064   return a[0];
1065 }
1066 
1067 template <>
1068 EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
1069   EIGEN_MSA_DEBUG;
1070 
1071   return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1072 }
1073 
1074 template <>
1075 EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
1076   EIGEN_MSA_DEBUG;
1077 
1078   return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
1079 }
1080 
1081 template <>
1082 EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
1083   EIGEN_MSA_DEBUG;
1084 
1085   Packet2d s = padd(a, preverse(a));
1086   return s[0];
1087 }
1088 
1089 // Other reduction functions:
1090 // mul
1091 template <>
1092 EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
1093   EIGEN_MSA_DEBUG;
1094 
1095   Packet2d p = pmul(a, preverse(a));
1096   return p[0];
1097 }
1098 
1099 // min
1100 template <>
1101 EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
1102   EIGEN_MSA_DEBUG;
1103 
1104 #if EIGEN_FAST_MATH
1105   Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1106   Packet2d v = __builtin_msa_fmin_d(a, swapped);
1107   return v[0];
1108 #else
1109   double a0 = a[0], a1 = a[1];
1110   return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1;
1111 #endif
1112 }
1113 
1114 // max
1115 template <>
1116 EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
1117   EIGEN_MSA_DEBUG;
1118 
1119 #if EIGEN_FAST_MATH
1120   Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1121   Packet2d v = __builtin_msa_fmax_d(a, swapped);
1122   return v[0];
1123 #else
1124   double a0 = a[0], a1 = a[1];
1125   return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1;
1126 #endif
1127 }
1128 
1129 template <>
1130 EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
1131   EIGEN_MSA_DEBUG;
1132 
1133   return __builtin_msa_fsqrt_d(a);
1134 }
1135 
1136 template <>
1137 EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
1138   EIGEN_MSA_DEBUG;
1139 
1140 #if EIGEN_FAST_MATH
1141   return __builtin_msa_frsqrt_d(a);
1142 #else
1143   Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
1144   return pdiv(ones, psqrt(a));
1145 #endif
1146 }
1147 
1148 inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
1149   os << "[ " << value.packet[0] << "," << std::endl << "  " << value.packet[1] << " ]";
1150   return os;
1151 }
1152 
1153 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
1154   EIGEN_MSA_DEBUG;
1155 
1156   Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1157   Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1158   kernel.packet[0] = trn1;
1159   kernel.packet[1] = trn2;
1160 }
1161 
1162 template <>
1163 EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
1164   Packet2d v = a;
1165   int32_t old_mode, new_mode;
1166   asm volatile(
1167       "cfcmsa  %[old_mode], $1\n"
1168       "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
1169       "ctcmsa  $1, %[new_mode]\n"
1170       "frint.d %w[v], %w[v]\n"
1171       "ctcmsa  $1, %[old_mode]\n"
1172       :  // outputs
1173       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
1174       [v] "+f"(v)
1175       :  // inputs
1176       :  // clobbers
1177   );
1178   return v;
1179 }
1180 
1181 template <>
1182 EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
1183   Packet2d v = a;
1184   int32_t old_mode, new_mode;
1185   asm volatile(
1186       "cfcmsa  %[old_mode], $1\n"
1187       "ori     %[new_mode], %[old_mode], 3\n"
1188       "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
1189       "ctcmsa  $1, %[new_mode]\n"
1190       "frint.d %w[v], %w[v]\n"
1191       "ctcmsa  $1, %[old_mode]\n"
1192       :  // outputs
1193       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
1194       [v] "+f"(v)
1195       :  // inputs
1196       :  // clobbers
1197   );
1198   return v;
1199 }
1200 
1201 template <>
1202 EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
1203   Packet2d v = a;
1204   int32_t old_mode, new_mode;
1205   asm volatile(
1206       "cfcmsa  %[old_mode], $1\n"
1207       "ori     %[new_mode], %[old_mode], 3\n"
1208       "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
1209       "ctcmsa  $1, %[new_mode]\n"
1210       "frint.d %w[v], %w[v]\n"
1211       "ctcmsa  $1, %[old_mode]\n"
1212       :  // outputs
1213       [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
1214       [v] "+f"(v)
1215       :  // inputs
1216       :  // clobbers
1217   );
1218   return v;
1219 }
1220 
1221 template <>
1222 EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
1223                                     const Packet2d& elsePacket) {
1224   Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
1225   Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
1226   return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
1227 }
1228 
1229 }  // end namespace internal
1230 
1231 }  // end namespace Eigen
1232 
1233 #endif  // EIGEN_PACKET_MATH_MSA_H