Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-19 09:51:42

0001 namespace Eigen {
0002 namespace internal {
0003   
0004 #if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
0005 
0006 // Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
0007 // Here we specialize gebp_traits to eliminate these register spills.
0008 // See #2138.
0009 template<>
0010 struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
0011  : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
0012 {
0013   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
0014   { 
0015     // This volatile inline ASM both acts as a barrier to prevent reordering,
0016     // as well as enforces strict register use.
0017     asm volatile(
0018       "vmla.f32 %q[r], %q[c], %q[alpha]"
0019       : [r] "+w" (r)
0020       : [c] "w" (c),
0021         [alpha] "w" (alpha)
0022       : );
0023   }
0024 
0025   template <typename LaneIdType>
0026   EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
0027                                 Packet4f& c, Packet4f& tmp,
0028                                 const LaneIdType&) const {
0029     acc(a, b, c);
0030   }
0031   
0032   template <typename LaneIdType>
0033   EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
0034                                 Packet4f& c, Packet4f& tmp,
0035                                 const LaneIdType& lane) const {
0036     madd(a, b.get(lane), c, tmp, lane);
0037   }
0038 };
0039 
0040 #endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
0041 
0042 #if EIGEN_ARCH_ARM64
0043 
0044 template<>
0045 struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
0046  : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
0047 {
0048   typedef float RhsPacket;
0049   typedef float32x4_t RhsPacketx4;
0050 
0051   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
0052   {
0053     dest = *b;
0054   }
0055 
0056   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
0057   {
0058     dest = vld1q_f32(b);
0059   }
0060 
0061   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
0062   {
0063     dest = *b;
0064   }
0065 
0066   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
0067   {}
0068 
0069   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
0070   {
0071     loadRhs(b,dest);
0072   }
0073 
0074   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
0075   {
0076     c = vfmaq_n_f32(c, a, b);
0077   }
0078 
0079   // NOTE: Template parameter inference failed when compiled with Android NDK:
0080   // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
0081 
0082   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
0083   { madd_helper<0>(a, b, c); }
0084   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
0085   { madd_helper<1>(a, b, c); }
0086   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
0087   { madd_helper<2>(a, b, c); }
0088   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
0089   { madd_helper<3>(a, b, c); }
0090 
0091  private:
0092   template<int LaneID>
0093   EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
0094   {
0095     #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
0096     // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
0097     // vfmaq_laneq_f32 is implemented through a costly dup
0098          if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
0099     else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
0100     else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
0101     else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
0102     #else
0103     c = vfmaq_laneq_f32(c, a, b, LaneID);
0104     #endif
0105   }
0106 };
0107 
0108 
0109 template<>
0110 struct gebp_traits <double,double,false,false,Architecture::NEON>
0111  : gebp_traits<double,double,false,false,Architecture::Generic>
0112 {
0113   typedef double RhsPacket;
0114 
0115   struct RhsPacketx4 {
0116     float64x2_t B_0, B_1;
0117   };
0118 
0119   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
0120   {
0121     dest = *b;
0122   }
0123 
0124   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
0125   {
0126     dest.B_0 = vld1q_f64(b);
0127     dest.B_1 = vld1q_f64(b+2);
0128   }
0129 
0130   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
0131   {
0132     loadRhs(b,dest);
0133   }
0134 
0135   EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
0136   {}
0137 
0138   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
0139   {
0140     loadRhs(b,dest);
0141   }
0142 
0143   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
0144   {
0145     c = vfmaq_n_f64(c, a, b);
0146   }
0147 
0148   // NOTE: Template parameter inference failed when compiled with Android NDK:
0149   // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
0150 
0151   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
0152   { madd_helper<0>(a, b, c); }
0153   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
0154   { madd_helper<1>(a, b, c); }
0155   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
0156   { madd_helper<2>(a, b, c); }
0157   EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
0158   { madd_helper<3>(a, b, c); }
0159 
0160  private:
0161   template <int LaneID>
0162   EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
0163   {
0164     #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
0165     // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
0166     // vfmaq_laneq_f64 is implemented through a costly dup
0167          if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
0168     else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
0169     else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
0170     else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
0171     #else
0172          if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
0173     else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
0174     else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
0175     else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
0176     #endif
0177   }
0178 };
0179 
0180 #endif // EIGEN_ARCH_ARM64
0181 
0182 }  // namespace internal
0183 }  // namespace Eigen