Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-19 09:51:46

0001 // This file is part of Eigen, a lightweight C++ template library
0002 // for linear algebra.
0003 //
0004 // Copyright (C) 2020, Arm Limited and Contributors
0005 //
0006 // This Source Code Form is subject to the terms of the Mozilla
0007 // Public License v. 2.0. If a copy of the MPL was not distributed
0008 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
0009 
0010 #ifndef EIGEN_PACKET_MATH_SVE_H
0011 #define EIGEN_PACKET_MATH_SVE_H
0012 
0013 namespace Eigen
0014 {
0015 namespace internal
0016 {
0017 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
0018 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
0019 #endif
0020 
0021 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
0022 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
0023 #endif
0024 
0025 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
0026 
// Number of Scalar lanes that fit in an SVE vector of SVEVectorLength bits
// (e.g. a 512-bit VL with 4-byte scalars yields 16 lanes).
template <typename Scalar, int SVEVectorLength>
struct sve_packet_size_selector {
  enum { size = SVEVectorLength / (sizeof(Scalar) * CHAR_BIT) };
};
0031 
/********************************* int32 **************************************/
// Fixed-length SVE vector of int32 lanes; the arm_sve_vector_bits attribute
// pins the sizeless SVE type to EIGEN_ARM64_SVE_VL bits at compile time.
typedef svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));

// Advertises which packet operations the int32 SVE packet supports to
// Eigen's vectorization layer.
template <>
struct packet_traits<numext::int32_t> : default_packet_traits {
  typedef PacketXi type;
  typedef PacketXi half;  // Half not implemented yet
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
    HasHalfPacket = 0,

    HasAdd = 1,
    HasSub = 1,
    HasShift = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbs2 = 1,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasSetLinear = 0,
    HasBlend = 0,
    HasReduxp = 0  // Not implemented in SVE
  };
};
0061 
// Reverse mapping: properties of the PacketXi type itself.
template <>
struct unpacket_traits<PacketXi> {
  typedef numext::int32_t type;
  typedef PacketXi half;  // Half not yet implemented
  enum {
    size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
    alignment = Aligned64,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
0074 
// Prefetch 32-bit data at addr into the L1 cache for a subsequent load
// (SV_PLDL1KEEP = load, L1, temporal).
template <>
EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr)
{
  svprfw(svptrue_b32(), addr, SV_PLDL1KEEP);
}
0080 
// Broadcast a single int32 to all lanes.
template <>
EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(const numext::int32_t& from)
{
  return svdup_n_s32(from);
}
0086 
0087 template <>
0088 EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a)
0089 {
0090   numext::int32_t c[packet_traits<numext::int32_t>::size];
0091   for (int i = 0; i < packet_traits<numext::int32_t>::size; i++) c[i] = i;
0092   return svadd_s32_z(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
0093 }
0094 
// Element-wise int32 arithmetic. Every op uses the zeroing (_z) predicated
// form with all lanes active (svptrue_b32), so all lanes are computed.
template <>
EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b)
{
  return svadd_s32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b)
{
  return svsub_s32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a)
{
  return svneg_s32_z(svptrue_b32(), a);
}

// Conjugation is the identity for real integers.
template <>
EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a)
{
  return a;
}

template <>
EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b)
{
  return svmul_s32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b)
{
  return svdiv_s32_z(svptrue_b32(), a, b);
}

// Fused multiply-add: c + a * b (svmla argument order is acc, op1, op2).
template <>
EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c)
{
  return svmla_s32_z(svptrue_b32(), c, a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b)
{
  return svmin_s32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b)
{
  return svmax_s32_z(svptrue_b32(), a, b);
}
0148 
0149 template <>
0150 EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b)
0151 {
0152   return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
0153 }
0154 
0155 template <>
0156 EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b)
0157 {
0158   return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
0159 }
0160 
0161 template <>
0162 EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b)
0163 {
0164   return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu);
0165 }
0166 
// All-ones mask (every lane = 0xffffffff); the input packet is only used
// for overload resolution.
template <>
EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/)
{
  return svdup_n_s32_z(svptrue_b32(), 0xffffffffu);
}

// All-zeros packet; the input is only used for overload resolution.
template <>
EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/)
{
  return svdup_n_s32_z(svptrue_b32(), 0);
}

template <>
EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b)
{
  return svand_s32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b)
{
  return svorr_s32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b)
{
  return sveor_s32_z(svptrue_b32(), a, b);
}

// a AND NOT b (svbic = bit clear).
template <>
EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b)
{
  return svbic_s32_z(svptrue_b32(), a, b);
}
0202 
// Arithmetic (sign-extending) right shift by compile-time immediate N.
// NOTE(review): svasrd is the "shift right for divide" form, which rounds
// toward zero for negative inputs, whereas a plain arithmetic shift floors.
// Confirm callers expect divide-rounding semantics here.
template <int N>
EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a)
{
  return svasrd_n_s32_z(svptrue_b32(), a, N);
}

// Logical (zero-filling) right shift: done in the unsigned domain via
// reinterpret casts, since svlsr is only defined on unsigned elements.
template <int N>
EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a)
{
  return svreinterpret_s32_u32(svlsr_u32_z(svptrue_b32(), svreinterpret_u32_s32(a), svdup_n_u32_z(svptrue_b32(), N)));
}

// Left shift by compile-time immediate N (shift amounts are unsigned).
template <int N>
EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a)
{
  return svlsl_s32_z(svptrue_b32(), a, svdup_n_u32_z(svptrue_b32(), N));
}
0220 
// Aligned load. SVE svld1 has no alignment requirement, so aligned and
// unaligned loads map to the same instruction.
template <>
EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(const numext::int32_t* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
}

// Unaligned load (same instruction as pload; see above).
template <>
EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(const numext::int32_t* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
}

// Load with each source element duplicated twice: {a0, a0, a1, a1, ...}.
template <>
EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from)
{
  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
}

// Load with each source element duplicated four times:
// {a0, a0, a0, a0, a1, a1, a1, a1, ...}.
template <>
EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from)
{
  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
}
0249 
// Aligned store; like the loads, SVE svst1 has no alignment requirement.
template <>
EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
{
  EIGEN_DEBUG_ALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
}

// Unaligned store (same instruction as pstore).
template <>
EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
}

// Strided load via a gather.
template <>
EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride)
{
  // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
  svint32_t indices = svindex_s32(0, stride);
  return svld1_gather_s32index_s32(svptrue_b32(), from, indices);
}

// Strided store via a scatter.
template <>
EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from, Index stride)
{
  // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
  svint32_t indices = svindex_s32(0, stride);
  svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
}
0277 
// Extract lane 0.
template <>
EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a)
{
  // svlasta returns the first element if all predicate bits are 0
  return svlasta_s32(svpfalse_b(), a);
}

// Reverse the lane order.
template <>
EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a)
{
  return svrev_s32(a);
}

// Element-wise absolute value.
template <>
EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a)
{
  return svabs_s32_z(svptrue_b32(), a);
}
0296 
// Sum of all lanes, via the horizontal add-reduce instruction.
template <>
EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(const PacketXi& a)
{
  return static_cast<numext::int32_t>(svaddv_s32(svptrue_b32(), a));
}

// Product of all lanes. SVE has no multiply-reduce instruction, so a
// log2(lanes)-step halving tree is used: multiplying the vector by its
// reverse leaves the full product of each mirrored pair in every lane of the
// low half; each subsequent svtbl pulls the upper half of the still-valid
// prefix down and multiplies again, until lane 0 holds the total product.
// The VL comparisons are against compile-time constants, so dead branches
// are eliminated. Only valid for VLs that are multiples of 128 bits
// (enforced below).
template <>
EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a)
{
  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
                      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);

  // Multiply the vector by its reverse
  svint32_t prod = svmul_s32_z(svptrue_b32(), a, svrev_s32(a));
  svint32_t half_prod;

  // Extract the high half of the vector. Depending on the VL more reductions need to be done
  if (EIGEN_ARM64_SVE_VL >= 2048) {
    half_prod = svtbl_s32(prod, svindex_u32(32, 1));
    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 1024) {
    half_prod = svtbl_s32(prod, svindex_u32(16, 1));
    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 512) {
    half_prod = svtbl_s32(prod, svindex_u32(8, 1));
    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 256) {
    half_prod = svtbl_s32(prod, svindex_u32(4, 1));
    prod = svmul_s32_z(svptrue_b32(), prod, half_prod);
  }
  // Last reduction
  half_prod = svtbl_s32(prod, svindex_u32(2, 1));
  prod = svmul_s32_z(svptrue_b32(), prod, half_prod);

  // The reduction is done to the first element.
  return pfirst<PacketXi>(prod);
}
0337 
// Minimum across all lanes (horizontal min-reduce).
template <>
EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(const PacketXi& a)
{
  return svminv_s32(svptrue_b32(), a);
}

// Maximum across all lanes (horizontal max-reduce).
template <>
EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(const PacketXi& a)
{
  return svmaxv_s32(svptrue_b32(), a);
}
0349 
0350 template <int N>
0351 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXi, N>& kernel) {
0352   int buffer[packet_traits<numext::int32_t>::size * N] = {0};
0353   int i = 0;
0354 
0355   PacketXi stride_index = svindex_s32(0, N);
0356 
0357   for (i = 0; i < N; i++) {
0358     svst1_scatter_s32index_s32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
0359   }
0360   for (i = 0; i < N; i++) {
0361     kernel.packet[i] = svld1_s32(svptrue_b32(), buffer + i * packet_traits<numext::int32_t>::size);
0362   }
0363 }
0364 
/********************************* float32 ************************************/

// Fixed-length SVE vector of float32 lanes (see PacketXi above).
typedef svfloat32_t PacketXf __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));

// Advertises which packet operations the float32 SVE packet supports.
template <>
struct packet_traits<float> : default_packet_traits {
  typedef PacketXf type;
  typedef PacketXf half;

  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
    HasHalfPacket = 0,

    HasAdd = 1,
    HasSub = 1,
    HasShift = 1,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbs2 = 1,
    HasMin = 1,
    HasMax = 1,
    HasConj = 1,
    HasSetLinear = 0,
    HasBlend = 0,
    HasReduxp = 0,  // Not implemented in SVE

    HasDiv = 1,
    HasFloor = 1,

    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 0,
    HasTanh = EIGEN_FAST_MATH,
    HasErf = EIGEN_FAST_MATH
  };
};
0407 
// Reverse mapping: properties of the PacketXf type itself. integer_packet
// is used by the math-function kernels for bit-level manipulation.
template <>
struct unpacket_traits<PacketXf> {
  typedef float type;
  typedef PacketXf half;  // Half not yet implemented
  typedef PacketXi integer_packet;

  enum {
    size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
    alignment = Aligned64,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};
0422 
// Broadcast a single float to all lanes.
template <>
EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from)
{
  return svdup_n_f32(from);
}

// Broadcast a raw 32-bit pattern to all lanes, reinterpreted as float.
template <>
EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from)
{
  return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), from));
}

// Returns {a, a+1, a+2, ...}: fill a buffer with the lane indices
// (the int -> float conversion is exact for these small values), then add
// the broadcast base.
template <>
EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a)
{
  float c[packet_traits<float>::size];
  for (int i = 0; i < packet_traits<float>::size; i++) c[i] = i;
  return svadd_f32_z(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
}
0442 
// Element-wise float32 arithmetic; all ops use the zeroing (_z) predicated
// form with every lane active.
template <>
EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return svadd_f32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return svsub_f32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a)
{
  return svneg_f32_z(svptrue_b32(), a);
}

// Conjugation is the identity for real floats.
template <>
EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a)
{
  return a;
}

template <>
EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return svmul_f32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return svdiv_f32_z(svptrue_b32(), a, b);
}

// Fused multiply-add: c + a * b (svmla argument order is acc, op1, op2).
template <>
EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c)
{
  return svmla_f32_z(svptrue_b32(), c, a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return svmin_f32_z(svptrue_b32(), a, b);
}

// NOTE(review): relies on svmin (FMIN) returning NaN when either input is
// NaN — confirm against the ACLE spec for the targeted toolchain.
template <>
EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return pmin<PacketXf>(a, b);
}

// svminnm (FMINNM) returns the numeric operand when exactly one is NaN.
template <>
EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return svminnm_f32_z(svptrue_b32(), a, b);
}

template <>
EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return svmax_f32_z(svptrue_b32(), a, b);
}

// See the NaN-propagation note on pmin<PropagateNaN> above.
template <>
EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return pmax<PacketXf>(a, b);
}

// svmaxnm (FMAXNM) returns the numeric operand when exactly one is NaN.
template <>
EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return svmaxnm_f32_z(svptrue_b32(), a, b);
}
0520 
0521 // Float comparisons in SVE return svbool (predicate). Use svdup to set active
0522 // lanes to 1 (0xffffffffu) and inactive lanes to 0.
0523 template <>
0524 EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b)
0525 {
0526   return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
0527 }
0528 
0529 template <>
0530 EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b)
0531 {
0532   return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
0533 }
0534 
0535 template <>
0536 EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b)
0537 {
0538   return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu));
0539 }
0540 
0541 // Do a predicate inverse (svnot_b_z) on the predicate resulted from the
0542 // greater/equal comparison (svcmpge_f32). Then fill a float vector with the
0543 // active elements.
0544 template <>
0545 EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b)
0546 {
0547   return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
0548 }
0549 
// Round each lane toward negative infinity (svrintm = FRINTM).
template <>
EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a)
{
  return svrintm_f32_z(svptrue_b32(), a);
}

// All-ones bit pattern in every lane (a NaN when viewed as float); the input
// is only used for overload resolution.
template <>
EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/)
{
  return svreinterpret_f32_u32(svdup_n_u32_z(svptrue_b32(), 0xffffffffu));
}
0561 
// Logical operations are not supported for float, so reinterpret to u32,
// operate there, and reinterpret back (bit patterns are preserved exactly).
template <>
EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return svreinterpret_f32_u32(svand_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
}

template <>
EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return svreinterpret_f32_u32(svorr_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
}

template <>
EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return svreinterpret_f32_u32(sveor_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
}

// a AND NOT b (svbic = bit clear), on the raw bit patterns.
template <>
EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b)
{
  return svreinterpret_f32_u32(svbic_u32_z(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
}
0586 
// Aligned load. SVE svld1 has no alignment requirement, so aligned and
// unaligned loads map to the same instruction.
template <>
EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from)
{
  EIGEN_DEBUG_ALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
}

// Unaligned load (same instruction as pload; see above).
template <>
EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
}

// Load with each source element duplicated twice: {a0, a0, a1, a1, ...}.
template <>
EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from)
{
  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
}

// Load with each source element duplicated four times:
// {a0, a0, a0, a0, a1, a1, a1, a1, ...}.
template <>
EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from)
{
  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
}
0615 
// Aligned store; like the loads, SVE svst1 has no alignment requirement.
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from)
{
  EIGEN_DEBUG_ALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
}

// Unaligned store (same instruction as pstore).
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from)
{
  EIGEN_DEBUG_UNALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
}

// Strided load via a gather.
template <>
EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride)
{
  // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
  svint32_t indices = svindex_s32(0, stride);
  return svld1_gather_s32index_f32(svptrue_b32(), from, indices);
}

// Strided store via a scatter.
template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride)
{
  // Index format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
  svint32_t indices = svindex_s32(0, stride);
  svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
}
0643 
// Extract lane 0.
template <>
EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a)
{
  // svlasta returns the first element if all predicate bits are 0
  return svlasta_f32(svpfalse_b(), a);
}

// Reverse the lane order.
template <>
EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a)
{
  return svrev_f32(a);
}

// Element-wise absolute value.
template <>
EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a)
{
  return svabs_f32_z(svptrue_b32(), a);
}

// TODO(tellenbach): Should this go into MathFunctions.h? If so, change for
// all vector extensions and the generic version.
// Decompose each lane into mantissa (returned) and exponent (out-param),
// delegating to Eigen's generic bit-twiddling implementation.
template <>
EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent)
{
  return pfrexp_generic(a, exponent);
}
0670 
// Sum of all lanes, via the horizontal add-reduce instruction.
template <>
EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a)
{
  return svaddv_f32(svptrue_b32(), a);
}

// Product of all lanes. SVE has no multiply-reduce instruction, so the same
// log2(lanes)-step halving tree as the int32 version is used: multiply by
// the reversed vector, then repeatedly fold the upper half of the valid
// prefix down with svtbl until lane 0 holds the total product. The VL
// comparisons are compile-time constants, so dead branches are eliminated.
// Only works for SVE VLs multiple of 128 (enforced below).
template <>
EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a)
{
  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0),
                      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
  // Multiply the vector by its reverse
  svfloat32_t prod = svmul_f32_z(svptrue_b32(), a, svrev_f32(a));
  svfloat32_t half_prod;

  // Extract the high half of the vector. Depending on the VL more reductions need to be done
  if (EIGEN_ARM64_SVE_VL >= 2048) {
    half_prod = svtbl_f32(prod, svindex_u32(32, 1));
    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 1024) {
    half_prod = svtbl_f32(prod, svindex_u32(16, 1));
    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 512) {
    half_prod = svtbl_f32(prod, svindex_u32(8, 1));
    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
  }
  if (EIGEN_ARM64_SVE_VL >= 256) {
    half_prod = svtbl_f32(prod, svindex_u32(4, 1));
    prod = svmul_f32_z(svptrue_b32(), prod, half_prod);
  }
  // Last reduction
  half_prod = svtbl_f32(prod, svindex_u32(2, 1));
  prod = svmul_f32_z(svptrue_b32(), prod, half_prod);

  // The reduction is done to the first element.
  return pfirst<PacketXf>(prod);
}
0713 
// Minimum across all lanes (horizontal min-reduce).
template <>
EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a)
{
  return svminv_f32(svptrue_b32(), a);
}

// Maximum across all lanes (horizontal max-reduce).
template <>
EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a)
{
  return svmaxv_f32(svptrue_b32(), a);
}
0725 
// In-place transpose of an N x packet_size block of float packets.
// Implemented by scattering each packet into a scratch buffer with stride N
// (writing it as a column) and reloading the rows contiguously.
template<int N>
EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel)
{
  float buffer[packet_traits<float>::size * N] = {0};
  int i = 0;

  PacketXi stride_index = svindex_s32(0, N);

  for (i = 0; i < N; i++) {
    svst1_scatter_s32index_f32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
  }

  for (i = 0; i < N; i++) {
    kernel.packet[i] = svld1_f32(svptrue_b32(), buffer + i * packet_traits<float>::size);
  }
}
0742 
// Scale each lane by 2^exponent (a * 2^e), delegating to Eigen's generic
// bit-twiddling implementation.
template<>
EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent)
{
  return pldexp_generic(a, exponent);
}
0748 
0749 }  // namespace internal
0750 }  // namespace Eigen
0751 
0752 #endif  // EIGEN_PACKET_MATH_SVE_H