#ifndef XSIMD_NEON64_HPP
#define XSIMD_NEON64_HPP

// <algorithm>, <cassert> and <type_traits> back std::max/std::min, assert and
// the enable_if helpers used further down in this header.
#include <algorithm>
#include <cassert>
#include <complex>
#include <cstddef>
#include <tuple>
#include <type_traits>

#include "../types/xsimd_neon64_register.hpp"
#include "../types/xsimd_utils.hpp"

namespace xsimd
{
    template <typename T, class A, bool... Values>
    struct batch_bool_constant;

    namespace kernel
    {
        using namespace types;

        template <class A, class T, detail::enable_sized_t<T, 4> = 0>
        XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
        {
            return vminvq_u32(arg) == ~0U;
        }

        template <class A, class T, detail::enable_sized_t<T, 1> = 0>
        XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
        {
            return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {});
        }

        template <class A, class T, detail::enable_sized_t<T, 2> = 0>
        XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
        {
            return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {});
        }

        template <class A, class T, detail::enable_sized_t<T, 8> = 0>
        XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
        {
            return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {});
        }

        template <class A, class T, detail::enable_sized_t<T, 4> = 0>
        XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
        {
            return vmaxvq_u32(arg) != 0;
        }

        template <class A, class T, detail::enable_sized_t<T, 1> = 0>
        XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
        {
            return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {});
        }

        template <class A, class T, detail::enable_sized_t<T, 2> = 0>
        XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
        {
            return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {});
        }

        template <class A, class T, detail::enable_sized_t<T, 8> = 0>
        XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
        {
            return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {});
        }
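        // Note: all of the `all`/`any` overloads above reduce through the 32-bit
        // lane view and the AArch64 across-vector reductions vminvq_u32/vmaxvq_u32,
        // so a single instruction replaces the pairwise folding needed on 32-bit NEON.
        //
        // Usage sketch (hypothetical caller code, assuming the public xsimd API;
        // not part of this header):
        //
        //     xsimd::batch<double, xsimd::neon64> x(1.0, 2.0);
        //     bool every = xsimd::all(x == x); // dispatches to all(batch_bool<double>, neon64)
        //     bool some  = xsimd::any(x != x); // dispatches to any(batch_bool<double>, neon64)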

        template <class A, class T>
        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon64>) noexcept
        {
            return broadcast<A>(val, neon {});
        }

        template <class A>
        XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<neon64>) noexcept
        {
            return vdupq_n_f64(val);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<neon64>, double d0, double d1) noexcept
        {
            return float64x2_t { d0, d1 };
        }

        template <class A>
        XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<neon64>, bool b0, bool b1) noexcept
        {
            using register_type = typename batch_bool<double, A>::register_type;
            using unsigned_type = as_unsigned_integer_t<double>;
            return register_type { static_cast<unsigned_type>(b0 ? -1LL : 0LL),
                                   static_cast<unsigned_type>(b1 ? -1LL : 0LL) };
        }

        template <class A>
        XSIMD_INLINE batch<double, A> from_bool(batch_bool<double, A> const& arg, requires_arch<neon64>) noexcept
        {
            return vreinterpretq_f64_u64(vandq_u64(arg, vreinterpretq_u64_f64(vdupq_n_f64(1.))));
        }

#if defined(__clang__) || defined(__GNUC__)
#define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
#elif defined(_MSC_VER)
#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
#else
#define xsimd_aligned_load(inst, type, expr) inst((type)expr)
#endif

        template <class A>
        XSIMD_INLINE batch<double, A> load_aligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
        {
            return xsimd_aligned_load(vld1q_f64, double*, src);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> load_unaligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
        {
            return vld1q_f64(src);
        }
#undef xsimd_aligned_load
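        // The xsimd_aligned_load macro only communicates the 16-byte alignment
        // guarantee to the compiler (__builtin_assume_aligned on GCC/Clang, the
        // *_ex intrinsic form on MSVC); AArch64 vld1q_f64 itself accepts any
        // alignment, so the unaligned path can simply call the intrinsic directly.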

        template <class A>
        XSIMD_INLINE void store_aligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
        {
            vst1q_f64(dst, src);
        }

        template <class A>
        XSIMD_INLINE void store_unaligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
        {
            return store_aligned<A>(dst, src, A {});
        }

        template <class A>
        XSIMD_INLINE batch<std::complex<double>, A> load_complex_aligned(std::complex<double> const* mem, convert<std::complex<double>>, requires_arch<neon64>) noexcept
        {
            using real_batch = batch<double, A>;
            const double* buf = reinterpret_cast<const double*>(mem);
            float64x2x2_t tmp = vld2q_f64(buf);
            real_batch real = tmp.val[0],
                       imag = tmp.val[1];
            return batch<std::complex<double>, A> { real, imag };
        }

        template <class A>
        XSIMD_INLINE batch<std::complex<double>, A> load_complex_unaligned(std::complex<double> const* mem, convert<std::complex<double>> cvt, requires_arch<neon64>) noexcept
        {
            return load_complex_aligned<A>(mem, cvt, A {});
        }

        template <class A>
        XSIMD_INLINE void store_complex_aligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
        {
            float64x2x2_t tmp;
            tmp.val[0] = src.real();
            tmp.val[1] = src.imag();
            double* buf = reinterpret_cast<double*>(dst);
            vst2q_f64(buf, tmp);
        }

        template <class A>
        XSIMD_INLINE void store_complex_unaligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
        {
            store_complex_aligned(dst, src, A {});
        }
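        // std::complex<double> is stored interleaved in memory (re0, im0, re1, im1, ...);
        // vld2q_f64 / vst2q_f64 de-interleave and re-interleave in one instruction, so
        // batch<std::complex<double>> can keep separate real and imaginary registers.
        //
        // Minimal usage sketch (hypothetical caller, assuming the public xsimd API):
        //
        //     std::complex<double> buf[2] = { { 1.0, 2.0 }, { 3.0, 4.0 } };
        //     auto z = xsimd::batch<std::complex<double>, xsimd::neon64>::load_unaligned(buf);
        //     // z.real() == {1.0, 3.0}, z.imag() == {2.0, 4.0}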

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vreinterpretq_u64_s64(vnegq_s64(vreinterpretq_s64_u64(rhs)));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vnegq_s64(rhs);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> neg(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vnegq_f64(rhs);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> add(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vaddq_f64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> sadd(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return add(lhs, rhs, neon64 {});
        }

        template <class A>
        XSIMD_INLINE batch<double, A> sub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vsubq_f64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> ssub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return sub(lhs, rhs, neon64 {});
        }

        template <class A>
        XSIMD_INLINE batch<double, A> mul(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vmulq_f64(lhs, rhs);
        }

#if defined(XSIMD_FAST_INTEGER_DIVISION)
        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcvtq_u64_f64(vcvtq_f64_u64(lhs) / vcvtq_f64_u64(rhs));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcvtq_s64_f64(vcvtq_f64_s64(lhs) / vcvtq_f64_s64(rhs));
        }
#endif

        template <class A>
        XSIMD_INLINE batch<double, A> div(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vdivq_f64(lhs, rhs);
        }
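        // When XSIMD_FAST_INTEGER_DIVISION is defined, 64-bit integer division is
        // approximated by a round trip through float64x2_t; values whose magnitude
        // exceeds 2^53 (or quotients that round differently) can therefore differ
        // from exact integer division. The double overload maps directly to vdivq_f64.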

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vceqq_u64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vceqq_s64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vceqq_f64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vceqq_u64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vceqq_u64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch_bool<double, A> eq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vceqq_u64(lhs, rhs);
        }

        namespace detail
        {
            template <class A>
            XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
            {
                return vcvtq_f64_s64(x);
            }

            template <class A>
            XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
            {
                return vcvtq_f64_u64(x);
            }

            template <class A>
            XSIMD_INLINE batch<int64_t, A> fast_cast(batch<double, A> const& x, batch<int64_t, A> const&, requires_arch<neon64>) noexcept
            {
                return vcvtq_s64_f64(x);
            }

            template <class A>
            XSIMD_INLINE batch<uint64_t, A> fast_cast(batch<double, A> const& x, batch<uint64_t, A> const&, requires_arch<neon64>) noexcept
            {
                return vcvtq_u64_f64(x);
            }
        }
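        // These detail::fast_cast overloads back the value-converting casts between
        // 64-bit integers and double, which 32-bit NEON has to emulate in software.
        //
        // Hypothetical usage through the public API (a sketch, not part of this header):
        //
        //     xsimd::batch<int64_t, xsimd::neon64> i(1, 2);
        //     auto d = xsimd::batch_cast<double>(i); // ends up in fast_cast via vcvtq_f64_s64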

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcltq_u64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcltq_s64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcltq_f64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcleq_u64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcleq_s64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcleq_f64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcgtq_u64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcgtq_s64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcgtq_f64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcgeq_u64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcgeq_s64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vcgeq_f64(lhs, rhs);
        }

        template <class A, class T_out, class T_in>
        XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon64>) noexcept
        {
            using register_type = typename batch_bool<T_out, A>::register_type;
            return register_type(self);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(lhs),
                                                   vreinterpretq_u64_f64(rhs)));
        }

        template <class A>
        XSIMD_INLINE batch_bool<double, A> bitwise_and(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vandq_u64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(lhs),
                                                   vreinterpretq_u64_f64(rhs)));
        }

        template <class A>
        XSIMD_INLINE batch_bool<double, A> bitwise_or(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vorrq_u64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(lhs),
                                                   vreinterpretq_u64_f64(rhs)));
        }

        template <class A>
        XSIMD_INLINE batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return veorq_u64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch_bool<double, A> neq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return bitwise_xor(lhs, rhs, A {});
        }

        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_f64(rhs)));
        }

        template <class A>
        XSIMD_INLINE batch_bool<double, A> bitwise_not(batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return detail::bitwise_not_u64(rhs);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(lhs),
                                                   vreinterpretq_u64_f64(rhs)));
        }

        template <class A>
        XSIMD_INLINE batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vbicq_u64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> min(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vminq_f64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> max(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vmaxq_f64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return rhs;
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vabsq_s64(rhs);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> abs(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vabsq_f64(rhs);
        }

        template <class A>
        XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
                                                        requires_arch<neon64>) noexcept
        {
            return vcvtnq_s32_f32(self);
        }

#if !defined(__GNUC__)
        template <class A>
        XSIMD_INLINE batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
                                                        requires_arch<neon64>) noexcept
        {
            return vcvtnq_s64_f64(self);
        }
#endif

        template <class A>
        XSIMD_INLINE batch<double, A>
        reciprocal(const batch<double, A>& x,
                   kernel::requires_arch<neon64>) noexcept
        {
            return vrecpeq_f64(x);
        }
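        // vrecpeq_f64 (and vrsqrteq_f64 below) return low-precision hardware
        // estimates, not IEEE-accurate results; callers that need full precision
        // are expected to refine the estimate (e.g. with Newton-Raphson steps via
        // vrecpsq_f64 / vrsqrtsq_f64) or fall back to div / sqrt.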

        template <class A>
        XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vrsqrteq_f64(rhs);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vsqrtq_f64(rhs);
        }

#ifdef __ARM_FEATURE_FMA
        template <class A>
        XSIMD_INLINE batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
        {
            return vfmaq_f64(z, x, y);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
        {
            return vfmaq_f64(-z, x, y);
        }
#endif

        template <class A>
        XSIMD_INLINE batch<double, A> haddp(const batch<double, A>* row, requires_arch<neon64>) noexcept
        {
            return vpaddq_f64(row[0], row[1]);
        }
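        // haddp expects an array of batch<double>::size (= 2) rows; vpaddq_f64 adds
        // adjacent pairs, so lane 0 receives row[0][0] + row[0][1] and lane 1
        // receives row[1][0] + row[1][1], i.e. one horizontal sum per input row.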

        template <class A, size_t I>
        XSIMD_INLINE batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<neon64>) noexcept
        {
            return vsetq_lane_f64(val, self, I);
        }

#define WRAP_REDUCER_INT_EXCLUDING_64(OP) \
    namespace wrap \
    { \
        XSIMD_INLINE uint8_t OP##_u8(uint8x16_t a) noexcept \
        { \
            return ::OP##_u8(a); \
        } \
        XSIMD_INLINE int8_t OP##_s8(int8x16_t a) noexcept \
        { \
            return ::OP##_s8(a); \
        } \
        XSIMD_INLINE uint16_t OP##_u16(uint16x8_t a) noexcept \
        { \
            return ::OP##_u16(a); \
        } \
        XSIMD_INLINE int16_t OP##_s16(int16x8_t a) noexcept \
        { \
            return ::OP##_s16(a); \
        } \
        XSIMD_INLINE uint32_t OP##_u32(uint32x4_t a) noexcept \
        { \
            return ::OP##_u32(a); \
        } \
        XSIMD_INLINE int32_t OP##_s32(int32x4_t a) noexcept \
        { \
            return ::OP##_s32(a); \
        } \
    }

#define WRAP_REDUCER_INT(OP) \
    WRAP_REDUCER_INT_EXCLUDING_64(OP) \
    namespace wrap \
    { \
        XSIMD_INLINE uint64_t OP##_u64(uint64x2_t a) noexcept \
        { \
            return ::OP##_u64(a); \
        } \
        XSIMD_INLINE int64_t OP##_s64(int64x2_t a) noexcept \
        { \
            return ::OP##_s64(a); \
        } \
    }

#define WRAP_REDUCER_FLOAT(OP) \
    namespace wrap \
    { \
        XSIMD_INLINE float OP##_f32(float32x4_t a) noexcept \
        { \
            return ::OP##_f32(a); \
        } \
        XSIMD_INLINE double OP##_f64(float64x2_t a) noexcept \
        { \
            return ::OP##_f64(a); \
        } \
    }
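        // The wrap:: layer exists because the dispatcher below stores plain function
        // pointers in a std::tuple; ACLE intrinsics such as vaddvq_u8 may be
        // implemented as macros or compiler builtins whose address cannot be taken,
        // so each one is wrapped in an ordinary inline function first.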

        namespace detail
        {
            template <class R>
            struct reducer_return_type_impl;

            template <>
            struct reducer_return_type_impl<uint8x16_t>
            {
                using type = uint8_t;
            };

            template <>
            struct reducer_return_type_impl<int8x16_t>
            {
                using type = int8_t;
            };

            template <>
            struct reducer_return_type_impl<uint16x8_t>
            {
                using type = uint16_t;
            };

            template <>
            struct reducer_return_type_impl<int16x8_t>
            {
                using type = int16_t;
            };

            template <>
            struct reducer_return_type_impl<uint32x4_t>
            {
                using type = uint32_t;
            };

            template <>
            struct reducer_return_type_impl<int32x4_t>
            {
                using type = int32_t;
            };

            template <>
            struct reducer_return_type_impl<uint64x2_t>
            {
                using type = uint64_t;
            };

            template <>
            struct reducer_return_type_impl<int64x2_t>
            {
                using type = int64_t;
            };

            template <>
            struct reducer_return_type_impl<float32x4_t>
            {
                using type = float;
            };

            template <>
            struct reducer_return_type_impl<float64x2_t>
            {
                using type = double;
            };

            template <class R>
            using reducer_return_type = typename reducer_return_type_impl<R>::type;

            template <class... T>
            struct neon_reducer_dispatcher_impl : neon_dispatcher_base<reducer_return_type, T...>
            {
            };

            using neon_reducer_dispatcher = neon_reducer_dispatcher_impl<uint8x16_t, int8x16_t,
                                                                         uint16x8_t, int16x8_t,
                                                                         uint32x4_t, int32x4_t,
                                                                         uint64x2_t, int64x2_t,
                                                                         float32x4_t, float64x2_t>;

            template <class T>
            using enable_neon64_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value,
                                                                 int>::type;
        }
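        // neon_reducer_dispatcher reuses the common NEON dispatcher machinery
        // (neon_dispatcher_base): its unary member holds one wrapped reducer per
        // NEON register type in a std::tuple, and apply(reg) picks the entry whose
        // parameter type matches the register passed in, so a single generic
        // reduce_* template below covers every arithmetic type. The scalar result
        // type is derived from the register type via reducer_return_type.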

        WRAP_REDUCER_INT(vaddvq)
        WRAP_REDUCER_FLOAT(vaddvq)

        template <class A, class T, detail::enable_neon64_type_t<T> = 0>
        XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon64>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::neon_reducer_dispatcher::unary dispatcher = {
                std::make_tuple(wrap::vaddvq_u8, wrap::vaddvq_s8, wrap::vaddvq_u16, wrap::vaddvq_s16,
                                wrap::vaddvq_u32, wrap::vaddvq_s32, wrap::vaddvq_u64, wrap::vaddvq_s64,
                                wrap::vaddvq_f32, wrap::vaddvq_f64)
            };
            return dispatcher.apply(register_type(arg));
        }
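        // Usage sketch (hypothetical caller, assuming the public xsimd API):
        //
        //     xsimd::batch<float, xsimd::neon64> v(1.f, 2.f, 3.f, 4.f);
        //     float s = xsimd::reduce_add(v); // 10.f, via wrap::vaddvq_f32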

        WRAP_REDUCER_INT_EXCLUDING_64(vmaxvq)
        WRAP_REDUCER_FLOAT(vmaxvq)

        namespace wrap
        {
            XSIMD_INLINE uint64_t vmaxvq_u64(uint64x2_t a) noexcept
            {
                return std::max(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
            }

            XSIMD_INLINE int64_t vmaxvq_s64(int64x2_t a) noexcept
            {
                return std::max(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
            }
        }
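        // AArch64 provides no vmaxvq_u64 / vmaxvq_s64 (nor the vminvq_* forms used
        // below), so the 64-bit reductions are emulated by extracting both lanes
        // with vdupd_laneq_* and comparing them on the scalar side.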

        template <class A, class T, detail::enable_neon64_type_t<T> = 0>
        XSIMD_INLINE typename batch<T, A>::value_type reduce_max(batch<T, A> const& arg, requires_arch<neon64>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::neon_reducer_dispatcher::unary dispatcher = {
                std::make_tuple(wrap::vmaxvq_u8, wrap::vmaxvq_s8, wrap::vmaxvq_u16, wrap::vmaxvq_s16,
                                wrap::vmaxvq_u32, wrap::vmaxvq_s32, wrap::vmaxvq_u64, wrap::vmaxvq_s64,
                                wrap::vmaxvq_f32, wrap::vmaxvq_f64)
            };
            return dispatcher.apply(register_type(arg));
        }

        WRAP_REDUCER_INT_EXCLUDING_64(vminvq)
        WRAP_REDUCER_FLOAT(vminvq)

        namespace wrap
        {
            XSIMD_INLINE uint64_t vminvq_u64(uint64x2_t a) noexcept
            {
                return std::min(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
            }

            XSIMD_INLINE int64_t vminvq_s64(int64x2_t a) noexcept
            {
                return std::min(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
            }
        }

        template <class A, class T, detail::enable_neon64_type_t<T> = 0>
        XSIMD_INLINE typename batch<T, A>::value_type reduce_min(batch<T, A> const& arg, requires_arch<neon64>) noexcept
        {
            using register_type = typename batch<T, A>::register_type;
            const detail::neon_reducer_dispatcher::unary dispatcher = {
                std::make_tuple(wrap::vminvq_u8, wrap::vminvq_s8, wrap::vminvq_u16, wrap::vminvq_s16,
                                wrap::vminvq_u32, wrap::vminvq_s32, wrap::vminvq_u64, wrap::vminvq_s64,
                                wrap::vminvq_f32, wrap::vminvq_f64)
            };
            return dispatcher.apply(register_type(arg));
        }

#undef WRAP_REDUCER_INT_EXCLUDING_64
#undef WRAP_REDUCER_INT
#undef WRAP_REDUCER_FLOAT

        template <class A>
        XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& a, batch<double, A> const& b, requires_arch<neon64>) noexcept
        {
            return vbslq_f64(cond, a, b);
        }

        template <class A, bool... b>
        XSIMD_INLINE batch<double, A> select(batch_bool_constant<double, A, b...> const&,
                                             batch<double, A> const& true_br,
                                             batch<double, A> const& false_br,
                                             requires_arch<neon64>) noexcept
        {
            return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {});
        }

        template <class A>
        XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<neon64>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
            matrix_begin[0] = vzip1q_f64(r0, r1);
            matrix_begin[1] = vzip2q_f64(r0, r1);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon64>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
            matrix_begin[0] = vzip1q_u64(r0, r1);
            matrix_begin[1] = vzip2q_u64(r0, r1);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon64>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
            matrix_begin[0] = vzip1q_s64(r0, r1);
            matrix_begin[1] = vzip2q_s64(r0, r1);
        }
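        // For 2x2 tiles of 64-bit lanes, a full transpose is just an interleave:
        // vzip1q picks the low lane of each row ([r0[0], r1[0]]) and vzip2q the high
        // lane ([r0[1], r1[1]]), which is exactly the rows swapped across the diagonal.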

        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip1q_u8(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip1q_s8(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip1q_u16(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip1q_s16(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip1q_u32(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip1q_s32(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip1q_u64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip1q_s64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip1q_f32(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip1q_f64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip2q_u8(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip2q_s8(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip2q_u16(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip2q_s16(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip2q_u32(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip2q_s32(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip2q_u64(lhs, rhs);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip2q_s64(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip2q_f32(lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vzip2q_f64(lhs, rhs);
        }
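        // zip_lo / zip_hi interleave the lower / upper halves of two batches.
        // Worked example for uint32_t lanes (a minimal sketch):
        //
        //     lhs = {a0, a1, a2, a3}, rhs = {b0, b1, b2, b3}
        //     zip_lo(lhs, rhs) -> {a0, b0, a1, b1}   (vzip1q_u32)
        //     zip_hi(lhs, rhs) -> {a2, b2, a3, b3}   (vzip2q_u32)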

        namespace detail
        {
            template <class A, size_t I, size_t... Is>
            XSIMD_INLINE batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n,
                                                       ::xsimd::detail::index_sequence<I, Is...>) noexcept
            {
                if (n == I)
                {
                    return vextq_f64(rhs, lhs, I);
                }
                else
                {
                    return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
                }
            }
        }

        template <class A>
        XSIMD_INLINE batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n, requires_arch<neon64>) noexcept
        {
            constexpr std::size_t size = batch<double, A>::size;
            assert(n < size && "index in bounds");
            return detail::extract_pair(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>());
        }
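        // vextq_f64 needs its shift amount as an immediate, so the runtime index n
        // is turned into a compile-time constant by walking an index_sequence: each
        // recursion level compares n against one candidate I and, on a match, issues
        // vextq_f64 with that literal I (the empty-sequence base case is provided by
        // the common NEON implementation's detail namespace).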

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
        {
            return bitwise_rshift<A>(lhs, n, neon {});
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vshlq_u64(lhs, vnegq_s64(rhs));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
        {
            return bitwise_rshift<A>(lhs, n, neon {});
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
        {
            return vshlq_s64(lhs, vnegq_s64(rhs));
        }
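        // NEON has no per-lane variable right shift; vshlq_* shifts right when the
        // per-lane count is negative, so the counts are negated with vnegq_s64.
        // The scalar-count overloads simply defer to the common NEON implementation.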

#define WRAP_CAST(SUFFIX, TYPE) \
    namespace wrap \
    { \
        XSIMD_INLINE float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) noexcept \
        { \
            return ::vreinterpretq_f64_##SUFFIX(a); \
        } \
        XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) noexcept \
        { \
            return ::vreinterpretq_##SUFFIX##_f64(a); \
        } \
    }

        WRAP_CAST(u8, uint8x16_t)
        WRAP_CAST(s8, int8x16_t)
        WRAP_CAST(u16, uint16x8_t)
        WRAP_CAST(s16, int16x8_t)
        WRAP_CAST(u32, uint32x4_t)
        WRAP_CAST(s32, int32x4_t)
        WRAP_CAST(u64, uint64x2_t)
        WRAP_CAST(s64, int64x2_t)
        WRAP_CAST(f32, float32x4_t)

#undef WRAP_CAST

        template <class A, class T>
        XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
        {
            using caster_type = detail::bitwise_caster_impl<float64x2_t,
                                                            uint8x16_t, int8x16_t,
                                                            uint16x8_t, int16x8_t,
                                                            uint32x4_t, int32x4_t,
                                                            uint64x2_t, int64x2_t,
                                                            float32x4_t>;
            const caster_type caster = {
                std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16,
                                wrap::vreinterpretq_f64_u32, wrap::vreinterpretq_f64_s32, wrap::vreinterpretq_f64_u64, wrap::vreinterpretq_f64_s64,
                                wrap::vreinterpretq_f64_f32)
            };
            using register_type = typename batch<T, A>::register_type;
            return caster.apply(register_type(arg));
        }

        namespace detail
        {
            template <class S, class... R>
            struct bitwise_caster_neon64
            {
                using container_type = std::tuple<R (*)(S)...>;
                container_type m_func;

                template <class V>
                V apply(float64x2_t rhs) const
                {
                    using func_type = V (*)(float64x2_t);
                    auto func = xsimd::detail::get<func_type>(m_func);
                    return func(rhs);
                }
            };
        }

        template <class A, class R>
        XSIMD_INLINE batch<R, A> bitwise_cast(batch<double, A> const& arg, batch<R, A> const&, requires_arch<neon64>) noexcept
        {
            using caster_type = detail::bitwise_caster_neon64<float64x2_t,
                                                              uint8x16_t, int8x16_t,
                                                              uint16x8_t, int16x8_t,
                                                              uint32x4_t, int32x4_t,
                                                              uint64x2_t, int64x2_t,
                                                              float32x4_t>;
            const caster_type caster = {
                std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64,
                                wrap::vreinterpretq_u32_f64, wrap::vreinterpretq_s32_f64, wrap::vreinterpretq_u64_f64, wrap::vreinterpretq_s64_f64,
                                wrap::vreinterpretq_f32_f64)
            };
            using src_register_type = typename batch<double, A>::register_type;
            using dst_register_type = typename batch<R, A>::register_type;
            return caster.apply<dst_register_type>(src_register_type(arg));
        }

        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_cast(batch<double, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
        {
            return arg;
        }
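        // Both bitwise_cast directions reuse the tuple-of-function-pointer trick:
        // detail::bitwise_caster_impl (from the common NEON header) selects the
        // vreinterpretq_f64_* wrapper by source register type, while
        // detail::bitwise_caster_neon64 selects the vreinterpretq_*_f64 wrapper by
        // requested destination register type. No data is converted; only the
        // register's type tag changes.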

        template <class A>
        XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& arg, requires_arch<neon64>) noexcept
        {
            return !(arg == arg);
        }

        template <size_t N, class A>
        XSIMD_INLINE batch<double, A> rotate_left(batch<double, A> const& a, requires_arch<neon64>) noexcept
        {
            return vextq_f64(a, a, N);
        }
    }

    template <typename T, class A, T... Values>
    struct batch_constant;

    namespace kernel
    {
        template <class A>
        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> idx,
                                               requires_arch<neon64>) noexcept
        {
            return vqtbl1q_u8(self, idx);
        }

        template <class A>
        XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch<uint8_t, A> idx,
                                              requires_arch<neon64>) noexcept
        {
            return vqtbl1q_s8(self, idx);
        }

        template <class A>
        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
                                                batch<uint16_t, A> idx,
                                                requires_arch<neon64>) noexcept
        {
            using batch_type = batch<uint8_t, A>;
            using index_type = batch<uint8_t, A>;
            return vreinterpretq_u16_u8(swizzle(batch_type(vreinterpretq_u8_u16(self)),
                                                index_type(vreinterpretq_u8_u16(idx * 0x0202 + 0x0100)),
                                                neon64 {}));
        }
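        // The 16-bit table lookup is built on the byte-wise vqtbl1q_u8: each 16-bit
        // index i must become the two byte indices {2i, 2i + 1}. Multiplying by
        // 0x0202 writes 2i into both bytes of the lane (2i * 0x0101), and adding
        // 0x0100 bumps the high byte to 2i + 1. For example, i = 3 gives
        // 3 * 0x0202 + 0x0100 = 0x0706, i.e. bytes {0x06, 0x07} on a little-endian
        // lane, which selects both bytes of source element 3. The 32-bit and 64-bit
        // overloads below use the same trick with wider constants.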

        template <class A>
        XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
                                               batch<uint16_t, A> idx,
                                               requires_arch<neon64>) noexcept
        {
            return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), idx, neon64 {}));
        }

        template <class A>
        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
                                                batch<uint32_t, A> idx,
                                                requires_arch<neon64>) noexcept
        {
            using batch_type = batch<uint8_t, A>;
            using index_type = batch<uint8_t, A>;
            return vreinterpretq_u32_u8(swizzle(batch_type(vreinterpretq_u8_u32(self)),
                                                index_type(vreinterpretq_u8_u32(idx * 0x04040404 + 0x03020100)),
                                                neon64 {}));
        }

        template <class A>
        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
                                               batch<uint32_t, A> idx,
                                               requires_arch<neon64>) noexcept
        {
            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), idx, neon64 {}));
        }

        template <class A>
        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
                                                batch<uint64_t, A> idx,
                                                requires_arch<neon64>) noexcept
        {
            using batch_type = batch<uint8_t, A>;
            using index_type = batch<uint8_t, A>;
            return vreinterpretq_u64_u8(swizzle(batch_type(vreinterpretq_u8_u64(self)),
                                                index_type(vreinterpretq_u8_u64(idx * 0x0808080808080808ull + 0x0706050403020100ull)),
                                                neon64 {}));
        }

        template <class A>
        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
                                               batch<uint64_t, A> idx,
                                               requires_arch<neon64>) noexcept
        {
            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), idx, neon64 {}));
        }

        template <class A>
        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
                                             batch<uint32_t, A> idx,
                                             requires_arch<neon64>) noexcept
        {
            return bitwise_cast<float>(swizzle(bitwise_cast<uint32_t>(self), idx, neon64 {}));
        }

        template <class A>
        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self,
                                              batch<uint64_t, A> idx,
                                              requires_arch<neon64>) noexcept
        {
            return bitwise_cast<double>(swizzle(bitwise_cast<uint64_t>(self), idx, neon64 {}));
        }

        namespace detail
        {
            using ::xsimd::batch_constant;
            using ::xsimd::detail::integer_sequence;
            using ::xsimd::detail::make_integer_sequence;

            template <class CB1, class CB2, class IS>
            struct index_burst_impl;

            template <typename T1, class A, typename T2, T2... V,
                      T2... incr>
            struct index_burst_impl<batch_constant<T1, A>, batch_constant<T2, A, V...>,
                                    integer_sequence<T2, incr...>>
            {
                using type = batch_constant<T2, A, V...>;
            };

            template <typename T1, class A, T1 V0, T1... V1,
                      typename T2, T2... V2, T2... incr>
            struct index_burst_impl<batch_constant<T1, A, V0, V1...>, batch_constant<T2, A, V2...>,
                                    integer_sequence<T2, incr...>>
            {
                using next_input = batch_constant<T1, A, V1...>;
                using next_output = batch_constant<T2, A, V2..., (V0 + incr)...>;
                using type = typename index_burst_impl<next_input, next_output, integer_sequence<T2, incr...>>::type;
            };

            template <class B, class T>
            struct index_burst;

            template <typename Tp, class A, Tp... V, typename T>
            struct index_burst<batch_constant<Tp, A, V...>, T>
            {
                static constexpr size_t mul = sizeof(Tp) / sizeof(T);
                using input = batch_constant<Tp, A, (mul * V)...>;
                using output = batch_constant<T, A>;
                using type = typename index_burst_impl<input, output, make_integer_sequence<T, mul>>::type;
            };

            template <class B, typename T>
            using index_burst_t = typename index_burst<B, T>::type;

            template <typename T, class B>
            XSIMD_INLINE index_burst_t<B, T> burst_index(B)
            {
                return index_burst_t<B, T>();
            }
        }
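        // burst_index is the compile-time counterpart of the index widening above:
        // an element-level batch_constant is "burst" into a byte-level one so the
        // constant swizzles below can feed vqtbl1q_u8 directly. For instance
        // (a sketch of the metaprogram, types abbreviated):
        //
        //     batch_constant<uint32_t, A, 1, 0, 3, 2>
        //         -> batch_constant<uint8_t, A, 4, 5, 6, 7,  0, 1, 2, 3,
        //                                       12, 13, 14, 15,  8, 9, 10, 11>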

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self,
                                               batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
                                               requires_arch<neon64>) noexcept
        {
            return vqtbl1q_u8(self, batch<uint8_t, A>(idx));
        }

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self,
                                              batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
                                              requires_arch<neon64>) noexcept
        {
            return vqtbl1q_s8(self, batch<uint8_t, A>(idx));
        }

        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
                                                batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> idx,
                                                requires_arch<neon64>) noexcept
        {
            using batch_type = batch<uint8_t, A>;
            return vreinterpretq_u16_u8(swizzle<A>(batch_type(vreinterpretq_u8_u16(self)), detail::burst_index<uint8_t>(idx), A()));
        }

        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
                                               batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> idx,
                                               requires_arch<neon64>) noexcept
        {
            using batch_type = batch<int8_t, A>;
            return vreinterpretq_s16_s8(swizzle<A>(batch_type(vreinterpretq_s8_s16(self)), detail::burst_index<uint8_t>(idx), A()));
        }

        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
                                                batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
                                                requires_arch<neon64>) noexcept
        {
            using batch_type = batch<uint8_t, A>;
            return vreinterpretq_u32_u8(swizzle<A>(batch_type(vreinterpretq_u8_u32(self)), detail::burst_index<uint8_t>(idx), A()));
        }

        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
                                               batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
                                               requires_arch<neon64>) noexcept
        {
            using batch_type = batch<int8_t, A>;
            return vreinterpretq_s32_s8(swizzle<A>(batch_type(vreinterpretq_s8_s32(self)), detail::burst_index<uint8_t>(idx), A()));
        }

        template <class A, uint64_t V0, uint64_t V1>
        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
                                                batch_constant<uint64_t, A, V0, V1> idx,
                                                requires_arch<neon64>) noexcept
        {
            using batch_type = batch<uint8_t, A>;
            return vreinterpretq_u64_u8(swizzle<A>(batch_type(vreinterpretq_u8_u64(self)), detail::burst_index<uint8_t>(idx), A()));
        }

        template <class A, uint64_t V0, uint64_t V1>
        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
                                               batch_constant<uint64_t, A, V0, V1> idx,
                                               requires_arch<neon64>) noexcept
        {
            using batch_type = batch<int8_t, A>;
            return vreinterpretq_s64_s8(swizzle<A>(batch_type(vreinterpretq_s8_s64(self)), detail::burst_index<uint8_t>(idx), A()));
        }

        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
                                             batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
                                             requires_arch<neon64>) noexcept
        {
            using batch_type = batch<uint8_t, A>;
            return vreinterpretq_f32_u8(swizzle<A>(batch_type(vreinterpretq_u8_f32(self)), detail::burst_index<uint8_t>(idx), A()));
        }

        template <class A, uint64_t V0, uint64_t V1>
        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self,
                                              batch_constant<uint64_t, A, V0, V1> idx,
                                              requires_arch<neon64>) noexcept
        {
            using batch_type = batch<uint8_t, A>;
            return vreinterpretq_f64_u8(swizzle<A>(batch_type(vreinterpretq_u8_f64(self)), detail::burst_index<uint8_t>(idx), A()));
        }

        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
        XSIMD_INLINE batch<std::complex<float>, A> swizzle(batch<std::complex<float>, A> const& self,
                                                           batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
                                                           requires_arch<neon64>) noexcept
        {
            return batch<std::complex<float>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
        }

        template <class A, uint64_t V0, uint64_t V1>
        XSIMD_INLINE batch<std::complex<double>, A> swizzle(batch<std::complex<double>, A> const& self,
                                                            batch_constant<uint64_t, A, V0, V1> idx,
                                                            requires_arch<neon64>) noexcept
        {
            return batch<std::complex<double>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
        }
    }
}

#endif