0001 /***************************************************************************
0002  * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
0003  * Martin Renou                                                             *
0004  * Copyright (c) QuantStack                                                 *
0005  * Copyright (c) Serge Guelton                                              *
0006  *                                                                          *
0007  * Distributed under the terms of the BSD 3-Clause License.                 *
0008  *                                                                          *
0009  * The full license is in the file LICENSE, distributed with this software. *
0010  ****************************************************************************/
0011 
0012 #ifndef XSIMD_NEON64_HPP
0013 #define XSIMD_NEON64_HPP
0014 
0015 #include <complex>
0016 #include <cstddef>
0017 #include <tuple>
0018 
0019 #include "../types/xsimd_neon64_register.hpp"
0020 #include "../types/xsimd_utils.hpp"
0021 
0022 namespace xsimd
0023 {
0024     template <typename T, class A, bool... Values>
0025     struct batch_bool_constant;
0026 
0027     namespace kernel
0028     {
0029         using namespace types;
0030 
0031         /*******
0032          * all *
0033          *******/
0034 
0035         template <class A, class T, detail::enable_sized_t<T, 4> = 0>
0036         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
0037         {
0038             return vminvq_u32(arg) == ~0U;
0039         }
0040 
0041         template <class A, class T, detail::enable_sized_t<T, 1> = 0>
0042         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
0043         {
0044             return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {});
0045         }
0046 
0047         template <class A, class T, detail::enable_sized_t<T, 2> = 0>
0048         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
0049         {
0050             return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {});
0051         }
0052 
0053         template <class A, class T, detail::enable_sized_t<T, 8> = 0>
0054         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
0055         {
0056             return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {});
0057         }
0058 
0059         /*******
0060          * any *
0061          *******/
0062 
0063         template <class A, class T, detail::enable_sized_t<T, 4> = 0>
0064         XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
0065         {
0066             return vmaxvq_u32(arg) != 0;
0067         }
0068 
0069         template <class A, class T, detail::enable_sized_t<T, 1> = 0>
0070         XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
0071         {
0072             return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {});
0073         }
0074 
0075         template <class A, class T, detail::enable_sized_t<T, 2> = 0>
0076         XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
0077         {
0078             return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {});
0079         }
0080 
0081         template <class A, class T, detail::enable_sized_t<T, 8> = 0>
0082         XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
0083         {
0084             return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {});
0085         }
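
        // Both predicates collapse the mask with one across-vector reduction:
        // active lanes of a batch_bool are all-ones, so `all` holds when the
        // minimum 32-bit lane is still ~0U and `any` holds when the maximum
        // lane is non-zero.  The 8-, 16- and 64-bit overloads just reinterpret
        // their mask as uint32 lanes; the test still answers the same question
        // because every lane is built from all-ones / all-zero bytes.
        //
        // Illustrative use through the public API, assuming the default
        // architecture resolves to neon64 (2 double lanes):
        //
        //     xsimd::batch<double> x { 1.0, 2.0 };
        //     bool every_lane = xsimd::all(x == x);   // true
        //     bool some_lane  = xsimd::any(x != x);   // false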
0086 
0087         /*************
0088          * broadcast *
0089          *************/
0090 
0091         // Required to avoid ambiguous call
0092         template <class A, class T>
0093         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon64>) noexcept
0094         {
0095             return broadcast<A>(val, neon {});
0096         }
0097 
0098         template <class A>
0099         XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<neon64>) noexcept
0100         {
0101             return vdupq_n_f64(val);
0102         }
0103 
0104         /*******
0105          * set *
0106          *******/
0107 
0108         template <class A>
0109         XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<neon64>, double d0, double d1) noexcept
0110         {
0111             return float64x2_t { d0, d1 };
0112         }
0113 
0114         template <class A>
0115         XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<neon64>, bool b0, bool b1) noexcept
0116         {
0117             using register_type = typename batch_bool<double, A>::register_type;
0118             using unsigned_type = as_unsigned_integer_t<double>;
0119             return register_type { static_cast<unsigned_type>(b0 ? -1LL : 0LL),
0120                                    static_cast<unsigned_type>(b1 ? -1LL : 0LL) };
0121         }
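
        // batch_bool lanes are stored as full-width integer masks, so each
        // bool is materialized as all-ones (-1) or all-zero in the matching
        // unsigned 64-bit lane; the comparison and select kernels below rely
        // on that convention.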
0122 
0123         /*************
0124          * from_bool *
0125          *************/
0126 
0127         template <class A>
0128         XSIMD_INLINE batch<double, A> from_bool(batch_bool<double, A> const& arg, requires_arch<neon64>) noexcept
0129         {
0130             return vreinterpretq_f64_u64(vandq_u64(arg, vreinterpretq_u64_f64(vdupq_n_f64(1.))));
0131         }
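
        // The mask is turned into numeric 1.0 / 0.0 by AND-ing it with the bit
        // pattern of 1.0: all-ones lanes keep the full pattern, all-zero lanes
        // become +0.0.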
0132 
0133         /********
0134          * load *
0135          ********/
0136 #if defined(__clang__) || defined(__GNUC__)
0137 #define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
0138 #elif defined(_MSC_VER)
0139 #define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
0140 #else
0141 #define xsimd_aligned_load(inst, type, expr) inst((type)expr)
0142 #endif
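
        // xsimd_aligned_load lets the compiler exploit the 16-byte alignment
        // contract of load_aligned: clang/gcc are told via
        // __builtin_assume_aligned, MSVC via the *_ex intrinsics taking an
        // alignment hint in bits (128), and other compilers fall back to a
        // plain load.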
0143 
0144         template <class A>
0145         XSIMD_INLINE batch<double, A> load_aligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
0146         {
0147             return xsimd_aligned_load(vld1q_f64, double*, src);
0148         }
0149 
0150         template <class A>
0151         XSIMD_INLINE batch<double, A> load_unaligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
0152         {
0153             return vld1q_f64(src);
0154         }
0155 #undef xsimd_aligned_load
0156 
0157         /*********
0158          * store *
0159          *********/
0160 
0161         template <class A>
0162         XSIMD_INLINE void store_aligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
0163         {
0164             vst1q_f64(dst, src);
0165         }
0166 
0167         template <class A>
0168         XSIMD_INLINE void store_unaligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
0169         {
0170             return store_aligned<A>(dst, src, A {});
0171         }
0172 
0173         /****************
0174          * load_complex *
0175          ****************/
0176 
0177         template <class A>
0178         XSIMD_INLINE batch<std::complex<double>, A> load_complex_aligned(std::complex<double> const* mem, convert<std::complex<double>>, requires_arch<neon64>) noexcept
0179         {
0180             using real_batch = batch<double, A>;
0181             const double* buf = reinterpret_cast<const double*>(mem);
0182             float64x2x2_t tmp = vld2q_f64(buf);
0183             real_batch real = tmp.val[0],
0184                        imag = tmp.val[1];
0185             return batch<std::complex<double>, A> { real, imag };
0186         }
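
        // vld2q_f64 is a de-interleaving load: from memory laid out as
        // re0, im0, re1, im1 it fills val[0] with { re0, re1 } and val[1] with
        // { im0, im1 }, which matches the split real/imaginary storage of the
        // complex batch.  store_complex below re-interleaves with vst2q_f64.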
0187 
0188         template <class A>
0189         XSIMD_INLINE batch<std::complex<double>, A> load_complex_unaligned(std::complex<double> const* mem, convert<std::complex<double>> cvt, requires_arch<neon64>) noexcept
0190         {
0191             return load_complex_aligned<A>(mem, cvt, A {});
0192         }
0193 
0194         /*****************
0195          * store_complex *
0196          *****************/
0197 
0198         template <class A>
0199         XSIMD_INLINE void store_complex_aligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
0200         {
0201             float64x2x2_t tmp;
0202             tmp.val[0] = src.real();
0203             tmp.val[1] = src.imag();
0204             double* buf = reinterpret_cast<double*>(dst);
0205             vst2q_f64(buf, tmp);
0206         }
0207 
0208         template <class A>
0209         XSIMD_INLINE void store_complex_unaligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
0210         {
0211             store_complex_aligned(dst, src, A {});
0212         }
0213 
0214         /*******
0215          * neg *
0216          *******/
0217 
0218         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0219         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0220         {
0221             return vreinterpretq_u64_s64(vnegq_s64(vreinterpretq_s64_u64(rhs)));
0222         }
0223 
0224         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0225         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0226         {
0227             return vnegq_s64(rhs);
0228         }
0229 
0230         template <class A>
0231         XSIMD_INLINE batch<double, A> neg(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0232         {
0233             return vnegq_f64(rhs);
0234         }
0235 
0236         /*******
0237          * add *
0238          *******/
0239 
0240         template <class A>
0241         XSIMD_INLINE batch<double, A> add(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0242         {
0243             return vaddq_f64(lhs, rhs);
0244         }
0245 
0246         /********
0247          * sadd *
0248          ********/
0249 
0250         template <class A>
0251         XSIMD_INLINE batch<double, A> sadd(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0252         {
0253             return add(lhs, rhs, neon64 {});
0254         }
0255 
0256         /*******
0257          * sub *
0258          *******/
0259 
0260         template <class A>
0261         XSIMD_INLINE batch<double, A> sub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0262         {
0263             return vsubq_f64(lhs, rhs);
0264         }
0265 
0266         /********
0267          * ssub *
0268          ********/
0269 
0270         template <class A>
0271         XSIMD_INLINE batch<double, A> ssub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0272         {
0273             return sub(lhs, rhs, neon64 {});
0274         }
0275 
0276         /*******
0277          * mul *
0278          *******/
0279 
0280         template <class A>
0281         XSIMD_INLINE batch<double, A> mul(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0282         {
0283             return vmulq_f64(lhs, rhs);
0284         }
0285 
0286         /*******
0287          * div *
0288          *******/
0289 
0290 #if defined(XSIMD_FAST_INTEGER_DIVISION)
0291         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0292         XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0293         {
0294             return vcvtq_u64_f64(vcvtq_f64_u64(lhs) / vcvtq_f64_u64(rhs));
0295         }
0296 
0297         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0298         XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0299         {
0300             return vcvtq_s64_f64(vcvtq_f64_s64(lhs) / vcvtq_f64_s64(rhs));
0301         }
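
        // NEON has no 64-bit integer vector division, so these kernels
        // round-trip through double (convert, divide, convert back); the
        // result may differ from exact integer division once the operands
        // exceed the 53-bit mantissa of a double.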
0302 #endif
0303         template <class A>
0304         XSIMD_INLINE batch<double, A> div(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0305         {
0306             return vdivq_f64(lhs, rhs);
0307         }
0308 
0309         /******
0310          * eq *
0311          ******/
0312 
0313         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0314         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0315         {
0316             return vceqq_u64(lhs, rhs);
0317         }
0318 
0319         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0320         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0321         {
0322             return vceqq_s64(lhs, rhs);
0323         }
0324 
0325         template <class A>
0326         XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0327         {
0328             return vceqq_f64(lhs, rhs);
0329         }
0330 
0331         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0332         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
0333         {
0334             return vceqq_u64(lhs, rhs);
0335         }
0336 
0337         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0338         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
0339         {
0340             return vceqq_u64(lhs, rhs);
0341         }
0342 
0343         template <class A>
0344         XSIMD_INLINE batch_bool<double, A> eq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
0345         {
0346             return vceqq_u64(lhs, rhs);
0347         }
0348 
0349         /*************
0350          * fast_cast *
0351          *************/
0352         namespace detail
0353         {
0354             template <class A>
0355             XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
0356             {
0357                 return vcvtq_f64_s64(x);
0358             }
0359 
0360             template <class A>
0361             XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
0362             {
0363                 return vcvtq_f64_u64(x);
0364             }
0365 
0366             template <class A>
0367             XSIMD_INLINE batch<int64_t, A> fast_cast(batch<double, A> const& x, batch<int64_t, A> const&, requires_arch<neon64>) noexcept
0368             {
0369                 return vcvtq_s64_f64(x);
0370             }
0371 
0372             template <class A>
0373             XSIMD_INLINE batch<uint64_t, A> fast_cast(batch<double, A> const& x, batch<uint64_t, A> const&, requires_arch<neon64>) noexcept
0374             {
0375                 return vcvtq_u64_f64(x);
0376             }
0377 
0378         }
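
        // These kernels supply the direct int64 <-> double value conversions;
        // higher-level helpers such as xsimd::batch_cast<double>(...) can pick
        // them up when dispatching on this architecture.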
0379 
0380         /******
0381          * lt *
0382          ******/
0383 
0384         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0385         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0386         {
0387             return vcltq_u64(lhs, rhs);
0388         }
0389 
0390         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0391         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0392         {
0393             return vcltq_s64(lhs, rhs);
0394         }
0395 
0396         template <class A>
0397         XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0398         {
0399             return vcltq_f64(lhs, rhs);
0400         }
0401 
0402         /******
0403          * le *
0404          ******/
0405 
0406         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0407         XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0408         {
0409             return vcleq_u64(lhs, rhs);
0410         }
0411 
0412         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0413         XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0414         {
0415             return vcleq_s64(lhs, rhs);
0416         }
0417 
0418         template <class A>
0419         XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0420         {
0421             return vcleq_f64(lhs, rhs);
0422         }
0423 
0424         /******
0425          * gt *
0426          ******/
0427 
0428         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0429         XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0430         {
0431             return vcgtq_u64(lhs, rhs);
0432         }
0433 
0434         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0435         XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0436         {
0437             return vcgtq_s64(lhs, rhs);
0438         }
0439 
0440         template <class A>
0441         XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0442         {
0443             return vcgtq_f64(lhs, rhs);
0444         }
0445 
0446         /******
0447          * ge *
0448          ******/
0449 
0450         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0451         XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0452         {
0453             return vcgeq_u64(lhs, rhs);
0454         }
0455 
0456         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0457         XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0458         {
0459             return vcgeq_s64(lhs, rhs);
0460         }
0461 
0462         template <class A>
0463         XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0464         {
0465             return vcgeq_f64(lhs, rhs);
0466         }
0467 
0468         /*******************
0469          * batch_bool_cast *
0470          *******************/
0471 
0472         template <class A, class T_out, class T_in>
0473         XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon64>) noexcept
0474         {
0475             using register_type = typename batch_bool<T_out, A>::register_type;
0476             return register_type(self);
0477         }
0478 
0479         /***************
0480          * bitwise_and *
0481          ***************/
0482 
0483         template <class A>
0484         XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0485         {
0486             return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(lhs),
0487                                                    vreinterpretq_u64_f64(rhs)));
0488         }
0489 
0490         template <class A>
0491         XSIMD_INLINE batch_bool<double, A> bitwise_and(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
0492         {
0493             return vandq_u64(lhs, rhs);
0494         }
0495 
0496         /**************
0497          * bitwise_or *
0498          **************/
0499 
0500         template <class A>
0501         XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0502         {
0503             return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(lhs),
0504                                                    vreinterpretq_u64_f64(rhs)));
0505         }
0506 
0507         template <class A>
0508         XSIMD_INLINE batch_bool<double, A> bitwise_or(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
0509         {
0510             return vorrq_u64(lhs, rhs);
0511         }
0512 
0513         /***************
0514          * bitwise_xor *
0515          ***************/
0516 
0517         template <class A>
0518         XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0519         {
0520             return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(lhs),
0521                                                    vreinterpretq_u64_f64(rhs)));
0522         }
0523 
0524         template <class A>
0525         XSIMD_INLINE batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
0526         {
0527             return veorq_u64(lhs, rhs);
0528         }
0529 
0530         /*******
0531          * neq *
0532          *******/
0533 
0534         template <class A>
0535         XSIMD_INLINE batch_bool<double, A> neq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
0536         {
0537             return bitwise_xor(lhs, rhs, A {});
0538         }
0539 
0540         /***************
0541          * bitwise_not *
0542          ***************/
0543 
0544         template <class A>
0545         XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0546         {
0547             return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_f64(rhs)));
0548         }
0549 
0550         template <class A>
0551         XSIMD_INLINE batch_bool<double, A> bitwise_not(batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
0552         {
0553             return detail::bitwise_not_u64(rhs);
0554         }
0555 
0556         /******************
0557          * bitwise_andnot *
0558          ******************/
0559 
0560         template <class A>
0561         XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0562         {
0563             return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(lhs),
0564                                                    vreinterpretq_u64_f64(rhs)));
0565         }
0566 
0567         template <class A>
0568         XSIMD_INLINE batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
0569         {
0570             return vbicq_u64(lhs, rhs);
0571         }
0572 
0573         /*******
0574          * min *
0575          *******/
0576 
0577         template <class A>
0578         XSIMD_INLINE batch<double, A> min(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0579         {
0580             return vminq_f64(lhs, rhs);
0581         }
0582 
0583         /*******
0584          * max *
0585          *******/
0586 
0587         template <class A>
0588         XSIMD_INLINE batch<double, A> max(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0589         {
0590             return vmaxq_f64(lhs, rhs);
0591         }
0592 
0593         /*******
0594          * abs *
0595          *******/
0596 
0597         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0598         XSIMD_INLINE batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0599         {
0600             return rhs;
0601         }
0602 
0603         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0604         XSIMD_INLINE batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0605         {
0606             return vabsq_s64(rhs);
0607         }
0608 
0609         template <class A>
0610         XSIMD_INLINE batch<double, A> abs(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0611         {
0612             return vabsq_f64(rhs);
0613         }
0614 
0615         template <class A>
0616         XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
0617                                                         requires_arch<neon64>) noexcept
0618         {
0619             return vcvtnq_s32_f32(self);
0620         }
0621 
0622 #if !defined(__GNUC__)
0623         template <class A>
0624         XSIMD_INLINE batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
0625                                                         requires_arch<neon64>) noexcept
0626         {
0627             return vcvtnq_s64_f64(self);
0628         }
0629 #endif
0630 
0631         /**************
0632          * reciprocal *
0633          **************/
0634 
0635         template <class A>
0636         XSIMD_INLINE batch<double, A>
0637         reciprocal(const batch<double, A>& x,
0638                    kernel::requires_arch<neon64>) noexcept
0639         {
0640             return vrecpeq_f64(x);
0641         }
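
        // vrecpeq_f64 only yields a coarse estimate (on the order of 8 bits of
        // precision); callers needing full accuracy are expected to refine it
        // (e.g. Newton-Raphson steps with vrecpsq_f64) or to divide instead.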
0642 
0643         /*********
0644          * rsqrt *
0645          *********/
0646 
0647         template <class A>
0648         XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0649         {
0650             return vrsqrteq_f64(rhs);
0651         }
0652 
0653         /********
0654          * sqrt *
0655          ********/
0656 
0657         template <class A>
0658         XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
0659         {
0660             return vsqrtq_f64(rhs);
0661         }
0662 
0663         /********************
0664          * Fused operations *
0665          ********************/
0666 
0667 #ifdef __ARM_FEATURE_FMA
0668         template <class A>
0669         XSIMD_INLINE batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
0670         {
0671             return vfmaq_f64(z, x, y);
0672         }
0673 
0674         template <class A>
0675         XSIMD_INLINE batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
0676         {
0677             return vfmaq_f64(-z, x, y);
0678         }
0679 #endif
0680 
0681         /*********
0682          * haddp *
0683          *********/
0684 
0685         template <class A>
0686         XSIMD_INLINE batch<double, A> haddp(const batch<double, A>* row, requires_arch<neon64>) noexcept
0687         {
0688             return vpaddq_f64(row[0], row[1]);
0689         }
0690 
0691         /**********
0692          * insert *
0693          **********/
0694 
0695         template <class A, size_t I>
0696         XSIMD_INLINE batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<neon64>) noexcept
0697         {
0698             return vsetq_lane_f64(val, self, I);
0699         }
0700 
0701         /******************
0702          * reducer macros *
0703          ******************/
0704 
0705         // Wrap reducer intrinsics so we can pass them as function pointers
0706         // - OP: intrinsics name prefix, e.g., vaddvq
0707 
0708 #define WRAP_REDUCER_INT_EXCLUDING_64(OP)                     \
0709     namespace wrap                                            \
0710     {                                                         \
0711         XSIMD_INLINE uint8_t OP##_u8(uint8x16_t a) noexcept   \
0712         {                                                     \
0713             return ::OP##_u8(a);                              \
0714         }                                                     \
0715         XSIMD_INLINE int8_t OP##_s8(int8x16_t a) noexcept     \
0716         {                                                     \
0717             return ::OP##_s8(a);                              \
0718         }                                                     \
0719         XSIMD_INLINE uint16_t OP##_u16(uint16x8_t a) noexcept \
0720         {                                                     \
0721             return ::OP##_u16(a);                             \
0722         }                                                     \
0723         XSIMD_INLINE int16_t OP##_s16(int16x8_t a) noexcept   \
0724         {                                                     \
0725             return ::OP##_s16(a);                             \
0726         }                                                     \
0727         XSIMD_INLINE uint32_t OP##_u32(uint32x4_t a) noexcept \
0728         {                                                     \
0729             return ::OP##_u32(a);                             \
0730         }                                                     \
0731         XSIMD_INLINE int32_t OP##_s32(int32x4_t a) noexcept   \
0732         {                                                     \
0733             return ::OP##_s32(a);                             \
0734         }                                                     \
0735     }
0736 
0737 #define WRAP_REDUCER_INT(OP)                                  \
0738     WRAP_REDUCER_INT_EXCLUDING_64(OP)                         \
0739     namespace wrap                                            \
0740     {                                                         \
0741         XSIMD_INLINE uint64_t OP##_u64(uint64x2_t a) noexcept \
0742         {                                                     \
0743             return ::OP##_u64(a);                             \
0744         }                                                     \
0745         XSIMD_INLINE int64_t OP##_s64(int64x2_t a) noexcept   \
0746         {                                                     \
0747             return ::OP##_s64(a);                             \
0748         }                                                     \
0749     }
0750 
0751 #define WRAP_REDUCER_FLOAT(OP)                               \
0752     namespace wrap                                           \
0753     {                                                        \
0754         XSIMD_INLINE float OP##_f32(float32x4_t a) noexcept  \
0755         {                                                    \
0756             return ::OP##_f32(a);                            \
0757         }                                                    \
0758         XSIMD_INLINE double OP##_f64(float64x2_t a) noexcept \
0759         {                                                    \
0760             return ::OP##_f64(a);                            \
0761         }                                                    \
0762     }
0763 
0764         namespace detail
0765         {
0766             template <class R>
0767             struct reducer_return_type_impl;
0768 
0769             template <>
0770             struct reducer_return_type_impl<uint8x16_t>
0771             {
0772                 using type = uint8_t;
0773             };
0774 
0775             template <>
0776             struct reducer_return_type_impl<int8x16_t>
0777             {
0778                 using type = int8_t;
0779             };
0780 
0781             template <>
0782             struct reducer_return_type_impl<uint16x8_t>
0783             {
0784                 using type = uint16_t;
0785             };
0786 
0787             template <>
0788             struct reducer_return_type_impl<int16x8_t>
0789             {
0790                 using type = int16_t;
0791             };
0792 
0793             template <>
0794             struct reducer_return_type_impl<uint32x4_t>
0795             {
0796                 using type = uint32_t;
0797             };
0798 
0799             template <>
0800             struct reducer_return_type_impl<int32x4_t>
0801             {
0802                 using type = int32_t;
0803             };
0804 
0805             template <>
0806             struct reducer_return_type_impl<uint64x2_t>
0807             {
0808                 using type = uint64_t;
0809             };
0810 
0811             template <>
0812             struct reducer_return_type_impl<int64x2_t>
0813             {
0814                 using type = int64_t;
0815             };
0816 
0817             template <>
0818             struct reducer_return_type_impl<float32x4_t>
0819             {
0820                 using type = float;
0821             };
0822 
0823             template <>
0824             struct reducer_return_type_impl<float64x2_t>
0825             {
0826                 using type = double;
0827             };
0828 
0829             template <class R>
0830             using reducer_return_type = typename reducer_return_type_impl<R>::type;
0831 
0832             template <class... T>
0833             struct neon_reducer_dispatcher_impl : neon_dispatcher_base<reducer_return_type, T...>
0834             {
0835             };
0836 
0837             using neon_reducer_dispatcher = neon_reducer_dispatcher_impl<uint8x16_t, int8x16_t,
0838                                                                          uint16x8_t, int16x8_t,
0839                                                                          uint32x4_t, int32x4_t,
0840                                                                          uint64x2_t, int64x2_t,
0841                                                                          float32x4_t, float64x2_t>;
0842             template <class T>
0843             using enable_neon64_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value,
0844                                                                  int>::type;
0845         }
0846 
0847         /**************
0848          * reduce_add *
0849          **************/
0850 
0851         WRAP_REDUCER_INT(vaddvq)
0852         WRAP_REDUCER_FLOAT(vaddvq)
0853 
0854         template <class A, class T, detail::enable_neon64_type_t<T> = 0>
0855         XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon64>) noexcept
0856         {
0857             using register_type = typename batch<T, A>::register_type;
0858             const detail::neon_reducer_dispatcher::unary dispatcher = {
0859                 std::make_tuple(wrap::vaddvq_u8, wrap::vaddvq_s8, wrap::vaddvq_u16, wrap::vaddvq_s16,
0860                                 wrap::vaddvq_u32, wrap::vaddvq_s32, wrap::vaddvq_u64, wrap::vaddvq_s64,
0861                                 wrap::vaddvq_f32, wrap::vaddvq_f64)
0862             };
0863             return dispatcher.apply(register_type(arg));
0864         }
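
        // The dispatcher picks the wrapper whose register type matches the
        // batch being reduced, so one template covers every lane type.
        // Illustrative use through the public API, assuming a 4-lane float
        // batch on neon64:
        //
        //     xsimd::batch<float> v { 1.f, 2.f, 3.f, 4.f };
        //     float s = xsimd::reduce_add(v);   // 10.f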
0865 
0866         /**************
0867          * reduce_max *
0868          **************/
0869 
0870         WRAP_REDUCER_INT_EXCLUDING_64(vmaxvq)
0871         WRAP_REDUCER_FLOAT(vmaxvq)
0872 
0873         namespace wrap
0874         {
0875             XSIMD_INLINE uint64_t vmaxvq_u64(uint64x2_t a) noexcept
0876             {
0877                 return std::max(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
0878             }
0879 
0880             XSIMD_INLINE int64_t vmaxvq_s64(int64x2_t a) noexcept
0881             {
0882                 return std::max(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
0883             }
0884         }
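
        // AArch64 has no across-vector max/min reduction for 64-bit lanes, so
        // the 64-bit wrappers extract both lanes and fall back to std::max /
        // std::min on scalars.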
0885 
0886         template <class A, class T, detail::enable_neon64_type_t<T> = 0>
0887         XSIMD_INLINE typename batch<T, A>::value_type reduce_max(batch<T, A> const& arg, requires_arch<neon64>) noexcept
0888         {
0889             using register_type = typename batch<T, A>::register_type;
0890             const detail::neon_reducer_dispatcher::unary dispatcher = {
0891                 std::make_tuple(wrap::vmaxvq_u8, wrap::vmaxvq_s8, wrap::vmaxvq_u16, wrap::vmaxvq_s16,
0892                                 wrap::vmaxvq_u32, wrap::vmaxvq_s32, wrap::vmaxvq_u64, wrap::vmaxvq_s64,
0893                                 wrap::vmaxvq_f32, wrap::vmaxvq_f64)
0894             };
0895             return dispatcher.apply(register_type(arg));
0896         }
0897 
0898         /**************
0899          * reduce_min *
0900          **************/
0901 
0902         WRAP_REDUCER_INT_EXCLUDING_64(vminvq)
0903         WRAP_REDUCER_FLOAT(vminvq)
0904 
0905         namespace wrap
0906         {
0907             XSIMD_INLINE uint64_t vminvq_u64(uint64x2_t a) noexcept
0908             {
0909                 return std::min(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
0910             }
0911 
0912             XSIMD_INLINE int64_t vminvq_s64(int64x2_t a) noexcept
0913             {
0914                 return std::min(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
0915             }
0916         }
0917 
0918         template <class A, class T, detail::enable_neon64_type_t<T> = 0>
0919         XSIMD_INLINE typename batch<T, A>::value_type reduce_min(batch<T, A> const& arg, requires_arch<neon64>) noexcept
0920         {
0921             using register_type = typename batch<T, A>::register_type;
0922             const detail::neon_reducer_dispatcher::unary dispatcher = {
0923                 std::make_tuple(wrap::vminvq_u8, wrap::vminvq_s8, wrap::vminvq_u16, wrap::vminvq_s16,
0924                                 wrap::vminvq_u32, wrap::vminvq_s32, wrap::vminvq_u64, wrap::vminvq_s64,
0925                                 wrap::vminvq_f32, wrap::vminvq_f64)
0926             };
0927             return dispatcher.apply(register_type(arg));
0928         }
0929 
0930 #undef WRAP_REDUCER_INT_EXCLUDING_64
0931 #undef WRAP_REDUCER_INT
0932 #undef WRAP_REDUCER_FLOAT
0933 
0934         /**********
0935          * select *
0936          **********/
0937 
0938         template <class A>
0939         XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& a, batch<double, A> const& b, requires_arch<neon64>) noexcept
0940         {
0941             return vbslq_f64(cond, a, b);
0942         }
0943 
0944         template <class A, bool... b>
0945         XSIMD_INLINE batch<double, A> select(batch_bool_constant<double, A, b...> const&,
0946                                              batch<double, A> const& true_br,
0947                                              batch<double, A> const& false_br,
0948                                              requires_arch<neon64>) noexcept
0949         {
0950             return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {});
0951         }
0952 
0953         template <class A>
0954         XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<neon64>) noexcept
0955         {
0956             assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
0957             (void)matrix_end;
0958             auto r0 = matrix_begin[0], r1 = matrix_begin[1];
0959             matrix_begin[0] = vzip1q_f64(r0, r1);
0960             matrix_begin[1] = vzip2q_f64(r0, r1);
0961         }
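
        // A 2x2 transpose is just an interleave: with rows r0 = { a0, a1 } and
        // r1 = { b0, b1 }, vzip1q pairs the low lanes into { a0, b0 } and
        // vzip2q pairs the high lanes into { a1, b1 }.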
0962 
0963         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0964         XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon64>) noexcept
0965         {
0966             assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
0967             (void)matrix_end;
0968             auto r0 = matrix_begin[0], r1 = matrix_begin[1];
0969             matrix_begin[0] = vzip1q_u64(r0, r1);
0970             matrix_begin[1] = vzip2q_u64(r0, r1);
0971         }
0972 
0973         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0974         XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon64>) noexcept
0975         {
0976             assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
0977             (void)matrix_end;
0978             auto r0 = matrix_begin[0], r1 = matrix_begin[1];
0979             matrix_begin[0] = vzip1q_s64(r0, r1);
0980             matrix_begin[1] = vzip2q_s64(r0, r1);
0981         }
0982 
0983         /**********
0984          * zip_lo *
0985          **********/
0986         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
0987         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0988         {
0989             return vzip1q_u8(lhs, rhs);
0990         }
0991 
0992         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
0993         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
0994         {
0995             return vzip1q_s8(lhs, rhs);
0996         }
0997 
0998         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
0999         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1000         {
1001             return vzip1q_u16(lhs, rhs);
1002         }
1003 
1004         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1005         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1006         {
1007             return vzip1q_s16(lhs, rhs);
1008         }
1009 
1010         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1011         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1012         {
1013             return vzip1q_u32(lhs, rhs);
1014         }
1015 
1016         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1017         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1018         {
1019             return vzip1q_s32(lhs, rhs);
1020         }
1021 
1022         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1023         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1024         {
1025             return vzip1q_u64(lhs, rhs);
1026         }
1027 
1028         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1029         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1030         {
1031             return vzip1q_s64(lhs, rhs);
1032         }
1033 
1034         template <class A>
1035         XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon64>) noexcept
1036         {
1037             return vzip1q_f32(lhs, rhs);
1038         }
1039 
1040         template <class A>
1041         XSIMD_INLINE batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
1042         {
1043             return vzip1q_f64(lhs, rhs);
1044         }
1045 
1046         /**********
1047          * zip_hi *
1048          **********/
1049 
1050         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1051         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1052         {
1053             return vzip2q_u8(lhs, rhs);
1054         }
1055 
1056         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1057         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1058         {
1059             return vzip2q_s8(lhs, rhs);
1060         }
1061 
1062         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1063         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1064         {
1065             return vzip2q_u16(lhs, rhs);
1066         }
1067 
1068         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1069         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1070         {
1071             return vzip2q_s16(lhs, rhs);
1072         }
1073 
1074         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1075         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1076         {
1077             return vzip2q_u32(lhs, rhs);
1078         }
1079 
1080         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1081         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1082         {
1083             return vzip2q_s32(lhs, rhs);
1084         }
1085 
1086         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1087         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1088         {
1089             return vzip2q_u64(lhs, rhs);
1090         }
1091 
1092         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1093         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1094         {
1095             return vzip2q_s64(lhs, rhs);
1096         }
1097 
1098         template <class A>
1099         XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon64>) noexcept
1100         {
1101             return vzip2q_f32(lhs, rhs);
1102         }
1103 
1104         template <class A>
1105         XSIMD_INLINE batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
1106         {
1107             return vzip2q_f64(lhs, rhs);
1108         }
1109 
1110         /****************
1111          * extract_pair *
1112          ****************/
1113 
1114         namespace detail
1115         {
1116             template <class A, size_t I, size_t... Is>
1117             XSIMD_INLINE batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n,
1118                                                        ::xsimd::detail::index_sequence<I, Is...>) noexcept
1119             {
1120                 if (n == I)
1121                 {
1122                     return vextq_f64(rhs, lhs, I);
1123                 }
1124                 else
1125                 {
1126                     return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1127                 }
1128             }
1129         }
1130 
1131         template <class A>
1132         XSIMD_INLINE batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n, requires_arch<neon64>) noexcept
1133         {
1134             constexpr std::size_t size = batch<double, A>::size;
1135             assert(n < size && "index in bounds");
1136             return detail::extract_pair(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>());
1137         }
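
        // vextq_f64 requires its lane offset as a compile-time immediate, so
        // the runtime index n is converted into one by recursing over an
        // index_sequence until the matching constant I is reached.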
1138 
1139         /******************
1140          * bitwise_rshift *
1141          ******************/
1142 
1143         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1144         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
1145         {
1146             return bitwise_rshift<A>(lhs, n, neon {});
1147         }
1148 
1149         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1150         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon64>) noexcept
1151         {
1152             return vshlq_u64(lhs, vnegq_s64(rhs));
1153         }
1154 
1155         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1156         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
1157         {
1158             return bitwise_rshift<A>(lhs, n, neon {});
1159         }
1160 
1161         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1162         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1163         {
1164             return vshlq_s64(lhs, vnegq_s64(rhs));
1165         }
1166 
1167         /****************
1168          * bitwise_cast *
1169          ****************/
1170 
1171 #define WRAP_CAST(SUFFIX, TYPE)                                                \
1172     namespace wrap                                                             \
1173     {                                                                          \
1174         XSIMD_INLINE float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) noexcept   \
1175         {                                                                      \
1176             return ::vreinterpretq_f64_##SUFFIX(a);                            \
1177         }                                                                      \
1178         XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) noexcept \
1179         {                                                                      \
1180             return ::vreinterpretq_##SUFFIX##_f64(a);                          \
1181         }                                                                      \
1182     }
1183 
1184         WRAP_CAST(u8, uint8x16_t)
1185         WRAP_CAST(s8, int8x16_t)
1186         WRAP_CAST(u16, uint16x8_t)
1187         WRAP_CAST(s16, int16x8_t)
1188         WRAP_CAST(u32, uint32x4_t)
1189         WRAP_CAST(s32, int32x4_t)
1190         WRAP_CAST(u64, uint64x2_t)
1191         WRAP_CAST(s64, int64x2_t)
1192         WRAP_CAST(f32, float32x4_t)
1193 
1194 #undef WRAP_CAST
1195 
1196         template <class A, class T>
1197         XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
1198         {
1199             using caster_type = detail::bitwise_caster_impl<float64x2_t,
1200                                                             uint8x16_t, int8x16_t,
1201                                                             uint16x8_t, int16x8_t,
1202                                                             uint32x4_t, int32x4_t,
1203                                                             uint64x2_t, int64x2_t,
1204                                                             float32x4_t>;
1205             const caster_type caster = {
1206                 std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16,
1207                                 wrap::vreinterpretq_f64_u32, wrap::vreinterpretq_f64_s32, wrap::vreinterpretq_f64_u64, wrap::vreinterpretq_f64_s64,
1208                                 wrap::vreinterpretq_f64_f32)
1209             };
1210             using register_type = typename batch<T, A>::register_type;
1211             return caster.apply(register_type(arg));
1212         }
1213 
1214         namespace detail
1215         {
1216             template <class S, class... R>
1217             struct bitwise_caster_neon64
1218             {
1219                 using container_type = std::tuple<R (*)(S)...>;
1220                 container_type m_func;
1221 
1222                 template <class V>
1223                 V apply(float64x2_t rhs) const
1224                 {
1225                     using func_type = V (*)(float64x2_t);
1226                     auto func = xsimd::detail::get<func_type>(m_func);
1227                     return func(rhs);
1228                 }
1229             };
1230         }
1231 
1232         template <class A, class R>
1233         XSIMD_INLINE batch<R, A> bitwise_cast(batch<double, A> const& arg, batch<R, A> const&, requires_arch<neon64>) noexcept
1234         {
1235             using caster_type = detail::bitwise_caster_neon64<float64x2_t,
1236                                                               uint8x16_t, int8x16_t,
1237                                                               uint16x8_t, int16x8_t,
1238                                                               uint32x4_t, int32x4_t,
1239                                                               uint64x2_t, int64x2_t,
1240                                                               float32x4_t>;
1241             const caster_type caster = {
1242                 std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64,
1243                                 wrap::vreinterpretq_u32_f64, wrap::vreinterpretq_s32_f64, wrap::vreinterpretq_u64_f64, wrap::vreinterpretq_s64_f64,
1244                                 wrap::vreinterpretq_f32_f64)
1245             };
1246             using src_register_type = typename batch<double, A>::register_type;
1247             using dst_register_type = typename batch<R, A>::register_type;
1248             return caster.apply<dst_register_type>(src_register_type(arg));
1249         }
1250 
1251         template <class A>
1252         XSIMD_INLINE batch<double, A> bitwise_cast(batch<double, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
1253         {
1254             return arg;
1255         }
1256 
1257         /*********
1258          * isnan *
1259          *********/
1260 
1261         template <class A>
1262         XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& arg, requires_arch<neon64>) noexcept
1263         {
1264             return !(arg == arg);
1265         }
1266 
1267         /****************
1268          * rotate_left *
1269          ****************/
1270         template <size_t N, class A>
1271         XSIMD_INLINE batch<double, A> rotate_left(batch<double, A> const& a, requires_arch<neon64>) noexcept
1272         {
1273             return vextq_f64(a, a, N);
1274         }
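
        // vextq_f64(a, a, N) reads two consecutive lanes starting at lane N of
        // the concatenation a:a, i.e. result lane j is a[(N + j) % 2].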
1275     }
1276 
1277     template <typename T, class A, T... Values>
1278     struct batch_constant;
1279 
1280     namespace kernel
1281     {
1282         /*********************
1283          * swizzle (dynamic) *
1284          *********************/
1285         template <class A>
1286         XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> idx,
1287                                                requires_arch<neon64>) noexcept
1288         {
1289             return vqtbl1q_u8(self, idx);
1290         }
1291 
1292         template <class A>
1293         XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch<uint8_t, A> idx,
1294                                               requires_arch<neon64>) noexcept
1295         {
1296             return vqtbl1q_s8(self, idx);
1297         }
1298 
1299         template <class A>
1300         XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
1301                                                 batch<uint16_t, A> idx,
1302                                                 requires_arch<neon64>) noexcept
1303         {
1304             using batch_type = batch<uint8_t, A>;
1305             using index_type = batch<uint8_t, A>;
1306             return vreinterpretq_u16_u8(swizzle(batch_type(vreinterpretq_u8_u16(self)),
1307                                                 index_type(vreinterpretq_u8_u16(idx * 0x0202 + 0x0100)),
1308                                                 neon64 {}));
1309         }
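
        // NEON only provides a byte-granular table lookup (vqtbl1q_u8), so the
        // wider swizzles rewrite lane indices as byte indices: for 16-bit
        // lanes, index i becomes the byte pair { 2*i, 2*i + 1 } via
        // i * 0x0202 + 0x0100 (low byte 2*i, high byte 2*i + 1).  The 32- and
        // 64-bit overloads below apply the same trick with wider constants.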
1310 
1311         template <class A>
1312         XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
1313                                                batch<uint16_t, A> idx,
1314                                                requires_arch<neon64>) noexcept
1315         {
1316             return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), idx, neon64 {}));
1317         }
1318 
1319         template <class A>
1320         XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
1321                                                 batch<uint32_t, A> idx,
1322                                                 requires_arch<neon64>) noexcept
1323         {
1324             using batch_type = batch<uint8_t, A>;
1325             using index_type = batch<uint8_t, A>;
1326             return vreinterpretq_u32_u8(swizzle(batch_type(vreinterpretq_u8_u32(self)),
1327                                                 index_type(vreinterpretq_u8_u32(idx * 0x04040404 + 0x03020100)),
1328                                                 neon64 {}));
1329         }
1330 
1331         template <class A>
1332         XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
1333                                                batch<uint32_t, A> idx,
1334                                                requires_arch<neon64>) noexcept
1335         {
1336             return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), idx, neon64 {}));
1337         }
1338 
1339         template <class A>
1340         XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
1341                                                 batch<uint64_t, A> idx,
1342                                                 requires_arch<neon64>) noexcept
1343         {
1344             using batch_type = batch<uint8_t, A>;
1345             using index_type = batch<uint8_t, A>;
1346             return vreinterpretq_u64_u8(swizzle(batch_type(vreinterpretq_u8_u64(self)),
1347                                                 index_type(vreinterpretq_u8_u64(idx * 0x0808080808080808ull + 0x0706050403020100ull)),
1348                                                 neon64 {}));
1349         }
1350 
1351         template <class A>
1352         XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
1353                                                batch<uint64_t, A> idx,
1354                                                requires_arch<neon64>) noexcept
1355         {
1356             return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), idx, neon64 {}));
1357         }
1358 
1359         template <class A>
1360         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
1361                                              batch<uint32_t, A> idx,
1362                                              requires_arch<neon64>) noexcept
1363         {
1364             return bitwise_cast<float>(swizzle(bitwise_cast<uint32_t>(self), idx, neon64 {}));
1365         }
1366 
1367         template <class A>
1368         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self,
1369                                               batch<uint64_t, A> idx,
1370                                               requires_arch<neon64>) noexcept
1371         {
1372             return bitwise_cast<double>(swizzle(bitwise_cast<uint64_t>(self), idx, neon64 {}));
1373         }
1374 
1375         /********************
1376          * swizzle (static) *
1377          ********************/
1378 
1379         namespace detail
1380         {
1381             using ::xsimd::batch_constant;
1382             using ::xsimd::detail::integer_sequence;
1383             using ::xsimd::detail::make_integer_sequence;
1384 
1385             template <class CB1, class CB2, class IS>
1386             struct index_burst_impl;
1387 
1388             template <typename T1, class A, typename T2, T2... V,
1389                       T2... incr>
1390             struct index_burst_impl<batch_constant<T1, A>, batch_constant<T2, A, V...>,
1391                                     integer_sequence<T2, incr...>>
1392             {
1393                 using type = batch_constant<T2, A, V...>;
1394             };
1395 
1396             template <typename T1, class A, T1 V0, T1... V1,
1397                       typename T2, T2... V2, T2... incr>
1398             struct index_burst_impl<batch_constant<T1, A, V0, V1...>, batch_constant<T2, A, V2...>,
1399                                     integer_sequence<T2, incr...>>
1400             {
1401                 using next_input = batch_constant<T1, A, V1...>;
1402                 using next_output = batch_constant<T2, A, V2..., (V0 + incr)...>;
1403                 using type = typename index_burst_impl<next_input, next_output, integer_sequence<T2, incr...>>::type;
1404             };
1405 
1406             template <class B, class T>
1407             struct index_burst;
1408 
1409             template <typename Tp, class A, Tp... V, typename T>
1410             struct index_burst<batch_constant<Tp, A, V...>, T>
1411             {
1412                 static constexpr size_t mul = sizeof(Tp) / sizeof(T);
1413                 using input = batch_constant<Tp, A, (mul * V)...>;
1414                 using output = batch_constant<T, A>;
1415                 using type = typename index_burst_impl<input, output, make_integer_sequence<T, mul>>::type;
1416             };
1417 
1418             template <class B, typename T>
1419             using index_burst_t = typename index_burst<B, T>::type;
1420 
1421             template <typename T, class B>
1422             XSIMD_INLINE index_burst_t<B, T> burst_index(B)
1423             {
1424                 return index_burst_t<B, T>();
1425             }
1426         }
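
             // index_burst expands a batch_constant of wide lane indices into the
             // equivalent batch_constant of 16 byte indices at compile time, so the
             // static swizzles below can feed vqtbl1q_u8 directly. Illustrative
             // expansion (not code from this header):
             //   batch_constant<uint32_t, A, 3, 2, 1, 0>
             //     -> batch_constant<uint8_t, A, 12, 13, 14, 15,  8,  9, 10, 11,
             //                                    4,  5,  6,  7,  0,  1,  2,  3>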
1427 
1428         template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
1429                   uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
1430         XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self,
1431                                                batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
1432                                                requires_arch<neon64>) noexcept
1433         {
1434             return vqtbl1q_u8(self, batch<uint8_t, A>(idx));
1435         }
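
             // Hypothetical usage sketch, assuming the public xsimd::swizzle entry
             // point dispatches to these kernels (the alias name below is illustrative):
             //   using reverse_t = xsimd::batch_constant<uint8_t, xsimd::neon64,
             //       15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>;
             //   auto r = xsimd::swizzle(v, reverse_t {}); // lane-reverse a uint8_t batch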
1436 
1437         template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
1438                   uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
1439         XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self,
1440                                               batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
1441                                               requires_arch<neon64>) noexcept
1442         {
1443             return vqtbl1q_s8(self, batch<uint8_t, A>(idx));
1444         }
1445 
1446         template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1447         XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
1448                                                 batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> idx,
1449                                                 requires_arch<neon64>) noexcept
1450         {
1451             using batch_type = batch<uint8_t, A>;
1452             return vreinterpretq_u16_u8(swizzle<A>(batch_type(vreinterpretq_u8_u16(self)), detail::burst_index<uint8_t>(idx), A()));
1453         }
1454 
1455         template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1456         XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
1457                                                batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> idx,
1458                                                requires_arch<neon64>) noexcept
1459         {
1460             using batch_type = batch<int8_t, A>;
1461             return vreinterpretq_s16_s8(swizzle<A>(batch_type(vreinterpretq_s8_s16(self)), detail::burst_index<uint8_t>(idx), A()));
1462         }
1463 
1464         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1465         XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
1466                                                 batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
1467                                                 requires_arch<neon64>) noexcept
1468         {
1469             using batch_type = batch<uint8_t, A>;
1470             return vreinterpretq_u32_u8(swizzle<A>(batch_type(vreinterpretq_u8_u32(self)), detail::burst_index<uint8_t>(idx), A()));
1471         }
1472 
1473         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1474         XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
1475                                                batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
1476                                                requires_arch<neon64>) noexcept
1477         {
1478             using batch_type = batch<int8_t, A>;
1479             return vreinterpretq_s32_s8(swizzle<A>(batch_type(vreinterpretq_s8_s32(self)), detail::burst_index<uint8_t>(idx), A()));
1480         }
1481 
1482         template <class A, uint64_t V0, uint64_t V1>
1483         XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
1484                                                 batch_constant<uint64_t, A, V0, V1> idx,
1485                                                 requires_arch<neon64>) noexcept
1486         {
1487             using batch_type = batch<uint8_t, A>;
1488             return vreinterpretq_u64_u8(swizzle<A>(batch_type(vreinterpretq_u8_u64(self)), detail::burst_index<uint8_t>(idx), A()));
1489         }
1490 
1491         template <class A, uint64_t V0, uint64_t V1>
1492         XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
1493                                                batch_constant<uint64_t, A, V0, V1> idx,
1494                                                requires_arch<neon64>) noexcept
1495         {
1496             using batch_type = batch<int8_t, A>;
1497             return vreinterpretq_s64_s8(swizzle<A>(batch_type(vreinterpretq_s8_s64(self)), detail::burst_index<uint8_t>(idx), A()));
1498         }
1499 
1500         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1501         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self,
1502                                              batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
1503                                              requires_arch<neon64>) noexcept
1504         {
1505             using batch_type = batch<uint8_t, A>;
1506             return vreinterpretq_f32_u8(swizzle<A>(batch_type(vreinterpretq_u8_f32(self)), detail::burst_index<uint8_t>(idx), A()));
1507         }
1508 
1509         template <class A, uint64_t V0, uint64_t V1>
1510         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self,
1511                                               batch_constant<uint64_t, A, V0, V1> idx,
1512                                               requires_arch<neon64>) noexcept
1513         {
1514             using batch_type = batch<uint8_t, A>;
1515             return vreinterpretq_f64_u8(swizzle<A>(batch_type(vreinterpretq_u8_f64(self)), detail::burst_index<uint8_t>(idx), A()));
1516         }
1517 
1518         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1519         XSIMD_INLINE batch<std::complex<float>, A> swizzle(batch<std::complex<float>, A> const& self,
1520                                                            batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
1521                                                            requires_arch<neon64>) noexcept
1522         {
1523             return batch<std::complex<float>, A>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
1524         }
1525 
1526         template <class A, uint64_t V0, uint64_t V1>
1527         XSIMD_INLINE batch<std::complex<double>, A> swizzle(batch<std::complex<double>, A> const& self,
1528                                                             batch_constant<uint64_t, A, V0, V1> idx,
1529                                                             requires_arch<neon64>) noexcept
1530         {
1531             return batch<std::complex<double>, A>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
1532         }
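
             // The complex swizzles permute the real and imaginary planes with the same
             // index pattern; since batch<std::complex<T>> stores separate real and
             // imaginary registers, this keeps each (real, imag) pair together.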
1533     }
1534 }
1535 
1536 #endif