0001 /***************************************************************************
0002  * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
0003  * Martin Renou                                                             *
0004  * Copyright (c) QuantStack                                                 *
0005  * Copyright (c) Serge Guelton                                              *
0006  *                                                                          *
0007  * Distributed under the terms of the BSD 3-Clause License.                 *
0008  *                                                                          *
0009  * The full license is in the file LICENSE, distributed with this software. *
0010  ****************************************************************************/
0011 
0012 #ifndef XSIMD_NEON_HPP
0013 #define XSIMD_NEON_HPP
0014 
0015 #include <algorithm>
0016 #include <complex>
0017 #include <tuple>
0018 #include <type_traits>
0019 
0020 #include "../types/xsimd_neon_register.hpp"
0021 #include "../types/xsimd_utils.hpp"
0022 
0023 // Wrap intrinsics so we can pass them as function pointers (see the sketch below).
0024 // - OP: intrinsic name prefix, e.g. vorrq
0025 // - RT: type trait used to deduce the intrinsic's return type
0026 #define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT)                                     \
0027     namespace wrap                                                                \
0028     {                                                                             \
0029         XSIMD_INLINE RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept  \
0030         {                                                                         \
0031             return ::OP##_u8(a, b);                                               \
0032         }                                                                         \
0033         XSIMD_INLINE RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \
0034         {                                                                         \
0035             return ::OP##_u16(a, b);                                              \
0036         }                                                                         \
0037         XSIMD_INLINE RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \
0038         {                                                                         \
0039             return ::OP##_u32(a, b);                                              \
0040         }                                                                         \
0041     }
0042 
0043 #define WRAP_BINARY_INT_EXCLUDING_64(OP, RT)                                   \
0044     WRAP_BINARY_UINT_EXCLUDING_64(OP, RT)                                      \
0045     namespace wrap                                                             \
0046     {                                                                          \
0047         XSIMD_INLINE RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept  \
0048         {                                                                      \
0049             return ::OP##_s8(a, b);                                            \
0050         }                                                                      \
0051         XSIMD_INLINE RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \
0052         {                                                                      \
0053             return ::OP##_s16(a, b);                                           \
0054         }                                                                      \
0055         XSIMD_INLINE RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \
0056         {                                                                      \
0057             return ::OP##_s32(a, b);                                           \
0058         }                                                                      \
0059     }
0060 
0061 #define WRAP_BINARY_INT(OP, RT)                                                   \
0062     WRAP_BINARY_INT_EXCLUDING_64(OP, RT)                                          \
0063     namespace wrap                                                                \
0064     {                                                                             \
0065         XSIMD_INLINE RT<uint64x2_t> OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \
0066         {                                                                         \
0067             return ::OP##_u64(a, b);                                              \
0068         }                                                                         \
0069         XSIMD_INLINE RT<int64x2_t> OP##_s64(int64x2_t a, int64x2_t b) noexcept    \
0070         {                                                                         \
0071             return ::OP##_s64(a, b);                                              \
0072         }                                                                         \
0073     }
0074 
0075 #define WRAP_BINARY_FLOAT(OP, RT)                                                    \
0076     namespace wrap                                                                   \
0077     {                                                                                \
0078         XSIMD_INLINE RT<float32x4_t> OP##_f32(float32x4_t a, float32x4_t b) noexcept \
0079         {                                                                            \
0080             return ::OP##_f32(a, b);                                                 \
0081         }                                                                            \
0082     }
0083 
0084 #define WRAP_UNARY_INT_EXCLUDING_64(OP)                         \
0085     namespace wrap                                              \
0086     {                                                           \
0087         XSIMD_INLINE uint8x16_t OP##_u8(uint8x16_t a) noexcept  \
0088         {                                                       \
0089             return ::OP##_u8(a);                                \
0090         }                                                       \
0091         XSIMD_INLINE int8x16_t OP##_s8(int8x16_t a) noexcept    \
0092         {                                                       \
0093             return ::OP##_s8(a);                                \
0094         }                                                       \
0095         XSIMD_INLINE uint16x8_t OP##_u16(uint16x8_t a) noexcept \
0096         {                                                       \
0097             return ::OP##_u16(a);                               \
0098         }                                                       \
0099         XSIMD_INLINE int16x8_t OP##_s16(int16x8_t a) noexcept   \
0100         {                                                       \
0101             return ::OP##_s16(a);                               \
0102         }                                                       \
0103         XSIMD_INLINE uint32x4_t OP##_u32(uint32x4_t a) noexcept \
0104         {                                                       \
0105             return ::OP##_u32(a);                               \
0106         }                                                       \
0107         XSIMD_INLINE int32x4_t OP##_s32(int32x4_t a) noexcept   \
0108         {                                                       \
0109             return ::OP##_s32(a);                               \
0110         }                                                       \
0111     }
0112 
0113 #define WRAP_UNARY_INT(OP)                                      \
0114     WRAP_UNARY_INT_EXCLUDING_64(OP)                             \
0115     namespace wrap                                              \
0116     {                                                           \
0117         XSIMD_INLINE uint64x2_t OP##_u64(uint64x2_t a) noexcept \
0118         {                                                       \
0119             return ::OP##_u64(a);                               \
0120         }                                                       \
0121         XSIMD_INLINE int64x2_t OP##_s64(int64x2_t a) noexcept   \
0122         {                                                       \
0123             return ::OP##_s64(a);                               \
0124         }                                                       \
0125     }
0126 
0127 #define WRAP_UNARY_FLOAT(OP)                                      \
0128     namespace wrap                                                \
0129     {                                                             \
0130         XSIMD_INLINE float32x4_t OP##_f32(float32x4_t a) noexcept \
0131         {                                                         \
0132             return ::OP##_f32(a);                                 \
0133         }                                                         \
0134     }
0135 
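// For instance (a sketch of the generated code), WRAP_BINARY_INT(vaddq, detail::identity_return_type)
// defines, among others,
//   XSIMD_INLINE uint8x16_t wrap::vaddq_u8(uint8x16_t a, uint8x16_t b) noexcept { return ::vaddq_u8(a, b); }
// so each intrinsic, which may be a macro or compiler builtin without an addressable definition,
// gets a plain inline function whose address can be stored in a tuple of function pointers.
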
0136 // Dummy identity casters: they let generic code call vreinterpretq_X_X uniformly, even when source and target types coincide
0137 XSIMD_INLINE uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; }
0138 XSIMD_INLINE int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; }
0139 XSIMD_INLINE uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; }
0140 XSIMD_INLINE int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; }
0141 XSIMD_INLINE uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; }
0142 XSIMD_INLINE int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; }
0143 XSIMD_INLINE uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; }
0144 XSIMD_INLINE int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; }
0145 XSIMD_INLINE float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; }
0146 
0147 namespace xsimd
0148 {
0149     template <typename T, class A, bool... Values>
0150     struct batch_bool_constant;
0151 
0152     namespace kernel
0153     {
0154         using namespace types;
0155 
0156         namespace detail
0157         {
0158             template <template <class> class return_type, class... T>
0159             struct neon_dispatcher_base
0160             {
0161                 struct unary
0162                 {
0163                     using container_type = std::tuple<return_type<T> (*)(T)...>;
0164                     const container_type m_func;
0165 
0166                     template <class U>
0167                     return_type<U> apply(U rhs) const noexcept
0168                     {
0169                         using func_type = return_type<U> (*)(U);
0170                         auto func = xsimd::detail::get<func_type>(m_func);
0171                         return func(rhs);
0172                     }
0173                 };
0174 
0175                 struct binary
0176                 {
0177                     using container_type = std::tuple<return_type<T> (*)(T, T)...>;
0178                     const container_type m_func;
0179 
0180                     template <class U>
0181                     return_type<U> apply(U lhs, U rhs) const noexcept
0182                     {
0183                         using func_type = return_type<U> (*)(U, U);
0184                         auto func = xsimd::detail::get<func_type>(m_func);
0185                         return func(lhs, rhs);
0186                     }
0187                 };
0188             };
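
            // Usage sketch: a dispatcher instance is brace-initialized with one wrapper per
            // register type, and apply() picks the matching tuple element by argument type, e.g.
            //   const neon_dispatcher::binary dispatcher = {
            //       std::make_tuple(wrap::vaddq_u8, /* ... */ wrap::vaddq_f32)
            //   };
            //   dispatcher.apply(a, b); // calls wrap::vaddq_u8 when a and b are uint8x16_t
            // This is the pattern used by add() and the other operators further below.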
0189 
0190             /***************************
0191              *  arithmetic dispatchers *
0192              ***************************/
0193 
0194             template <class T>
0195             using identity_return_type = T;
0196 
0197             template <class... T>
0198             struct neon_dispatcher_impl : neon_dispatcher_base<identity_return_type, T...>
0199             {
0200             };
0201 
0202             using neon_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
0203                                                          uint16x8_t, int16x8_t,
0204                                                          uint32x4_t, int32x4_t,
0205                                                          uint64x2_t, int64x2_t,
0206                                                          float32x4_t>;
0207 
0208             using excluding_int64_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
0209                                                                     uint16x8_t, int16x8_t,
0210                                                                     uint32x4_t, int32x4_t,
0211                                                                     float32x4_t>;
0212 
0213             using excluding_int64f32_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
0214                                                                        uint16x8_t, int16x8_t,
0215                                                                        uint32x4_t, int32x4_t>;
0216 
0217             /**************************
0218              * comparison dispatchers *
0219              **************************/
0220 
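            // NEON comparison intrinsics return unsigned masks of the same lane width as their
            // operands (e.g. vceqq_f32 yields uint32x4_t); the specializations below encode
            // that mapping so the comparison dispatchers can deduce their return types.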
0221             template <class T>
0222             struct comp_return_type_impl;
0223 
0224             template <>
0225             struct comp_return_type_impl<uint8x16_t>
0226             {
0227                 using type = uint8x16_t;
0228             };
0229 
0230             template <>
0231             struct comp_return_type_impl<int8x16_t>
0232             {
0233                 using type = uint8x16_t;
0234             };
0235 
0236             template <>
0237             struct comp_return_type_impl<uint16x8_t>
0238             {
0239                 using type = uint16x8_t;
0240             };
0241 
0242             template <>
0243             struct comp_return_type_impl<int16x8_t>
0244             {
0245                 using type = uint16x8_t;
0246             };
0247 
0248             template <>
0249             struct comp_return_type_impl<uint32x4_t>
0250             {
0251                 using type = uint32x4_t;
0252             };
0253 
0254             template <>
0255             struct comp_return_type_impl<int32x4_t>
0256             {
0257                 using type = uint32x4_t;
0258             };
0259 
0260             template <>
0261             struct comp_return_type_impl<uint64x2_t>
0262             {
0263                 using type = uint64x2_t;
0264             };
0265 
0266             template <>
0267             struct comp_return_type_impl<int64x2_t>
0268             {
0269                 using type = uint64x2_t;
0270             };
0271 
0272             template <>
0273             struct comp_return_type_impl<float32x4_t>
0274             {
0275                 using type = uint32x4_t;
0276             };
0277 
0278             template <class T>
0279             using comp_return_type = typename comp_return_type_impl<T>::type;
0280 
0281             template <class... T>
0282             struct neon_comp_dispatcher_impl : neon_dispatcher_base<comp_return_type, T...>
0283             {
0284             };
0285 
0286             using excluding_int64_comp_dispatcher = neon_comp_dispatcher_impl<uint8x16_t, int8x16_t,
0287                                                                               uint16x8_t, int16x8_t,
0288                                                                               uint32x4_t, int32x4_t,
0289                                                                               float32x4_t>;
0290 
0291             /**************************************
0292              * enabling / disabling metafunctions *
0293              **************************************/
0294 
0295             template <class T>
0296             using enable_neon_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value,
0297                                                                int>::type;
0298 
0299             template <class T>
0300             using exclude_int64_neon_t
0301                 = typename std::enable_if<(std::is_integral<T>::value && sizeof(T) != 8) || std::is_same<T, float>::value, int>::type;
0302         }
0303 
0304         /*************
0305          * broadcast *
0306          *************/
0307 
0308         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
0309         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0310         {
0311             return vdupq_n_u8(uint8_t(val));
0312         }
0313 
0314         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
0315         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0316         {
0317             return vdupq_n_s8(int8_t(val));
0318         }
0319 
0320         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
0321         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0322         {
0323             return vdupq_n_u16(uint16_t(val));
0324         }
0325 
0326         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
0327         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0328         {
0329             return vdupq_n_s16(int16_t(val));
0330         }
0331 
0332         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0333         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0334         {
0335             return vdupq_n_u32(uint32_t(val));
0336         }
0337 
0338         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0339         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0340         {
0341             return vdupq_n_s32(int32_t(val));
0342         }
0343 
0344         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0345         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0346         {
0347             return vdupq_n_u64(uint64_t(val));
0348         }
0349 
0350         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0351         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0352         {
0353             return vdupq_n_s64(int64_t(val));
0354         }
0355 
0356         template <class A>
0357         XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<neon>) noexcept
0358         {
0359             return vdupq_n_f32(val);
0360         }
0361 
0362         /*******
0363          * set *
0364          *******/
0365 
0366         template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
0367         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<neon>, Args... args) noexcept
0368         {
0369             return xsimd::types::detail::neon_vector_type<T> { args... };
0370         }
0371 
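        // batch_bool lanes are full-width bit masks, so each boolean argument is broadcast
        // to all-ones (true) or all-zeros (false) in the corresponding unsigned lane type.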
0372         template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
0373         XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... args) noexcept
0374         {
0375             using register_type = typename batch_bool<T, A>::register_type;
0376             using unsigned_type = as_unsigned_integer_t<T>;
0377             return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
0378         }
0379 
0380         template <class A>
0381         XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<neon>, float f0, float f1, float f2, float f3) noexcept
0382         {
0383             return float32x4_t { f0, f1, f2, f3 };
0384         }
0385 
0386         template <class A>
0387         XSIMD_INLINE batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>,
0388                                                        std::complex<float> c0, std::complex<float> c1,
0389                                                        std::complex<float> c2, std::complex<float> c3) noexcept
0390         {
0391             return batch<std::complex<float>, A>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() },
0392                                                  float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() });
0393         }
0394 
0395         template <class A, class... Args>
0396         XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... args) noexcept
0397         {
0398             using register_type = typename batch_bool<float, A>::register_type;
0399             using unsigned_type = as_unsigned_integer_t<float>;
0400             return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
0401         }
0402 
0403         /*************
0404          * from_bool *
0405          *************/
0406 
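        // A true lane of a batch_bool is an all-ones mask; AND-ing it with a register of
        // ones (or with the bit pattern of 1.0f for float) maps the mask to the numeric
        // values 1 and 0.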
0407         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
0408         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0409         {
0410             return vandq_u8(arg, vdupq_n_u8(1));
0411         }
0412 
0413         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
0414         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0415         {
0416             return vandq_s8(reinterpret_cast<int8x16_t>(arg.data), vdupq_n_s8(1));
0417         }
0418 
0419         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
0420         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0421         {
0422             return vandq_u16(arg, vdupq_n_u16(1));
0423         }
0424 
0425         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
0426         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0427         {
0428             return vandq_s16(reinterpret_cast<int16x8_t>(arg.data), vdupq_n_s16(1));
0429         }
0430 
0431         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0432         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0433         {
0434             return vandq_u32(arg, vdupq_n_u32(1));
0435         }
0436 
0437         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0438         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0439         {
0440             return vandq_s32(reinterpret_cast<int32x4_t>(arg.data), vdupq_n_s32(1));
0441         }
0442 
0443         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0444         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0445         {
0446             return vandq_u64(arg, vdupq_n_u64(1));
0447         }
0448 
0449         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0450         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0451         {
0452             return vandq_s64(reinterpret_cast<int64x2_t>(arg.data), vdupq_n_s64(1));
0453         }
0454 
0455         template <class A>
0456         XSIMD_INLINE batch<float, A> from_bool(batch_bool<float, A> const& arg, requires_arch<neon>) noexcept
0457         {
0458             return vreinterpretq_f32_u32(vandq_u32(arg, vreinterpretq_u32_f32(vdupq_n_f32(1.f))));
0459         }
0460 
0461         /********
0462          * load *
0463          ********/
0464 
0465         // It is not possible to use a call to A::alignment() here, so use an
0466         // immediate instead.
0467 #if defined(__clang__) || defined(__GNUC__)
0468 #define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
0469 #elif defined(_MSC_VER)
0470 #define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
0471 #else
0472 #define xsimd_aligned_load(inst, type, expr) inst((type)expr)
0473 #endif
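
        // For example, on GCC/Clang xsimd_aligned_load(vld1q_u8, uint8_t*, src) expands to
        //   vld1q_u8((uint8_t*)__builtin_assume_aligned(src, 16))
        // which lets the compiler assume, and take advantage of, 16-byte alignment.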
0474 
0475         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
0476         XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0477         {
0478             return xsimd_aligned_load(vld1q_u8, uint8_t*, src);
0479         }
0480 
0481         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
0482         XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0483         {
0484             return xsimd_aligned_load(vld1q_s8, int8_t*, src);
0485         }
0486 
0487         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
0488         XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0489         {
0490             return xsimd_aligned_load(vld1q_u16, uint16_t*, src);
0491         }
0492         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
0493         XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0494         {
0495             return xsimd_aligned_load(vld1q_s16, int16_t*, src);
0496         }
0497         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0498         XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0499         {
0500             return xsimd_aligned_load(vld1q_u32, uint32_t*, src);
0501         }
0502         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0503         XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0504         {
0505             return xsimd_aligned_load(vld1q_s32, int32_t*, src);
0506         }
0507         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0508         XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0509         {
0510             return xsimd_aligned_load(vld1q_u64, uint64_t*, src);
0511         }
0512         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0513         XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0514         {
0515             return xsimd_aligned_load(vld1q_s64, int64_t*, src);
0516         }
0517 
0518         template <class A>
0519         XSIMD_INLINE batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept
0520         {
0521             return xsimd_aligned_load(vld1q_f32, float*, src);
0522         }
0523 
0524 #undef xsimd_aligned_load
0525 
0526         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
0527         XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0528         {
0529             return vld1q_u8((uint8_t*)src);
0530         }
0531 
0532         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
0533         XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0534         {
0535             return vld1q_s8((int8_t*)src);
0536         }
0537 
0538         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
0539         XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0540         {
0541             return vld1q_u16((uint16_t*)src);
0542         }
0543         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
0544         XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0545         {
0546             return vld1q_s16((int16_t*)src);
0547         }
0548         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0549         XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0550         {
0551             return vld1q_u32((uint32_t*)src);
0552         }
0553         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0554         XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0555         {
0556             return vld1q_s32((int32_t*)src);
0557         }
0558         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0559         XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0560         {
0561             return vld1q_u64((uint64_t*)src);
0562         }
0563         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0564         XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0565         {
0566             return vld1q_s64((int64_t*)src);
0567         }
0568 
0569         template <class A>
0570         XSIMD_INLINE batch<float, A> load_unaligned(float const* src, convert<float>, requires_arch<neon>) noexcept
0571         {
0572             return vld1q_f32(src);
0573         }
0574 
0575         /*********
0576          * store *
0577          *********/
0578 
0579         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
0580         XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0581         {
0582             vst1q_u8((uint8_t*)dst, src);
0583         }
0584 
0585         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
0586         XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0587         {
0588             vst1q_s8((int8_t*)dst, src);
0589         }
0590 
0591         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
0592         XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0593         {
0594             vst1q_u16((uint16_t*)dst, src);
0595         }
0596 
0597         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
0598         XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0599         {
0600             vst1q_s16((int16_t*)dst, src);
0601         }
0602 
0603         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0604         XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0605         {
0606             vst1q_u32((uint32_t*)dst, src);
0607         }
0608 
0609         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0610         XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0611         {
0612             vst1q_s32((int32_t*)dst, src);
0613         }
0614 
0615         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0616         XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0617         {
0618             vst1q_u64((uint64_t*)dst, src);
0619         }
0620 
0621         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0622         XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0623         {
0624             vst1q_s64((int64_t*)dst, src);
0625         }
0626 
0627         template <class A>
0628         XSIMD_INLINE void store_aligned(float* dst, batch<float, A> const& src, requires_arch<neon>) noexcept
0629         {
0630             vst1q_f32(dst, src);
0631         }
0632 
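        // The plain vst1q_* stores used above have no alignment requirement, so the
        // unaligned store can simply forward to the aligned implementation.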
0633         template <class A, class T>
0634         XSIMD_INLINE void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0635         {
0636             store_aligned<A>(dst, src, A {});
0637         }
0638 
0639         /****************
0640          * load_complex *
0641          ****************/
0642 
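        // vld2q_f32 loads interleaved (real, imag) pairs and de-interleaves them into two
        // registers, yielding the real and imaginary batches in a single operation.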
0643         template <class A>
0644         XSIMD_INLINE batch<std::complex<float>, A> load_complex_aligned(std::complex<float> const* mem, convert<std::complex<float>>, requires_arch<neon>) noexcept
0645         {
0646             using real_batch = batch<float, A>;
0647             const float* buf = reinterpret_cast<const float*>(mem);
0648             float32x4x2_t tmp = vld2q_f32(buf);
0649             real_batch real = tmp.val[0],
0650                        imag = tmp.val[1];
0651             return batch<std::complex<float>, A> { real, imag };
0652         }
0653 
0654         template <class A>
0655         XSIMD_INLINE batch<std::complex<float>, A> load_complex_unaligned(std::complex<float> const* mem, convert<std::complex<float>> cvt, requires_arch<neon>) noexcept
0656         {
0657             return load_complex_aligned<A>(mem, cvt, A {});
0658         }
0659 
0660         /*****************
0661          * store_complex *
0662          *****************/
0663 
0664         template <class A>
0665         XSIMD_INLINE void store_complex_aligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
0666         {
0667             float32x4x2_t tmp;
0668             tmp.val[0] = src.real();
0669             tmp.val[1] = src.imag();
0670             float* buf = reinterpret_cast<float*>(dst);
0671             vst2q_f32(buf, tmp);
0672         }
0673 
0674         template <class A>
0675         XSIMD_INLINE void store_complex_unaligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
0676         {
0677             store_complex_aligned(dst, src, A {});
0678         }
0679 
0680         /*******
0681          * neg *
0682          *******/
0683 
0684         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
0685         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0686         {
0687             return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(rhs)));
0688         }
0689 
0690         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
0691         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0692         {
0693             return vnegq_s8(rhs);
0694         }
0695 
0696         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
0697         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0698         {
0699             return vreinterpretq_u16_s16(vnegq_s16(vreinterpretq_s16_u16(rhs)));
0700         }
0701 
0702         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
0703         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0704         {
0705             return vnegq_s16(rhs);
0706         }
0707 
0708         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0709         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0710         {
0711             return vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(rhs)));
0712         }
0713 
0714         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0715         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0716         {
0717             return vnegq_s32(rhs);
0718         }
0719 
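        // Plain NEON has no vnegq_s64 (it is AArch64-only), so 64-bit negation falls back
        // to per-lane scalar arithmetic.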
0720         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0721         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0722         {
0723             return batch<T, A> { -rhs.get(0), -rhs.get(1) };
0724         }
0725 
0726         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0727         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0728         {
0729             return batch<T, A> { -rhs.get(0), -rhs.get(1) };
0730         }
0731 
0732         template <class A>
0733         XSIMD_INLINE batch<float, A> neg(batch<float, A> const& rhs, requires_arch<neon>) noexcept
0734         {
0735             return vnegq_f32(rhs);
0736         }
0737 
0738         /*******
0739          * add *
0740          *******/
0741 
0742         WRAP_BINARY_INT(vaddq, detail::identity_return_type)
0743         WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type)
0744 
0745         template <class A, class T, detail::enable_neon_type_t<T> = 0>
0746         XSIMD_INLINE batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0747         {
0748             using register_type = typename batch<T, A>::register_type;
0749             const detail::neon_dispatcher::binary dispatcher = {
0750                 std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16,
0751                                 wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64,
0752                                 wrap::vaddq_f32)
0753             };
0754             return dispatcher.apply(register_type(lhs), register_type(rhs));
0755         }
0756 
0757         /*******
0758          * avg *
0759          *******/
0760 
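        // vhaddq_* computes the truncating average (a + b) >> 1, with the addition performed
        // at full precision so it cannot overflow.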
0761         WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type)
0762 
0763         template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type>
0764         XSIMD_INLINE batch<T, A> avg(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0765         {
0766             using register_type = typename batch<T, A>::register_type;
0767             const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
0768                 std::make_tuple(wrap::vhaddq_u8, wrap::vhaddq_u16, wrap::vhaddq_u32)
0769             };
0770             return dispatcher.apply(register_type(lhs), register_type(rhs));
0771         }
0772 
0773         /********
0774          * avgr *
0775          ********/
0776 
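        // vrhaddq_* computes the rounding average (a + b + 1) >> 1, again without
        // intermediate overflow.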
0777         WRAP_BINARY_UINT_EXCLUDING_64(vrhaddq, detail::identity_return_type)
0778 
0779         template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type>
0780         XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0781         {
0782             using register_type = typename batch<T, A>::register_type;
0783             const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
0784                 std::make_tuple(wrap::vrhaddq_u8, wrap::vrhaddq_u16, wrap::vrhaddq_u32)
0785             };
0786             return dispatcher.apply(register_type(lhs), register_type(rhs));
0787         }
0788 
0789         /********
0790          * sadd *
0791          ********/
0792 
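        // Floating-point addition does not saturate, so the float entry of the saturated
        // dispatchers below simply reuses the plain vaddq_f32 / vsubq_f32 wrappers.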
0793         WRAP_BINARY_INT(vqaddq, detail::identity_return_type)
0794 
0795         template <class A, class T, detail::enable_neon_type_t<T> = 0>
0796         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0797         {
0798             using register_type = typename batch<T, A>::register_type;
0799             const detail::neon_dispatcher::binary dispatcher = {
0800                 std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16,
0801                                 wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64,
0802                                 wrap::vaddq_f32)
0803             };
0804             return dispatcher.apply(register_type(lhs), register_type(rhs));
0805         }
0806 
0807         /*******
0808          * sub *
0809          *******/
0810 
0811         WRAP_BINARY_INT(vsubq, detail::identity_return_type)
0812         WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type)
0813 
0814         template <class A, class T, detail::enable_neon_type_t<T> = 0>
0815         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0816         {
0817             using register_type = typename batch<T, A>::register_type;
0818             const detail::neon_dispatcher::binary dispatcher = {
0819                 std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16,
0820                                 wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64,
0821                                 wrap::vsubq_f32)
0822             };
0823             return dispatcher.apply(register_type(lhs), register_type(rhs));
0824         }
0825 
0826         /********
0827          * ssub *
0828          ********/
0829 
0830         WRAP_BINARY_INT(vqsubq, detail::identity_return_type)
0831 
0832         template <class A, class T, detail::enable_neon_type_t<T> = 0>
0833         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0834         {
0835             using register_type = typename batch<T, A>::register_type;
0836             const detail::neon_dispatcher::binary dispatcher = {
0837                 std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16,
0838                                 wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64,
0839                                 wrap::vsubq_f32)
0840             };
0841             return dispatcher.apply(register_type(lhs), register_type(rhs));
0842         }
0843 
0844         /*******
0845          * mul *
0846          *******/
0847 
0848         WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type)
0849         WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type)
0850 
0851         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
0852         XSIMD_INLINE batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0853         {
0854             using register_type = typename batch<T, A>::register_type;
0855             const detail::excluding_int64_dispatcher::binary dispatcher = {
0856                 std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16,
0857                                 wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32)
0858             };
0859             return dispatcher.apply(register_type(lhs), register_type(rhs));
0860         }
0861 
0862         /*******
0863          * div *
0864          *******/
0865 
0866 #if defined(XSIMD_FAST_INTEGER_DIVISION)
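        // Integer division implemented through float32: fast, but only exact while the
        // operands fit in float's 24-bit mantissa, hence the opt-in macro.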
0867         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0868         XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0869         {
0870             return vcvtq_s32_f32(vcvtq_f32_s32(lhs) / vcvtq_f32_s32(rhs));
0871         }
0872 
0873         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0874         XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0875         {
0876             return vcvtq_u32_f32(vcvtq_f32_u32(lhs) / vcvtq_f32_u32(rhs));
0877         }
0878 #endif
0879 
0880         template <class A>
0881         XSIMD_INLINE batch<float, A> div(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
0882         {
0883             // adapted from Stack Overflow and https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html
0884             // get an initial estimate of 1/b.
0885             float32x4_t rcp = reciprocal(rhs);
0886 
0887             // use a couple Newton-Raphson steps to refine the estimate.  Depending on your
0888             // application's accuracy requirements, you may be able to get away with only
0889             // one refinement (instead of the two used here).  Be sure to test!
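            // (vrecpsq_f32(rhs, rcp) evaluates 2 - rhs * rcp, so each line below performs one
            // Newton-Raphson iteration rcp <- rcp * (2 - rhs * rcp) towards 1 / rhs.)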
0890             rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp);
0891             rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp);
0892 
0893             // and finally, compute a / b = a * (1 / b)
0894             return vmulq_f32(lhs, rcp);
0895         }
0896 
0897         /******
0898          * eq *
0899          ******/
0900 
0901         WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type)
0902         WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type)
0903 
0904         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
0905         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0906         {
0907             using register_type = typename batch<T, A>::register_type;
0908             const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
0909                 std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16,
0910                                 wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32)
0911             };
0912             return dispatcher.apply(register_type(lhs), register_type(rhs));
0913         }
0914 
0915         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
0916         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
0917         {
0918             using register_type = typename batch_bool<T, A>::register_type;
0919             using dispatcher_type = detail::neon_comp_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary;
0920             const dispatcher_type dispatcher = {
0921                 std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32)
0922             };
0923             return dispatcher.apply(register_type(lhs), register_type(rhs));
0924         }
0925 
0926         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
0927         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0928         {
0929             return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
0930         }
0931 
0932         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
0933         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
0934         {
0935             return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
0936         }
0937 
0938         /*************
0939          * fast_cast *
0940          *************/
0941 
0942         namespace detail
0943         {
0944             template <class A>
0945             XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
0946             {
0947                 return vcvtq_f32_s32(self);
0948             }
0949 
0950             template <class A>
0951             XSIMD_INLINE batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
0952             {
0953                 return vcvtq_f32_u32(self);
0954             }
0955 
0956             template <class A>
0957             XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<neon>) noexcept
0958             {
0959                 return vcvtq_s32_f32(self);
0960             }
0961 
0962             template <class A>
0963             XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<neon>) noexcept
0964             {
0965                 return vcvtq_u32_f32(self);
0966             }
0967 
0968         }
0969 
0970         /******
0971          * lt *
0972          ******/
0973 
0974         WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type)
0975         WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type)
0976 
0977         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
0978         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0979         {
0980             using register_type = typename batch<T, A>::register_type;
0981             const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
0982                 std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16,
0983                                 wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32)
0984             };
0985             return dispatcher.apply(register_type(lhs), register_type(rhs));
0986         }
0987 
0988         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
0989         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0990         {
0991             return batch_bool<T, A>({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) });
0992         }
0993 
0994         /******
0995          * le *
0996          ******/
0997 
0998         WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type)
0999         WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type)
1000 
1001         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1002         XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1003         {
1004             using register_type = typename batch<T, A>::register_type;
1005             const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
1006                 std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16,
1007                                 wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32)
1008             };
1009             return dispatcher.apply(register_type(lhs), register_type(rhs));
1010         }
1011 
1012         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1013         XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1014         {
1015             return batch_bool<T, A>({ lhs.get(0) <= rhs.get(0), lhs.get(1) <= rhs.get(1) });
1016         }
1017 
1018         /******
1019          * gt *
1020          ******/
1021 
1022         WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type)
1023         WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type)
1024 
1025         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1026         XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1027         {
1028             using register_type = typename batch<T, A>::register_type;
1029             const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
1030                 std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16,
1031                                 wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32)
1032             };
1033             return dispatcher.apply(register_type(lhs), register_type(rhs));
1034         }
1035 
1036         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1037         XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1038         {
1039             return batch_bool<T, A>({ lhs.get(0) > rhs.get(0), lhs.get(1) > rhs.get(1) });
1040         }
1041 
1042         /******
1043          * ge *
1044          ******/
1045 
1046         WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type)
1047         WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type)
1048 
1049         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1050         XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1051         {
1052             using register_type = typename batch<T, A>::register_type;
1053             const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
1054                 std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16,
1055                                 wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32)
1056             };
1057             return dispatcher.apply(register_type(lhs), register_type(rhs));
1058         }
1059 
1060         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1061         XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1062         {
1063             return batch_bool<T, A>({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) });
1064         }
1065 
1066         /*******************
1067          * batch_bool_cast *
1068          *******************/
1069 
1070         template <class A, class T_out, class T_in>
1071         XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon>) noexcept
1072         {
1073             using register_type = typename batch_bool<T_out, A>::register_type;
1074             return register_type(self);
1075         }
1076 
1077         /***************
1078          * bitwise_and *
1079          ***************/
1080 
1081         WRAP_BINARY_INT(vandq, detail::identity_return_type)
1082 
1083         namespace detail
1084         {
1085             XSIMD_INLINE float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) noexcept
1086             {
1087                 return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs),
1088                                                        vreinterpretq_u32_f32(rhs)));
1089             }
1090 
1091             template <class V>
1092             XSIMD_INLINE V bitwise_and_neon(V const& lhs, V const& rhs) noexcept
1093             {
1094                 const neon_dispatcher::binary dispatcher = {
1095                     std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16,
1096                                     wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64,
1097                                     bitwise_and_f32)
1098                 };
1099                 return dispatcher.apply(lhs, rhs);
1100             }
1101         }
1102 
1103         template <class A, class T, detail::enable_neon_type_t<T> = 0>
1104         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1105         {
1106             using register_type = typename batch<T, A>::register_type;
1107             return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
1108         }
1109 
1110         template <class A, class T, detail::enable_neon_type_t<T> = 0>
1111         XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1112         {
1113             using register_type = typename batch_bool<T, A>::register_type;
1114             return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
1115         }
1116 
1117         /**************
1118          * bitwise_or *
1119          **************/
1120 
1121         WRAP_BINARY_INT(vorrq, detail::identity_return_type)
1122 
1123         namespace detail
1124         {
1125             XSIMD_INLINE float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept
1126             {
1127                 return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs),
1128                                                        vreinterpretq_u32_f32(rhs)));
1129             }
1130 
1131             template <class V>
1132             XSIMD_INLINE V bitwise_or_neon(V const& lhs, V const& rhs) noexcept
1133             {
1134                 const neon_dispatcher::binary dispatcher = {
1135                     std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16,
1136                                     wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64,
1137                                     bitwise_or_f32)
1138                 };
1139                 return dispatcher.apply(lhs, rhs);
1140             }
1141         }
1142 
1143         template <class A, class T, detail::enable_neon_type_t<T> = 0>
1144         XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1145         {
1146             using register_type = typename batch<T, A>::register_type;
1147             return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
1148         }
1149 
1150         template <class A, class T, detail::enable_neon_type_t<T> = 0>
1151         XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1152         {
1153             using register_type = typename batch_bool<T, A>::register_type;
1154             return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
1155         }
1156 
1157         /***************
1158          * bitwise_xor *
1159          ***************/
1160 
1161         WRAP_BINARY_INT(veorq, detail::identity_return_type)
1162 
1163         namespace detail
1164         {
1165             XSIMD_INLINE float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept
1166             {
1167                 return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(lhs),
1168                                                        vreinterpretq_u32_f32(rhs)));
1169             }
1170 
1171             template <class V>
1172             XSIMD_INLINE V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept
1173             {
1174                 const neon_dispatcher::binary dispatcher = {
1175                     std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16,
1176                                     wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64,
1177                                     bitwise_xor_f32)
1178                 };
1179                 return dispatcher.apply(lhs, rhs);
1180             }
1181         }
1182 
1183         template <class A, class T, detail::enable_neon_type_t<T> = 0>
1184         XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1185         {
1186             using register_type = typename batch<T, A>::register_type;
1187             return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
1188         }
1189 
1190         template <class A, class T, detail::enable_neon_type_t<T> = 0>
1191         XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1192         {
1193             using register_type = typename batch_bool<T, A>::register_type;
1194             return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
1195         }
1196 
1197         /*******
1198          * neq *
1199          *******/
1200 
1201         template <class A, class T>
1202         XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1203         {
1204             return bitwise_xor(lhs, rhs, A {});
1205         }
1206 
1207         /***************
1208          * bitwise_not *
1209          ***************/
1210 
1211         WRAP_UNARY_INT_EXCLUDING_64(vmvnq)
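        // NEON has no 64-bit variant of vmvnq, so the 64-bit (and float) cases below
        // reinterpret to 32-bit lanes, invert, and reinterpret back.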
1212 
1213         namespace detail
1214         {
1215             XSIMD_INLINE int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
1216             {
1217                 return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
1218             }
1219 
1220             XSIMD_INLINE uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
1221             {
1222                 return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
1223             }
1224 
1225             XSIMD_INLINE float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
1226             {
1227                 return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
1228             }
1229 
1230             template <class V>
1231             XSIMD_INLINE V bitwise_not_neon(V const& arg) noexcept
1232             {
1233                 const neon_dispatcher::unary dispatcher = {
1234                     std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
1235                                     wrap::vmvnq_u32, wrap::vmvnq_s32,
1236                                     bitwise_not_u64, bitwise_not_s64,
1237                                     bitwise_not_f32)
1238                 };
1239                 return dispatcher.apply(arg);
1240             }
1241         }
1242 
1243         template <class A, class T, detail::enable_neon_type_t<T> = 0>
1244         XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept
1245         {
1246             using register_type = typename batch<T, A>::register_type;
1247             return detail::bitwise_not_neon(register_type(arg));
1248         }
1249 
1250         template <class A, class T, detail::enable_neon_type_t<T> = 0>
1251         XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
1252         {
1253             using register_type = typename batch_bool<T, A>::register_type;
1254             return detail::bitwise_not_neon(register_type(arg));
1255         }
1256 
1257         /******************
1258          * bitwise_andnot *
1259          ******************/
1260 
1261         WRAP_BINARY_INT(vbicq, detail::identity_return_type)
1262 
1263         namespace detail
1264         {
1265             XSIMD_INLINE float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept
1266             {
1267                 return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs)));
1268             }
1269 
1270             template <class V>
1271             XSIMD_INLINE V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept
1272             {
1273                 const detail::neon_dispatcher::binary dispatcher = {
1274                     std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16,
1275                                     wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64,
1276                                     bitwise_andnot_f32)
1277                 };
1278                 return dispatcher.apply(lhs, rhs);
1279             }
1280         }
1281 
1282         template <class A, class T, detail::enable_neon_type_t<T> = 0>
1283         XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1284         {
1285             using register_type = typename batch<T, A>::register_type;
1286             return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
1287         }
1288 
1289         template <class A, class T, detail::enable_neon_type_t<T> = 0>
1290         XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1291         {
1292             using register_type = typename batch_bool<T, A>::register_type;
1293             return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
1294         }
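        // Note on operand order: vbicq_*(a, b) computes a & ~b, i.e. the bits of the
        // first operand with the second operand's bits cleared, which matches the
        // bitwise_andnot(lhs, rhs) == lhs & ~rhs convention used here.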
1295 
1296         /*******
1297          * min *
1298          *******/
1299 
1300         WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type)
1301         WRAP_BINARY_FLOAT(vminq, detail::identity_return_type)
1302 
1303         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1304         XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1305         {
1306             using register_type = typename batch<T, A>::register_type;
1307             const detail::excluding_int64_dispatcher::binary dispatcher = {
1308                 std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16,
1309                                 wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32)
1310             };
1311             return dispatcher.apply(register_type(lhs), register_type(rhs));
1312         }
1313 
1314         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1315         XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1316         {
1317             return { std::min(lhs.get(0), rhs.get(0)), std::min(lhs.get(1), rhs.get(1)) };
1318         }
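        // NEON provides no vminq/vmaxq intrinsics for 64-bit lanes, so the 64-bit
        // integer min (and max below) fall back to a per-lane scalar computation.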
1319 
1320         /*******
1321          * max *
1322          *******/
1323 
1324         WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type)
1325         WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type)
1326 
1327         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1328         XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1329         {
1330             using register_type = typename batch<T, A>::register_type;
1331             const detail::excluding_int64_dispatcher::binary dispatcher = {
1332                 std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16,
1333                                 wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32)
1334             };
1335             return dispatcher.apply(register_type(lhs), register_type(rhs));
1336         }
1337 
1338         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1339         XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1340         {
1341             return { std::max(lhs.get(0), rhs.get(0)), std::max(lhs.get(1), rhs.get(1)) };
1342         }
1343 
1344         /*******
1345          * abs *
1346          *******/
1347 
1348         namespace wrap
1349         {
1350             XSIMD_INLINE int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); }
1351             XSIMD_INLINE int16x8_t vabsq_s16(int16x8_t a) noexcept { return ::vabsq_s16(a); }
1352             XSIMD_INLINE int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); }
1353         }
1354         WRAP_UNARY_FLOAT(vabsq)
1355 
1356         namespace detail
1357         {
1358             XSIMD_INLINE uint8x16_t abs_u8(uint8x16_t arg) noexcept
1359             {
1360                 return arg;
1361             }
1362 
1363             XSIMD_INLINE uint16x8_t abs_u16(uint16x8_t arg) noexcept
1364             {
1365                 return arg;
1366             }
1367 
1368             XSIMD_INLINE uint32x4_t abs_u32(uint32x4_t arg) noexcept
1369             {
1370                 return arg;
1371             }
1372         }
1373 
1374         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1375         XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<neon>) noexcept
1376         {
1377             using register_type = typename batch<T, A>::register_type;
1378             const detail::excluding_int64_dispatcher::unary dispatcher = {
1379                 std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16,
1380                                 detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32)
1381             };
1382             return dispatcher.apply(register_type(arg));
1383         }
1384 
1385         /*********
1386          * rsqrt *
1387          *********/
1388 
1389         template <class A>
1390         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
1391         {
1392             return vrsqrteq_f32(arg);
1393         }
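        // vrsqrteq_f32 only returns a low-precision reciprocal square root estimate;
        // callers needing more accuracy can refine it with vrsqrtsq_f32, as done in
        // sqrt below.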
1394 
1395         /********
1396          * sqrt *
1397          ********/
1398 
1399         template <class A>
1400         XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
1401         {
1402             batch<float, A> sqrt_reciprocal = vrsqrteq_f32(arg);
1403             // one Newton-Raphson iteration to refine the reciprocal square root estimate
1404             sqrt_reciprocal = sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
1405             batch<float, A> sqrt_approx = arg * sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
1406             batch<float, A> zero(0.f);
1407             return select(arg == zero, zero, sqrt_approx);
1408         }
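        // Illustrative sketch of the refinement above (assumed example values, not
        // part of the dispatch): vrsqrtsq_f32(d * x, x) evaluates (3 - d * x * x) / 2,
        // so x * vrsqrtsq_f32(d * x, x) is one Newton-Raphson step towards 1/sqrt(d):
        //
        //   float32x4_t d  = vdupq_n_f32(2.0f);
        //   float32x4_t x0 = vrsqrteq_f32(d);                                   // rough 1/sqrt(2)
        //   float32x4_t x1 = vmulq_f32(x0, vrsqrtsq_f32(vmulq_f32(d, x0), x0)); // refined estimate
        //   float32x4_t s  = vmulq_f32(d, x1);                                  // ~sqrt(2)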
1409 
1410         /********************
1411          * Fused operations *
1412          ********************/
1413 
1414 #ifdef __ARM_FEATURE_FMA
1415         template <class A>
1416         XSIMD_INLINE batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
1417         {
1418             return vfmaq_f32(z, x, y);
1419         }
1420 
1421         template <class A>
1422         XSIMD_INLINE batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
1423         {
1424             return vfmaq_f32(-z, x, y);
1425         }
1426 #endif
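        // With FMA available, fma(x, y, z) maps to a single vfmaq_f32(z, x, y), i.e.
        // z + x * y, and fms(x, y, z) negates the accumulator to obtain x * y - z.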
1427 
1428         /*********
1429          * haddp *
1430          *********/
1431 
1432         template <class A>
1433         XSIMD_INLINE batch<float, A> haddp(const batch<float, A>* row, requires_arch<neon>) noexcept
1434         {
1435             // row = (a,b,c,d)
1436             float32x2_t tmp1, tmp2, tmp3;
1437             // tmp1 = (a0 + a1, a2 + a3)
1438             tmp1 = vpadd_f32(vget_low_f32(row[0]), vget_high_f32(row[0]));
1439             // tmp2 = (b0 + b1, b2 + b3)
1440             tmp2 = vpadd_f32(vget_low_f32(row[1]), vget_high_f32(row[1]));
1441             // tmp1 = (a0..3, b0..3)
1442             tmp1 = vpadd_f32(tmp1, tmp2);
1443             // tmp2 = (c0 + c1, c2 + c3)
1444             tmp2 = vpadd_f32(vget_low_f32(row[2]), vget_high_f32(row[2]));
1445             // tmp3 = (d0 + d1, d2 + d3)
1446             tmp3 = vpadd_f32(vget_low_f32(row[3]), vget_high_f32(row[3]));
1447             // tmp2 = (c0..3, d0..3)
1448             tmp2 = vpadd_f32(tmp2, tmp3);
1449             // return = (a0..3, b0..3, c0..3, d0..3)
1450             return vcombine_f32(tmp1, tmp2);
1451         }
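        // Worked example (hypothetical inputs): with row[0] = (1, 2, 3, 4) and
        // row[1] = (10, 20, 30, 40), the result holds the per-row sums, so lane 0
        // is 1 + 2 + 3 + 4 == 10 and lane 1 is 10 + 20 + 30 + 40 == 100.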
1452 
1453         /**************
1454          * reciprocal *
1455          **************/
1456 
1457         template <class A>
1458         XSIMD_INLINE batch<float, A>
1459         reciprocal(const batch<float, A>& x,
1460                    kernel::requires_arch<neon>) noexcept
1461         {
1462             return vrecpeq_f32(x);
1463         }
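        // vrecpeq_f32 is likewise only an estimate; a possible refinement (sketch,
        // not used by this kernel) is one Newton-Raphson step with vrecpsq_f32,
        // which evaluates 2 - d * x:
        //
        //   float32x4_t d  = vdupq_n_f32(3.0f);
        //   float32x4_t x0 = vrecpeq_f32(d);                     // rough 1/3
        //   float32x4_t x1 = vmulq_f32(x0, vrecpsq_f32(d, x0));  // x0 * (2 - d * x0)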
1464 
1465         /**********
1466          * insert *
1467          **********/
1468 
1469         template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 1> = 0>
1470         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1471         {
1472             return vsetq_lane_u8(val, self, I);
1473         }
1474 
1475         template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 1> = 0>
1476         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1477         {
1478             return vsetq_lane_s8(val, self, I);
1479         }
1480 
1481         template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 2> = 0>
1482         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1483         {
1484             return vsetq_lane_u16(val, self, I);
1485         }
1486 
1487         template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 2> = 0>
1488         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1489         {
1490             return vsetq_lane_s16(val, self, I);
1491         }
1492 
1493         template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 4> = 0>
1494         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1495         {
1496             return vsetq_lane_u32(val, self, I);
1497         }
1498 
1499         template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 4> = 0>
1500         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1501         {
1502             return vsetq_lane_s32(val, self, I);
1503         }
1504 
1505         template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 8> = 0>
1506         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1507         {
1508             return vsetq_lane_u64(val, self, I);
1509         }
1510 
1511         template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 8> = 0>
1512         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1513         {
1514             return vsetq_lane_s64(val, self, I);
1515         }
1516 
1517         template <class A, size_t I>
1518         XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<neon>) noexcept
1519         {
1520             return vsetq_lane_f32(val, self, I);
1521         }
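        // Each insert overload returns a copy of self with lane I replaced by val;
        // vsetq_lane_* requires the lane index to be a compile-time constant, which
        // is why I is a template parameter rather than a runtime argument.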
1522 
1523         /********************
1524          * nearbyint_as_int *
1525          ********************/
1526 
1527         template <class A>
1528         XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
1529                                                         requires_arch<neon>) noexcept
1530         {
1531             /* origin: https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047 */
1532             //  Contributors to this work are:
1533             //   John W. Ratcliff <jratcliffscarab@gmail.com>
1534             //   Brandon Rowlett <browlett@nvidia.com>
1535             //   Ken Fast <kfast@gdeb.com>
1536             //   Eric van Beurden <evanbeurden@nvidia.com>
1537             //   Alexander Potylitsin <apotylitsin@nvidia.com>
1538             //   Hasindu Gamaarachchi <hasindu2008@gmail.com>
1539             //   Jim Huang <jserv@biilabs.io>
1540             //   Mark Cheng <marktwtn@biilabs.io>
1541             //   Malcolm James MacLeod <malcolm@gulden.com>
1542             //   Devin Hussey (easyaspi314) <husseydevin@gmail.com>
1543             //   Sebastian Pop <spop@amazon.com>
1544             //   Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
1545             //   Danila Kutenin <danilak@google.com>
1546             //   François Turban (JishinMaster) <francois.turban@gmail.com>
1547             //   Pei-Hsuan Hung <afcidk@gmail.com>
1548             //   Yang-Hao Yuan <yanghau@biilabs.io>
1549             //   Syoyo Fujita <syoyo@lighttransport.com>
1550             //   Brecht Van Lommel <brecht@blender.org>
1551 
1552             /*
1553              * sse2neon is freely redistributable under the MIT License.
1554              *
1555              * Permission is hereby granted, free of charge, to any person obtaining a copy
1556              * of this software and associated documentation files (the "Software"), to deal
1557              * in the Software without restriction, including without limitation the rights
1558              * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1559              * copies of the Software, and to permit persons to whom the Software is
1560              * furnished to do so, subject to the following conditions:
1561              *
1562              * The above copyright notice and this permission notice shall be included in
1563              * all copies or substantial portions of the Software.
1564              *
1565              * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1566              * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1567              * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
1568              * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1569              * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
1570              * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
1571              * SOFTWARE.
1572              */
1573 
1574             const auto signmask = vdupq_n_u32(0x80000000);
1575             const auto half = vbslq_f32(signmask, self,
1576                                         vdupq_n_f32(0.5f)); /* +/- 0.5 */
1577             const auto r_normal = vcvtq_s32_f32(vaddq_f32(
1578                 self, half)); /* round to integer: [a + 0.5]*/
1579             const auto r_trunc = vcvtq_s32_f32(self); /* truncate to integer: [a] */
1580             const auto plusone = vreinterpretq_s32_u32(vshrq_n_u32(
1581                 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
1582             const auto r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
1583                                           vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
1584             const auto delta = vsubq_f32(
1585                 self,
1586                 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
1587             const auto is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
1588             return vbslq_s32(is_delta_half, r_even, r_normal);
1589         }
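        // The sequence above implements round-half-to-even: for example 0.5f and
        // -0.5f map to 0, 1.5f and 2.5f both map to 2, while 1.3f maps to 1.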
1590 
1591         /**************
1592          * reduce_add *
1593          **************/
1594 
1595         namespace detail
1596         {
1597             template <class T, class A, class V>
1598             XSIMD_INLINE T sum_batch(V const& arg) noexcept
1599             {
1600                 T res = T(0);
1601                 for (std::size_t i = 0; i < batch<T, A>::size; ++i)
1602                 {
1603                     res += arg[i];
1604                 }
1605                 return res;
1606             }
1607         }
1608 
1609         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1610         XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1611         {
1612             uint8x8_t tmp = vpadd_u8(vget_low_u8(arg), vget_high_u8(arg));
1613             tmp = vpadd_u8(tmp, tmp);
1614             tmp = vpadd_u8(tmp, tmp);
1615             tmp = vpadd_u8(tmp, tmp);
1616             return vget_lane_u8(tmp, 0);
1617         }
1618 
1619         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1620         XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1621         {
1622             int8x8_t tmp = vpadd_s8(vget_low_s8(arg), vget_high_s8(arg));
1623             tmp = vpadd_s8(tmp, tmp);
1624             tmp = vpadd_s8(tmp, tmp);
1625             tmp = vpadd_s8(tmp, tmp);
1626             return vget_lane_s8(tmp, 0);
1627         }
1628 
1629         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1630         XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1631         {
1632             uint16x4_t tmp = vpadd_u16(vget_low_u16(arg), vget_high_u16(arg));
1633             tmp = vpadd_u16(tmp, tmp);
1634             tmp = vpadd_u16(tmp, tmp);
1635             return vget_lane_u16(tmp, 0);
1636         }
1637 
1638         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1639         XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1640         {
1641             int16x4_t tmp = vpadd_s16(vget_low_s16(arg), vget_high_s16(arg));
1642             tmp = vpadd_s16(tmp, tmp);
1643             tmp = vpadd_s16(tmp, tmp);
1644             return vget_lane_s16(tmp, 0);
1645         }
1646 
1647         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1648         XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1649         {
1650             uint32x2_t tmp = vpadd_u32(vget_low_u32(arg), vget_high_u32(arg));
1651             tmp = vpadd_u32(tmp, tmp);
1652             return vget_lane_u32(tmp, 0);
1653         }
1654 
1655         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1656         XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1657         {
1658             int32x2_t tmp = vpadd_s32(vget_low_s32(arg), vget_high_s32(arg));
1659             tmp = vpadd_s32(tmp, tmp);
1660             return vget_lane_s32(tmp, 0);
1661         }
1662 
1663         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1664         XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1665         {
1666             return arg.get(0) + arg.get(1);
1667         }
1668 
1669         template <class A>
1670         XSIMD_INLINE float reduce_add(batch<float, A> const& arg, requires_arch<neon>) noexcept
1671         {
1672             float32x2_t tmp = vpadd_f32(vget_low_f32(arg), vget_high_f32(arg));
1673             tmp = vpadd_f32(tmp, tmp);
1674             return vget_lane_f32(tmp, 0);
1675         }
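        // The reductions above halve the vector with pairwise adds until one lane
        // remains, e.g. for (1, 2, 3, 4): vpadd gives (1 + 2, 3 + 4) = (3, 7), a
        // second vpadd gives (10, 10), and lane 0 holds the total.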
1676 
1677         /**************
1678          * reduce_max *
1679          **************/
1680 
1681         // Using generic implementation because ARM does not provide intrinsics
1682         // for this operation
1683 
1684         /**************
1685          * reduce_min *
1686          **************/
1687 
1688         // Using generic implementation because ARM does not provide intrinsics
1689         // for this operation
1690 
1691         /**********
1692          * select *
1693          **********/
1694 
1695         namespace wrap
1696         {
1697             XSIMD_INLINE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); }
1698             XSIMD_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return ::vbslq_s8(a, b, c); }
1699             XSIMD_INLINE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return ::vbslq_u16(a, b, c); }
1700             XSIMD_INLINE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return ::vbslq_s16(a, b, c); }
1701             XSIMD_INLINE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return ::vbslq_u32(a, b, c); }
1702             XSIMD_INLINE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return ::vbslq_s32(a, b, c); }
1703             XSIMD_INLINE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return ::vbslq_u64(a, b, c); }
1704             XSIMD_INLINE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return ::vbslq_s64(a, b, c); }
1705             XSIMD_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return ::vbslq_f32(a, b, c); }
1706         }
1707 
1708         namespace detail
1709         {
1710             template <class... T>
1711             struct neon_select_dispatcher_impl
1712             {
1713                 using container_type = std::tuple<T (*)(comp_return_type<T>, T, T)...>;
1714                 const container_type m_func;
1715 
1716                 template <class U>
1717                 U apply(comp_return_type<U> cond, U lhs, U rhs) const noexcept
1718                 {
1719                     using func_type = U (*)(comp_return_type<U>, U, U);
1720                     auto func = xsimd::detail::get<func_type>(m_func);
1721                     return func(cond, lhs, rhs);
1722                 }
1723             };
1724 
1725             using neon_select_dispatcher = neon_select_dispatcher_impl<uint8x16_t, int8x16_t,
1726                                                                        uint16x8_t, int16x8_t,
1727                                                                        uint32x4_t, int32x4_t,
1728                                                                        uint64x2_t, int64x2_t,
1729                                                                        float32x4_t>;
1730         }
1731 
1732         template <class A, class T, detail::enable_neon_type_t<T> = 0>
1733         XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<neon>) noexcept
1734         {
1735             using bool_register_type = typename batch_bool<T, A>::register_type;
1736             using register_type = typename batch<T, A>::register_type;
1737             const detail::neon_select_dispatcher dispatcher = {
1738                 std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8, wrap::vbslq_u16, wrap::vbslq_s16,
1739                                 wrap::vbslq_u32, wrap::vbslq_s32, wrap::vbslq_u64, wrap::vbslq_s64,
1740                                 wrap::vbslq_f32)
1741             };
1742             return dispatcher.apply(bool_register_type(cond), register_type(a), register_type(b));
1743         }
1744 
1745         template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0>
1746         XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept
1747         {
1748             return select(batch_bool<T, A> { b... }, true_br, false_br, neon {});
1749         }
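        // vbslq_* performs a bitwise select: each result bit comes from the second
        // operand where the corresponding mask bit is set and from the third operand
        // otherwise. Since batch_bool lanes are all-ones or all-zeros, this amounts
        // to a per-lane cond ? a : b.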
1750 
1751         /*************
1752          * transpose *
1753          *************/
1754         template <class A>
1755         XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<neon>) noexcept
1756         {
1757             assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
1758             (void)matrix_end;
1759             auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1760             auto t01 = vtrnq_f32(r0, r1);
1761             auto t23 = vtrnq_f32(r2, r3);
1762             matrix_begin[0] = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0]));
1763             matrix_begin[1] = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1]));
1764             matrix_begin[2] = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0]));
1765             matrix_begin[3] = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1]));
1766         }
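        // Worked example (hypothetical rows): with r0 = (a0, a1, a2, a3) through
        // r3 = (d0, d1, d2, d3), vtrnq interleaves 2x2 blocks and vcombine stitches
        // the halves, yielding rows (a0, b0, c0, d0), (a1, b1, c1, d1),
        // (a2, b2, c2, d2) and (a3, b3, c3, d3); the integer overloads below follow
        // the same scheme.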
1767         template <class A>
1768         XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<neon>) noexcept
1769         {
1770             assert((matrix_end - matrix_begin == batch<uint32_t, A>::size) && "correctly sized matrix");
1771             (void)matrix_end;
1772             auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1773             auto t01 = vtrnq_u32(r0, r1);
1774             auto t23 = vtrnq_u32(r2, r3);
1775             matrix_begin[0] = vcombine_u32(vget_low_u32(t01.val[0]), vget_low_u32(t23.val[0]));
1776             matrix_begin[1] = vcombine_u32(vget_low_u32(t01.val[1]), vget_low_u32(t23.val[1]));
1777             matrix_begin[2] = vcombine_u32(vget_high_u32(t01.val[0]), vget_high_u32(t23.val[0]));
1778             matrix_begin[3] = vcombine_u32(vget_high_u32(t01.val[1]), vget_high_u32(t23.val[1]));
1779         }
1780         template <class A>
1781         XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<neon>) noexcept
1782         {
1783             assert((matrix_end - matrix_begin == batch<int32_t, A>::size) && "correctly sized matrix");
1784             (void)matrix_end;
1785             auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1786             auto t01 = vtrnq_s32(r0, r1);
1787             auto t23 = vtrnq_s32(r2, r3);
1788             matrix_begin[0] = vcombine_s32(vget_low_s32(t01.val[0]), vget_low_s32(t23.val[0]));
1789             matrix_begin[1] = vcombine_s32(vget_low_s32(t01.val[1]), vget_low_s32(t23.val[1]));
1790             matrix_begin[2] = vcombine_s32(vget_high_s32(t01.val[0]), vget_high_s32(t23.val[0]));
1791             matrix_begin[3] = vcombine_s32(vget_high_s32(t01.val[1]), vget_high_s32(t23.val[1]));
1792         }
1793 
1794         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1795         XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
1796         {
1797             assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
1798             (void)matrix_end;
1799             auto r0 = matrix_begin[0], r1 = matrix_begin[1];
1800             matrix_begin[0] = vcombine_u64(vget_low_u64(r0), vget_low_u64(r1));
1801             matrix_begin[1] = vcombine_u64(vget_high_u64(r0), vget_high_u64(r1));
1802         }
1803 
1804         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1805         XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
1806         {
1807             assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
1808             (void)matrix_end;
1809             auto r0 = matrix_begin[0], r1 = matrix_begin[1];
1810             matrix_begin[0] = vcombine_s64(vget_low_s64(r0), vget_low_s64(r1));
1811             matrix_begin[1] = vcombine_s64(vget_high_s64(r0), vget_high_s64(r1));
1812         }
1813 
1814         /**********
1815          * zip_lo *
1816          **********/
1817 
1818         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1819         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1820         {
1821             uint8x8x2_t tmp = vzip_u8(vget_low_u8(lhs), vget_low_u8(rhs));
1822             return vcombine_u8(tmp.val[0], tmp.val[1]);
1823         }
1824 
1825         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1826         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1827         {
1828             int8x8x2_t tmp = vzip_s8(vget_low_s8(lhs), vget_low_s8(rhs));
1829             return vcombine_s8(tmp.val[0], tmp.val[1]);
1830         }
1831 
1832         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1833         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1834         {
1835             uint16x4x2_t tmp = vzip_u16(vget_low_u16(lhs), vget_low_u16(rhs));
1836             return vcombine_u16(tmp.val[0], tmp.val[1]);
1837         }
1838 
1839         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1840         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1841         {
1842             int16x4x2_t tmp = vzip_s16(vget_low_s16(lhs), vget_low_s16(rhs));
1843             return vcombine_s16(tmp.val[0], tmp.val[1]);
1844         }
1845 
1846         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1847         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1848         {
1849             uint32x2x2_t tmp = vzip_u32(vget_low_u32(lhs), vget_low_u32(rhs));
1850             return vcombine_u32(tmp.val[0], tmp.val[1]);
1851         }
1852 
1853         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1854         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1855         {
1856             int32x2x2_t tmp = vzip_s32(vget_low_s32(lhs), vget_low_s32(rhs));
1857             return vcombine_s32(tmp.val[0], tmp.val[1]);
1858         }
1859 
1860         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1861         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1862         {
1863             return vcombine_u64(vget_low_u64(lhs), vget_low_u64(rhs));
1864         }
1865 
1866         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1867         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1868         {
1869             return vcombine_s64(vget_low_s64(lhs), vget_low_s64(rhs));
1870         }
1871 
1872         template <class A>
1873         XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
1874         {
1875             float32x2x2_t tmp = vzip_f32(vget_low_f32(lhs), vget_low_f32(rhs));
1876             return vcombine_f32(tmp.val[0], tmp.val[1]);
1877         }
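        // zip_lo interleaves the low halves of its operands, e.g. for lhs =
        // (a0, a1, a2, a3) and rhs = (b0, b1, b2, b3) it returns (a0, b0, a1, b1);
        // zip_hi below does the same with the high halves, returning (a2, b2, a3, b3).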
1878 
1879         /**********
1880          * zip_hi *
1881          **********/
1882 
1883         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1884         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1885         {
1886             uint8x8x2_t tmp = vzip_u8(vget_high_u8(lhs), vget_high_u8(rhs));
1887             return vcombine_u8(tmp.val[0], tmp.val[1]);
1888         }
1889 
1890         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1891         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1892         {
1893             int8x8x2_t tmp = vzip_s8(vget_high_s8(lhs), vget_high_s8(rhs));
1894             return vcombine_s8(tmp.val[0], tmp.val[1]);
1895         }
1896 
1897         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1898         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1899         {
1900             uint16x4x2_t tmp = vzip_u16(vget_high_u16(lhs), vget_high_u16(rhs));
1901             return vcombine_u16(tmp.val[0], tmp.val[1]);
1902         }
1903 
1904         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1905         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1906         {
1907             int16x4x2_t tmp = vzip_s16(vget_high_s16(lhs), vget_high_s16(rhs));
1908             return vcombine_s16(tmp.val[0], tmp.val[1]);
1909         }
1910 
1911         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1912         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1913         {
1914             uint32x2x2_t tmp = vzip_u32(vget_high_u32(lhs), vget_high_u32(rhs));
1915             return vcombine_u32(tmp.val[0], tmp.val[1]);
1916         }
1917 
1918         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1919         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1920         {
1921             int32x2x2_t tmp = vzip_s32(vget_high_s32(lhs), vget_high_s32(rhs));
1922             return vcombine_s32(tmp.val[0], tmp.val[1]);
1923         }
1924 
1925         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1926         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1927         {
1928             return vcombine_u64(vget_high_u64(lhs), vget_high_u64(rhs));
1929         }
1930 
1931         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1932         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1933         {
1934             return vcombine_s64(vget_high_s64(lhs), vget_high_s64(rhs));
1935         }
1936 
1937         template <class A>
1938         XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
1939         {
1940             float32x2x2_t tmp = vzip_f32(vget_high_f32(lhs), vget_high_f32(rhs));
1941             return vcombine_f32(tmp.val[0], tmp.val[1]);
1942         }
1943 
1944         /****************
1945          * extract_pair *
1946          ****************/
1947 
1948         namespace detail
1949         {
1950             template <class A, class T>
1951             XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
1952             {
1953                 assert(false && "extract_pair out of bounds");
1954                 return batch<T, A> {};
1955             }
1956 
1957             template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
1958             XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1959             {
1960                 if (n == I)
1961                 {
1962                     return vextq_u8(rhs, lhs, I);
1963                 }
1964                 else
1965                 {
1966                     return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1967                 }
1968             }
1969 
1970             template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 1> = 0>
1971             XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1972             {
1973                 if (n == I)
1974                 {
1975                     return vextq_s8(rhs, lhs, I);
1976                 }
1977                 else
1978                 {
1979                     return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1980                 }
1981             }
1982 
1983             template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
1984             XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1985             {
1986                 if (n == I)
1987                 {
1988                     return vextq_u16(rhs, lhs, I);
1989                 }
1990                 else
1991                 {
1992                     return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1993                 }
1994             }
1995 
1996             template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 2> = 0>
1997             XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1998             {
1999                 if (n == I)
2000                 {
2001                     return vextq_s16(rhs, lhs, I);
2002                 }
2003                 else
2004                 {
2005                     return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2006                 }
2007             }
2008 
2009             template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
2010             XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2011             {
2012                 if (n == I)
2013                 {
2014                     return vextq_u32(rhs, lhs, I);
2015                 }
2016                 else
2017                 {
2018                     return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2019                 }
2020             }
2021 
2022             template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 4> = 0>
2023             XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2024             {
2025                 if (n == I)
2026                 {
2027                     return vextq_s32(rhs, lhs, I);
2028                 }
2029                 else
2030                 {
2031                     return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2032                 }
2033             }
2034 
2035             template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
2036             XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2037             {
2038                 if (n == I)
2039                 {
2040                     return vextq_u64(rhs, lhs, I);
2041                 }
2042                 else
2043                 {
2044                     return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2045                 }
2046             }
2047 
2048             template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 8> = 0>
2049             XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2050             {
2051                 if (n == I)
2052                 {
2053                     return vextq_s64(rhs, lhs, I);
2054                 }
2055                 else
2056                 {
2057                     return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2058                 }
2059             }
2060 
2061             template <class A, size_t I, size_t... Is>
2062             XSIMD_INLINE batch<float, A> extract_pair(batch<float, A> const& lhs, batch<float, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2063             {
2064                 if (n == I)
2065                 {
2066                     return vextq_f32(rhs, lhs, I);
2067                 }
2068                 else
2069                 {
2070                     return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2071                 }
2072             }
2073 
2074             template <class A, class T, size_t... Is>
2075             XSIMD_INLINE batch<T, A> extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
2076             {
2077                 if (n == 0)
2078                 {
2079                     return rhs;
2080                 }
2081                 else
2082                 {
2083                     return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2084                 }
2085             }
2086         }
2087 
2088         template <class A, class T>
2089         XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<neon>) noexcept
2090         {
2091             constexpr std::size_t size = batch<T, A>::size;
2092             assert(n < size && "index in bounds");
2093             return detail::extract_pair_impl(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>());
2094         }
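        // extract_pair views rhs:lhs as one double-width vector and returns the
        // window of batch-size elements starting at index n of rhs. For 32-bit lanes
        // with lhs = (l0, l1, l2, l3), rhs = (r0, r1, r2, r3) and n = 1, the result
        // is (r1, r2, r3, l0); vextq needs its index at compile time, hence the
        // index_sequence dispatch above.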
2095 
2096         /******************
2097          * bitwise_lshift *
2098          ******************/
2099 
2100         namespace detail
2101         {
2102             template <class A, class T>
2103             XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept
2104             {
2105                 assert(false && "bitwise_lshift out of bounds");
2106                 return batch<T, A> {};
2107             }
2108 
2109             template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
2110             XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2111             {
2112                 if (n == I)
2113                 {
2114                     return vshlq_n_u8(lhs, I);
2115                 }
2116                 else
2117                 {
2118                     return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2119                 }
2120             }
2121 
2122             template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0>
2123             XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2124             {
2125                 if (n == I)
2126                 {
2127                     return vshlq_n_s8(lhs, I);
2128                 }
2129                 else
2130                 {
2131                     return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2132                 }
2133             }
2134 
2135             template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
2136             XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2137             {
2138                 if (n == I)
2139                 {
2140                     return vshlq_n_u16(lhs, I);
2141                 }
2142                 else
2143                 {
2144                     return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2145                 }
2146             }
2147 
2148             template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0>
2149             XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2150             {
2151                 if (n == I)
2152                 {
2153                     return vshlq_n_s16(lhs, I);
2154                 }
2155                 else
2156                 {
2157                     return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2158                 }
2159             }
2160 
2161             template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
2162             XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2163             {
2164                 if (n == I)
2165                 {
2166                     return vshlq_n_u32(lhs, I);
2167                 }
2168                 else
2169                 {
2170                     return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2171                 }
2172             }
2173 
2174             template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0>
2175             XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2176             {
2177                 if (n == I)
2178                 {
2179                     return vshlq_n_s32(lhs, I);
2180                 }
2181                 else
2182                 {
2183                     return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2184                 }
2185             }
2186 
2187             template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
2188             XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2189             {
2190                 if (n == I)
2191                 {
2192                     return vshlq_n_u64(lhs, I);
2193                 }
2194                 else
2195                 {
2196                     return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2197                 }
2198             }
2199 
2200             template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0>
2201             XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2202             {
2203                 if (n == I)
2204                 {
2205                     return vshlq_n_s64(lhs, I);
2206                 }
2207                 else
2208                 {
2209                     return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2210                 }
2211             }
2212 
2213             template <class A, class T, int... Is>
2214             XSIMD_INLINE batch<T, A> bitwise_lshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
2215             {
2216                 if (n == 0)
2217                 {
2218                     return lhs;
2219                 }
2220                 else
2221                 {
2222                     return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2223                 }
2224             }
2225         }
2226 
2227         template <class A, class T>
2228         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
2229         {
2230             constexpr int size = sizeof(typename batch<T, A>::value_type) * 8;
2231             assert(0 <= n && n < size && "index in bounds");
2232             return detail::bitwise_lshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>());
2233         }
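        // vshlq_n_* only accepts an immediate shift amount, so the runtime n is
        // matched against a compile-time int_sequence above; for instance, shifting a
        // batch of uint32_t lanes all set to 1 by n = 3 ends up calling
        // vshlq_n_u32(lhs, 3) and yields lanes equal to 8.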
2234 
2235         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
2236         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2237         {
2238             return vshlq_u8(lhs, rhs);
2239         }
2240 
2241         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
2242         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2243         {
2244             return vshlq_s8(lhs, rhs);
2245         }
2246 
2247         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
2248         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2249         {
2250             return vshlq_u16(lhs, rhs);
2251         }
2252 
2253         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
2254         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2255         {
2256             return vshlq_s16(lhs, rhs);
2257         }
2258 
2259         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
2260         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2261         {
2262             return vshlq_u32(lhs, rhs);
2263         }
2264 
2265         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
2266         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2267         {
2268             return vshlq_s32(lhs, rhs);
2269         }
2270 
2271         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
2272         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2273         {
2274             return vshlq_u64(lhs, rhs);
2275         }
2276 
2277         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
2278         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2279         {
2280             return vshlq_s64(lhs, rhs);
2281         }
2282 
2283         /******************
2284          * bitwise_rshift *
2285          ******************/
2286 
2287         namespace detail
2288         {
2289             template <class A, class T>
2290             XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept
2291             {
2292                 assert(false && "bitwise_rshift out of bounds");
2293                 return batch<T, A> {};
2294             }
2295 
2296             template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
2297             XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2298             {
2299                 if (n == I)
2300                 {
2301                     return vshrq_n_u8(lhs, I);
2302                 }
2303                 else
2304                 {
2305                     return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2306                 }
2307             }
2308 
2309             template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0>
2310             XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2311             {
2312                 if (n == I)
2313                 {
2314                     return vshrq_n_s8(lhs, I);
2315                 }
2316                 else
2317                 {
2318                     return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2319                 }
2320             }
2321 
2322             template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
2323             XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2324             {
2325                 if (n == I)
2326                 {
2327                     return vshrq_n_u16(lhs, I);
2328                 }
2329                 else
2330                 {
2331                     return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2332                 }
2333             }
2334 
2335             template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0>
2336             XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2337             {
2338                 if (n == I)
2339                 {
2340                     return vshrq_n_s16(lhs, I);
2341                 }
2342                 else
2343                 {
2344                     return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2345                 }
2346             }
2347 
2348             template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
2349             XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2350             {
2351                 if (n == I)
2352                 {
2353                     return vshrq_n_u32(lhs, I);
2354                 }
2355                 else
2356                 {
2357                     return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2358                 }
2359             }
2360 
2361             template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0>
2362             XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2363             {
2364                 if (n == I)
2365                 {
2366                     return vshrq_n_s32(lhs, I);
2367                 }
2368                 else
2369                 {
2370                     return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2371                 }
2372             }
2373 
2374             template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
2375             XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2376             {
2377                 if (n == I)
2378                 {
2379                     return vshrq_n_u64(lhs, I);
2380                 }
2381                 else
2382                 {
2383                     return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2384                 }
2385             }
2386 
2387             template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0>
2388             XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2389             {
2390                 if (n == I)
2391                 {
2392                     return vshrq_n_s64(lhs, I);
2393                 }
2394                 else
2395                 {
2396                     return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2397                 }
2398             }
2399 
2400             template <class A, class T, int... Is>
2401             XSIMD_INLINE batch<T, A> bitwise_rshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
2402             {
2403                 if (n == 0)
2404                 {
2405                     return lhs;
2406                 }
2407                 else
2408                 {
2409                     return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2410                 }
2411             }
2412         }
2413 
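             // vshrq_n_* requires its shift amount to be a compile-time constant, so the
             // helpers above unroll the valid range with int_sequence and select the matching
             // immediate at run time; n == 0 is handled separately in bitwise_rshift_impl
             // because the immediate forms only accept shifts in [1, width].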
2414         template <class A, class T>
2415         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
2416         {
2417             constexpr int size = sizeof(typename batch<T, A>::value_type) * 8;
2418             assert(0 <= n && n < size && "shift amount within bit width");
2419             return detail::bitwise_rshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>());
2420         }
2421 
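             // NEON has no variable right-shift intrinsic, so the per-lane overloads below
             // express a right shift as vshlq_* with the negated count (vnegq_*): a negative
             // per-lane count makes vshlq shift to the right.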
2422         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
2423         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2424         {
2425             return vshlq_u8(lhs, vnegq_s8(rhs));
2426         }
2427 
2428         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
2429         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2430         {
2431             return vshlq_s8(lhs, vnegq_s8(rhs));
2432         }
2433 
2434         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
2435         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2436         {
2437             return vshlq_u16(lhs, vnegq_s16(rhs));
2438         }
2439 
2440         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
2441         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2442         {
2443             return vshlq_s16(lhs, vnegq_s16(rhs));
2444         }
2445 
2446         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
2447         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2448         {
2449             return vshlq_u32(lhs, vnegq_s32(rhs));
2450         }
2451 
2452         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
2453         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2454         {
2455             return vshlq_s32(lhs, vnegq_s32(rhs));
2456         }
2457 
2458         // Overloads of bitwise_rshift taking two batches of uint64/int64 are not available on ARMv7 (vnegq_s64 requires AArch64)
2459 
2460         /*******
2461          * all *
2462          *******/
2463 
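             // batch_bool lanes are either all-zeros or all-ones, so the 64-bit overload ANDs
             // the low and high halves of the register and compares the result against ~0ULL;
             // the narrower widths reinterpret their mask as uint64x2_t and defer to it.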
2464         template <class A, class T, detail::enable_sized_t<T, 8> = 0>
2465         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2466         {
2467             uint64x1_t tmp = vand_u64(vget_low_u64(arg), vget_high_u64(arg));
2468             return vget_lane_u64(tmp, 0) == ~0ULL;
2469         }
2470 
2471         template <class A, class T, detail::enable_sized_t<T, 1> = 0>
2472         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2473         {
2474             return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {});
2475         }
2476 
2477         template <class A, class T, detail::enable_sized_t<T, 2> = 0>
2478         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2479         {
2480             return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {});
2481         }
2482 
2483         template <class A, class T, detail::enable_sized_t<T, 4> = 0>
2484         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2485         {
2486             return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {});
2487         }
2488 
2489         /*******
2490          * any *
2491          *******/
2492 
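             // vqmovn_u64 saturating-narrows each 64-bit lane to 32 bits, so a non-zero lane
             // stays non-zero; the two narrowed lanes are then tested as a single 64-bit
             // scalar. Narrower widths reinterpret their mask as uint64x2_t first.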
2493         template <class A, class T, detail::enable_sized_t<T, 8> = 0>
2494         XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2495         {
2496             uint32x2_t tmp = vqmovn_u64(arg);
2497             return vget_lane_u64(vreinterpret_u64_u32(tmp), 0) != 0;
2498         }
2499 
2500         template <class A, class T, detail::enable_sized_t<T, 1> = 0>
2501         XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2502         {
2503             return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {});
2504         }
2505 
2506         template <class A, class T, detail::enable_sized_t<T, 2> = 0>
2507         XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2508         {
2509             return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {});
2510         }
2511 
2512         template <class A, class T, detail::enable_sized_t<T, 4> = 0>
2513         XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2514         {
2515             return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {});
2516         }
2517 
2518         /****************
2519          * bitwise_cast *
2520          ****************/
2521 
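     // WRAP_CAST turns each vreinterpretq_* intrinsic into an addressable inline function,
     // allowing the bitwise caster below to store and look them up as function pointers.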
2522 #define WRAP_CAST(SUFFIX, TYPE)                                                \
2523     namespace wrap                                                             \
2524     {                                                                          \
2525         XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) noexcept   \
2526         {                                                                      \
2527             return ::vreinterpretq_##SUFFIX##_u8(a);                           \
2528         }                                                                      \
2529         XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) noexcept    \
2530         {                                                                      \
2531             return ::vreinterpretq_##SUFFIX##_s8(a);                           \
2532         }                                                                      \
2533         XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) noexcept  \
2534         {                                                                      \
2535             return ::vreinterpretq_##SUFFIX##_u16(a);                          \
2536         }                                                                      \
2537         XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) noexcept   \
2538         {                                                                      \
2539             return ::vreinterpretq_##SUFFIX##_s16(a);                          \
2540         }                                                                      \
2541         XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) noexcept  \
2542         {                                                                      \
2543             return ::vreinterpretq_##SUFFIX##_u32(a);                          \
2544         }                                                                      \
2545         XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) noexcept   \
2546         {                                                                      \
2547             return ::vreinterpretq_##SUFFIX##_s32(a);                          \
2548         }                                                                      \
2549         XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) noexcept  \
2550         {                                                                      \
2551             return ::vreinterpretq_##SUFFIX##_u64(a);                          \
2552         }                                                                      \
2553         XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) noexcept   \
2554         {                                                                      \
2555             return ::vreinterpretq_##SUFFIX##_s64(a);                          \
2556         }                                                                      \
2557         XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) noexcept \
2558         {                                                                      \
2559             return ::vreinterpretq_##SUFFIX##_f32(a);                          \
2560         }                                                                      \
2561     }
2562 
2563         WRAP_CAST(u8, uint8x16_t)
2564         WRAP_CAST(s8, int8x16_t)
2565         WRAP_CAST(u16, uint16x8_t)
2566         WRAP_CAST(s16, int16x8_t)
2567         WRAP_CAST(u32, uint32x4_t)
2568         WRAP_CAST(s32, int32x4_t)
2569         WRAP_CAST(u64, uint64x2_t)
2570         WRAP_CAST(s64, int64x2_t)
2571         WRAP_CAST(f32, float32x4_t)
2572 
2573 #undef WRAP_CAST
2574 
2575         namespace detail
2576         {
2577             template <class R, class... T>
2578             struct bitwise_caster_impl
2579             {
2580                 using container_type = std::tuple<R (*)(T)...>;
2581                 container_type m_func;
2582 
2583                 template <class U>
2584                 R apply(U rhs) const noexcept
2585                 {
2586                     using func_type = R (*)(U);
2587                     auto func = xsimd::detail::get<func_type>(m_func);
2588                     return func(rhs);
2589                 }
2590             };
2591 
2592             template <class R, class... T>
2593             XSIMD_INLINE const bitwise_caster_impl<R, T...> make_bitwise_caster_impl(R (*... arg)(T)) noexcept
2594             {
2595                 return { std::make_tuple(arg...) };
2596             }
2597 
2598             template <class... T>
2599             struct type_list
2600             {
2601             };
2602 
2603             template <class RTL, class TTL>
2604             struct bitwise_caster;
2605 
2606             template <class... R, class... T>
2607             struct bitwise_caster<type_list<R...>, type_list<T...>>
2608             {
2609                 using container_type = std::tuple<bitwise_caster_impl<R, T...>...>;
2610                 container_type m_caster;
2611 
2612                 template <class V, class U>
2613                 V apply(U rhs) const noexcept
2614                 {
2615                     using caster_type = bitwise_caster_impl<V, T...>;
2616                     auto caster = xsimd::detail::get<caster_type>(m_caster);
2617                     return caster.apply(rhs);
2618                 }
2619             };
2620 
2621             template <class... T>
2622             using bitwise_caster_t = bitwise_caster<type_list<T...>, type_list<T...>>;
2623 
2624             using neon_bitwise_caster = bitwise_caster_t<uint8x16_t, int8x16_t,
2625                                                          uint16x8_t, int16x8_t,
2626                                                          uint32x4_t, int32x4_t,
2627                                                          uint64x2_t, int64x2_t,
2628                                                          float32x4_t>;
2629         }
2630 
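             // neon_bitwise_caster is effectively a 9x9 table of reinterpret functions:
             // apply<V>(U) first selects the row whose destination register type is V, then
             // the entry whose source register type is U, through two tuple lookups.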
2631         template <class A, class T, class R>
2632         XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<neon>) noexcept
2633         {
2634             const detail::neon_bitwise_caster caster = {
2635                 std::make_tuple(
2636                     detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16,
2637                                                      wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64,
2638                                                      wrap::vreinterpretq_u8_f32),
2639                     detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16,
2640                                                      wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64,
2641                                                      wrap::vreinterpretq_s8_f32),
2642                     detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16,
2643                                                      wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64,
2644                                                      wrap::vreinterpretq_u16_f32),
2645                     detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16,
2646                                                      wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64,
2647                                                      wrap::vreinterpretq_s16_f32),
2648                     detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16,
2649                                                      wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64,
2650                                                      wrap::vreinterpretq_u32_f32),
2651                     detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16,
2652                                                      wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64,
2653                                                      wrap::vreinterpretq_s32_f32),
2654                     detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16,
2655                                                      wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64,
2656                                                      wrap::vreinterpretq_u64_f32),
2657                     detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16,
2658                                                      wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64,
2659                                                      wrap::vreinterpretq_s64_f32),
2660                     detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16,
2661                                                      wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64,
2662                                                      wrap::vreinterpretq_f32_f32))
2663             };
2664             using src_register_type = typename batch<T, A>::register_type;
2665             using dst_register_type = typename batch<R, A>::register_type;
2666             return caster.apply<dst_register_type>(src_register_type(arg));
2667         }
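             // Example (sketch): at this kernel level the second argument only carries the
             // destination type, e.g. reinterpreting float lanes as their bit patterns:
             //   batch<float, neon> f = ...;
             //   auto bits = bitwise_cast(f, batch<uint32_t, neon> {}, neon {}); // vreinterpretq_u32_f32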
2668 
2669         /*********
2670          * isnan *
2671          *********/
2672 
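             // IEEE 754: NaN compares unequal to everything, including itself, so (arg == arg)
             // is false exactly in the NaN lanes and negating that mask yields the result.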
2673         template <class A>
2674         XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& arg, requires_arch<neon>) noexcept
2675         {
2676             return !(arg == arg);
2677         }
2678 
2679         // slide_left
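             // vextq_u8(left, right, n) extracts 16 consecutive bytes starting at byte n of
             // the pair {left, right}. With left all zeros and n = 16 - N this moves the
             // register contents up by N bytes and zero-fills the low bytes.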
2680         namespace detail
2681         {
2682             template <size_t N>
2683             struct slider_left
2684             {
2685                 template <class A, class T>
2686                 XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
2687                 {
2688                     const auto left = vdupq_n_u8(0);
2689                     const auto right = bitwise_cast<uint8_t>(x).data;
2690                     const batch<uint8_t, A> res(vextq_u8(left, right, 16 - N));
2691                     return bitwise_cast<T>(res);
2692                 }
2693             };
2694 
2695             template <>
2696             struct slider_left<0>
2697             {
2698                 template <class A, class T>
2699                 XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
2700                 {
2701                     return x;
2702                 }
2703             };
2704         } // namespace detail
2705 
2706         template <size_t N, class A, class T>
2707         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<neon>) noexcept
2708         {
2709             return detail::slider_left<N> {}(x, A {});
2710         }
2711 
2712         // slide_right
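             // Mirror of slide_left: with the zero vector as the second vextq_u8 operand and
             // index N, the low N bytes are dropped and the top is zero-filled. N == 16 is
             // specialized to a zeroed batch because the vextq index must be in [0, 15].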
2713         namespace detail
2714         {
2715             template <size_t N>
2716             struct slider_right
2717             {
2718                 template <class A, class T>
2719                 XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
2720                 {
2721                     const auto left = bitwise_cast<uint8_t>(x).data;
2722                     const auto right = vdupq_n_u8(0);
2723                     const batch<uint8_t, A> res(vextq_u8(left, right, N));
2724                     return bitwise_cast<T>(res);
2725                 }
2726             };
2727 
2728             template <>
2729             struct slider_right<16>
2730             {
2731                 template <class A, class T>
2732                 XSIMD_INLINE batch<T, A> operator()(batch<T, A> const&, requires_arch<neon>) noexcept
2733                 {
2734                     return batch<T, A> {};
2735                 }
2736             };
2737         } // namespace detail
2738 
2739         template <size_t N, class A, class T>
2740         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<neon>) noexcept
2741         {
2742             return detail::slider_right<N> {}(x, A {});
2743         }
2744 
2745         /***************
2746          * rotate_left *
2747          ***************/
2748         namespace wrap
2749         {
2750             template <size_t N>
2751             XSIMD_INLINE uint8x16_t rotate_left_u8(uint8x16_t a, uint8x16_t b) noexcept { return vextq_u8(a, b, N); }
2752             template <size_t N>
2753             XSIMD_INLINE int8x16_t rotate_left_s8(int8x16_t a, int8x16_t b) noexcept { return vextq_s8(a, b, N); }
2754             template <size_t N>
2755             XSIMD_INLINE uint16x8_t rotate_left_u16(uint16x8_t a, uint16x8_t b) noexcept { return vextq_u16(a, b, N); }
2756             template <size_t N>
2757             XSIMD_INLINE int16x8_t rotate_left_s16(int16x8_t a, int16x8_t b) noexcept { return vextq_s16(a, b, N); }
2758             template <size_t N>
2759             XSIMD_INLINE uint32x4_t rotate_left_u32(uint32x4_t a, uint32x4_t b) noexcept { return vextq_u32(a, b, N); }
2760             template <size_t N>
2761             XSIMD_INLINE int32x4_t rotate_left_s32(int32x4_t a, int32x4_t b) noexcept { return vextq_s32(a, b, N); }
2762             template <size_t N>
2763             XSIMD_INLINE uint64x2_t rotate_left_u64(uint64x2_t a, uint64x2_t b) noexcept { return vextq_u64(a, b, N); }
2764             template <size_t N>
2765             XSIMD_INLINE int64x2_t rotate_left_s64(int64x2_t a, int64x2_t b) noexcept { return vextq_s64(a, b, N); }
2766             template <size_t N>
2767             XSIMD_INLINE float32x4_t rotate_left_f32(float32x4_t a, float32x4_t b) noexcept { return vextq_f32(a, b, N); }
2768         }
2769 
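             // A rotation by N lanes is vextq applied to the vector concatenated with itself,
             // hence the dispatcher below is invoked with the same register for both operands.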
2770         template <size_t N, class A, class T, detail::enable_neon_type_t<T> = 0>
2771         XSIMD_INLINE batch<T, A> rotate_left(batch<T, A> const& a, requires_arch<neon>) noexcept
2772         {
2773             using register_type = typename batch<T, A>::register_type;
2774             const detail::neon_dispatcher::binary dispatcher = {
2775                 std::make_tuple(wrap::rotate_left_u8<N>, wrap::rotate_left_s8<N>, wrap::rotate_left_u16<N>, wrap::rotate_left_s16<N>,
2776                                 wrap::rotate_left_u32<N>, wrap::rotate_left_s32<N>, wrap::rotate_left_u64<N>, wrap::rotate_left_s64<N>,
2777                                 wrap::rotate_left_f32<N>)
2778             };
2779             return dispatcher.apply(register_type(a), register_type(a));
2780         }
2781     }
2782 
2783     template <typename T, class A, T... Values>
2784     struct batch_constant;
2785 
2786     namespace kernel
2787     {
2788         /***********
2789          * swizzle *
2790          ***********/
2791 
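             // Generic fallback that goes through memory: the lanes are stored to a stack
             // array and the batch is rebuilt by expanding the compile-time index pack.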
2792         template <class A, class T, class I, I... idx>
2793         XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self,
2794                                          batch_constant<I, A, idx...>,
2795                                          requires_arch<neon>) noexcept
2796         {
2797             static_assert(batch<T, A>::size == sizeof...(idx), "valid swizzle indices");
2798             std::array<T, batch<T, A>::size> data;
2799             self.store_aligned(data.data());
2800             return set(batch<T, A>(), A(), data[idx]...);
2801         }
2802     }
2803 
2804 }
2805 
2806 #undef WRAP_BINARY_INT_EXCLUDING_64
2807 #undef WRAP_BINARY_INT
2808 #undef WRAP_BINARY_FLOAT
2809 #undef WRAP_UNARY_INT_EXCLUDING_64
2810 #undef WRAP_UNARY_INT
2811 #undef WRAP_UNARY_FLOAT
2812 
2813 #endif