0012 #ifndef XSIMD_NEON_HPP
0013 #define XSIMD_NEON_HPP
0014
0015 #include <algorithm>
0016 #include <complex>
0017 #include <tuple>
0018 #include <type_traits>
0019
0020 #include "../types/xsimd_neon_register.hpp"
0021 #include "../types/xsimd_utils.hpp"
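
// The NEON intrinsics below are wrapped in plain inline functions (namespace
// wrap) so they can be stored in the std::tuple-based dispatchers defined
// further down: on some toolchains the intrinsics are macros or builtins and
// cannot be used directly as function pointers.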
0026 #define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \
0027 namespace wrap \
0028 { \
0029 XSIMD_INLINE RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \
0030 { \
0031 return ::OP##_u8(a, b); \
0032 } \
0033 XSIMD_INLINE RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \
0034 { \
0035 return ::OP##_u16(a, b); \
0036 } \
0037 XSIMD_INLINE RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \
0038 { \
0039 return ::OP##_u32(a, b); \
0040 } \
0041 }
0042
0043 #define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
0044 WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \
0045 namespace wrap \
0046 { \
0047 XSIMD_INLINE RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \
0048 { \
0049 return ::OP##_s8(a, b); \
0050 } \
0051 XSIMD_INLINE RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \
0052 { \
0053 return ::OP##_s16(a, b); \
0054 } \
0055 XSIMD_INLINE RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \
0056 { \
0057 return ::OP##_s32(a, b); \
0058 } \
0059 }
0060
0061 #define WRAP_BINARY_INT(OP, RT) \
0062 WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
0063 namespace wrap \
0064 { \
0065 XSIMD_INLINE RT<uint64x2_t> OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \
0066 { \
0067 return ::OP##_u64(a, b); \
0068 } \
0069 XSIMD_INLINE RT<int64x2_t> OP##_s64(int64x2_t a, int64x2_t b) noexcept \
0070 { \
0071 return ::OP##_s64(a, b); \
0072 } \
0073 }
0074
0075 #define WRAP_BINARY_FLOAT(OP, RT) \
0076 namespace wrap \
0077 { \
0078 XSIMD_INLINE RT<float32x4_t> OP##_f32(float32x4_t a, float32x4_t b) noexcept \
0079 { \
0080 return ::OP##_f32(a, b); \
0081 } \
0082 }
0083
0084 #define WRAP_UNARY_INT_EXCLUDING_64(OP) \
0085 namespace wrap \
0086 { \
0087 XSIMD_INLINE uint8x16_t OP##_u8(uint8x16_t a) noexcept \
0088 { \
0089 return ::OP##_u8(a); \
0090 } \
0091 XSIMD_INLINE int8x16_t OP##_s8(int8x16_t a) noexcept \
0092 { \
0093 return ::OP##_s8(a); \
0094 } \
0095 XSIMD_INLINE uint16x8_t OP##_u16(uint16x8_t a) noexcept \
0096 { \
0097 return ::OP##_u16(a); \
0098 } \
0099 XSIMD_INLINE int16x8_t OP##_s16(int16x8_t a) noexcept \
0100 { \
0101 return ::OP##_s16(a); \
0102 } \
0103 XSIMD_INLINE uint32x4_t OP##_u32(uint32x4_t a) noexcept \
0104 { \
0105 return ::OP##_u32(a); \
0106 } \
0107 XSIMD_INLINE int32x4_t OP##_s32(int32x4_t a) noexcept \
0108 { \
0109 return ::OP##_s32(a); \
0110 } \
0111 }
0112
0113 #define WRAP_UNARY_INT(OP) \
0114 WRAP_UNARY_INT_EXCLUDING_64(OP) \
0115 namespace wrap \
0116 { \
0117 XSIMD_INLINE uint64x2_t OP##_u64(uint64x2_t a) noexcept \
0118 { \
0119 return ::OP##_u64(a); \
0120 } \
0121 XSIMD_INLINE int64x2_t OP##_s64(int64x2_t a) noexcept \
0122 { \
0123 return ::OP##_s64(a); \
0124 } \
0125 }
0126
0127 #define WRAP_UNARY_FLOAT(OP) \
0128 namespace wrap \
0129 { \
0130 XSIMD_INLINE float32x4_t OP##_f32(float32x4_t a) noexcept \
0131 { \
0132 return ::OP##_f32(a); \
0133 } \
0134 }
0135
0136
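// Identity reinterprets: same-type vreinterpretq overloads so that generic
// code can reinterpret between register types uniformly, including the
// trivial case where source and destination types coincide.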
0137 XSIMD_INLINE uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; }
0138 XSIMD_INLINE int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; }
0139 XSIMD_INLINE uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; }
0140 XSIMD_INLINE int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; }
0141 XSIMD_INLINE uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; }
0142 XSIMD_INLINE int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; }
0143 XSIMD_INLINE uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; }
0144 XSIMD_INLINE int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; }
0145 XSIMD_INLINE float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; }
0146
0147 namespace xsimd
0148 {
0149 template <typename T, class A, bool... Values>
0150 struct batch_bool_constant;
0151
0152 namespace kernel
0153 {
0154 using namespace types;
0155
0156 namespace detail
0157 {
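// neon_dispatcher_base stores one function pointer per register type in a
// tuple; apply() picks the entry whose signature matches the argument type,
// so a single kernel can forward to the right intrinsic for each type.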
0158 template <template <class> class return_type, class... T>
0159 struct neon_dispatcher_base
0160 {
0161 struct unary
0162 {
0163 using container_type = std::tuple<return_type<T> (*)(T)...>;
0164 const container_type m_func;
0165
0166 template <class U>
0167 return_type<U> apply(U rhs) const noexcept
0168 {
0169 using func_type = return_type<U> (*)(U);
0170 auto func = xsimd::detail::get<func_type>(m_func);
0171 return func(rhs);
0172 }
0173 };
0174
0175 struct binary
0176 {
0177 using container_type = std::tuple<return_type<T> (*)(T, T)...>;
0178 const container_type m_func;
0179
0180 template <class U>
0181 return_type<U> apply(U lhs, U rhs) const noexcept
0182 {
0183 using func_type = return_type<U> (*)(U, U);
0184 auto func = xsimd::detail::get<func_type>(m_func);
0185 return func(lhs, rhs);
0186 }
0187 };
0188 };
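
// Dispatchers over the identity return type: the full NEON register set,
// the set without the 64-bit integer registers, and the set without both
// 64-bit integers and float.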
0194 template <class T>
0195 using identity_return_type = T;
0196
0197 template <class... T>
0198 struct neon_dispatcher_impl : neon_dispatcher_base<identity_return_type, T...>
0199 {
0200 };
0201
0202 using neon_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
0203 uint16x8_t, int16x8_t,
0204 uint32x4_t, int32x4_t,
0205 uint64x2_t, int64x2_t,
0206 float32x4_t>;
0207
0208 using excluding_int64_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
0209 uint16x8_t, int16x8_t,
0210 uint32x4_t, int32x4_t,
0211 float32x4_t>;
0212
0213 using excluding_int64f32_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
0214 uint16x8_t, int16x8_t,
0215 uint32x4_t, int32x4_t>;
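
// NEON comparisons return an unsigned mask register of the same width as
// their operands; comp_return_type_impl maps each register type to that
// mask type.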
0221 template <class T>
0222 struct comp_return_type_impl;
0223
0224 template <>
0225 struct comp_return_type_impl<uint8x16_t>
0226 {
0227 using type = uint8x16_t;
0228 };
0229
0230 template <>
0231 struct comp_return_type_impl<int8x16_t>
0232 {
0233 using type = uint8x16_t;
0234 };
0235
0236 template <>
0237 struct comp_return_type_impl<uint16x8_t>
0238 {
0239 using type = uint16x8_t;
0240 };
0241
0242 template <>
0243 struct comp_return_type_impl<int16x8_t>
0244 {
0245 using type = uint16x8_t;
0246 };
0247
0248 template <>
0249 struct comp_return_type_impl<uint32x4_t>
0250 {
0251 using type = uint32x4_t;
0252 };
0253
0254 template <>
0255 struct comp_return_type_impl<int32x4_t>
0256 {
0257 using type = uint32x4_t;
0258 };
0259
0260 template <>
0261 struct comp_return_type_impl<uint64x2_t>
0262 {
0263 using type = uint64x2_t;
0264 };
0265
0266 template <>
0267 struct comp_return_type_impl<int64x2_t>
0268 {
0269 using type = uint64x2_t;
0270 };
0271
0272 template <>
0273 struct comp_return_type_impl<float32x4_t>
0274 {
0275 using type = uint32x4_t;
0276 };
0277
0278 template <class T>
0279 using comp_return_type = typename comp_return_type_impl<T>::type;
0280
0281 template <class... T>
0282 struct neon_comp_dispatcher_impl : neon_dispatcher_base<comp_return_type, T...>
0283 {
0284 };
0285
0286 using excluding_int64_comp_dispatcher = neon_comp_dispatcher_impl<uint8x16_t, int8x16_t,
0287 uint16x8_t, int16x8_t,
0288 uint32x4_t, int32x4_t,
0289 float32x4_t>;
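
// SFINAE helpers: enable_neon_type_t accepts any integral type or float,
// while exclude_int64_neon_t additionally rejects 64-bit integers, for which
// several NEON intrinsics do not exist.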
0295 template <class T>
0296 using enable_neon_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value,
0297 int>::type;
0298
0299 template <class T>
0300 using exclude_int64_neon_t
0301 = typename std::enable_if<(std::is_integral<T>::value && sizeof(T) != 8) || std::is_same<T, float>::value, int>::type;
0302 }

// broadcast

0308 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
0309 XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0310 {
0311 return vdupq_n_u8(uint8_t(val));
0312 }
0313
0314 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
0315 XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0316 {
0317 return vdupq_n_s8(int8_t(val));
0318 }
0319
0320 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
0321 XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0322 {
0323 return vdupq_n_u16(uint16_t(val));
0324 }
0325
0326 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
0327 XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0328 {
0329 return vdupq_n_s16(int16_t(val));
0330 }
0331
0332 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0333 XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0334 {
0335 return vdupq_n_u32(uint32_t(val));
0336 }
0337
0338 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0339 XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0340 {
0341 return vdupq_n_s32(int32_t(val));
0342 }
0343
0344 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0345 XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0346 {
0347 return vdupq_n_u64(uint64_t(val));
0348 }
0349
0350 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0351 XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
0352 {
0353 return vdupq_n_s64(int64_t(val));
0354 }
0355
0356 template <class A>
0357 XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<neon>) noexcept
0358 {
0359 return vdupq_n_f32(val);
0360 }

// set

0366 template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
0367 XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<neon>, Args... args) noexcept
0368 {
0369 return xsimd::types::detail::neon_vector_type<T> { args... };
0370 }
0371
0372 template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
0373 XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... args) noexcept
0374 {
0375 using register_type = typename batch_bool<T, A>::register_type;
0376 using unsigned_type = as_unsigned_integer_t<T>;
0377 return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
0378 }
0379
0380 template <class A>
0381 XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<neon>, float f0, float f1, float f2, float f3) noexcept
0382 {
0383 return float32x4_t { f0, f1, f2, f3 };
0384 }
0385
0386 template <class A>
0387 XSIMD_INLINE batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>,
0388 std::complex<float> c0, std::complex<float> c1,
0389 std::complex<float> c2, std::complex<float> c3) noexcept
0390 {
0391 return batch<std::complex<float>, A>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() },
0392 float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() });
0393 }
0394
0395 template <class A, class... Args>
0396 XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... args) noexcept
0397 {
0398 using register_type = typename batch_bool<float, A>::register_type;
0399 using unsigned_type = as_unsigned_integer_t<float>;
0400 return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
0401 }

// from_bool

0407 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
0408 XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0409 {
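// A batch_bool lane is all ones when true, so masking with 1 yields a
// numeric batch of 0s and 1s.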
0410 return vandq_u8(arg, vdupq_n_u8(1));
0411 }
0412
0413 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
0414 XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0415 {
0416 return vandq_s8(reinterpret_cast<int8x16_t>(arg.data), vdupq_n_s8(1));
0417 }
0418
0419 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
0420 XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0421 {
0422 return vandq_u16(arg, vdupq_n_u16(1));
0423 }
0424
0425 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
0426 XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0427 {
0428 return vandq_s16(reinterpret_cast<int16x8_t>(arg.data), vdupq_n_s16(1));
0429 }
0430
0431 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0432 XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0433 {
0434 return vandq_u32(arg, vdupq_n_u32(1));
0435 }
0436
0437 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0438 XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0439 {
0440 return vandq_s32(reinterpret_cast<int32x4_t>(arg.data), vdupq_n_s32(1));
0441 }
0442
0443 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0444 XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0445 {
0446 return vandq_u64(arg, vdupq_n_u64(1));
0447 }
0448
0449 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0450 XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
0451 {
0452 return vandq_s64(reinterpret_cast<int64x2_t>(arg.data), vdupq_n_s64(1));
0453 }
0454
0455 template <class A>
0456 XSIMD_INLINE batch<float, A> from_bool(batch_bool<float, A> const& arg, requires_arch<neon>) noexcept
0457 {
0458 return vreinterpretq_f32_u32(vandq_u32(arg, vreinterpretq_u32_f32(vdupq_n_f32(1.f))));
0459 }

// load

0467 #if defined(__clang__) || defined(__GNUC__)
0468 #define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
0469 #elif defined(_MSC_VER)
0470 #define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
0471 #else
0472 #define xsimd_aligned_load(inst, type, expr) inst((type)expr)
0473 #endif
0474
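// The aligned overloads below use xsimd_aligned_load so the compiler may
// assume a 16-byte aligned address (or, on MSVC, use the *_ex variants of
// the load intrinsics).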
0475 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
0476 XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0477 {
0478 return xsimd_aligned_load(vld1q_u8, uint8_t*, src);
0479 }
0480
0481 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
0482 XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0483 {
0484 return xsimd_aligned_load(vld1q_s8, int8_t*, src);
0485 }
0486
0487 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
0488 XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0489 {
0490 return xsimd_aligned_load(vld1q_u16, uint16_t*, src);
0491 }
0492 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
0493 XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0494 {
0495 return xsimd_aligned_load(vld1q_s16, int16_t*, src);
0496 }
0497 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0498 XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0499 {
0500 return xsimd_aligned_load(vld1q_u32, uint32_t*, src);
0501 }
0502 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0503 XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0504 {
0505 return xsimd_aligned_load(vld1q_s32, int32_t*, src);
0506 }
0507 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0508 XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0509 {
0510 return xsimd_aligned_load(vld1q_u64, uint64_t*, src);
0511 }
0512 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0513 XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0514 {
0515 return xsimd_aligned_load(vld1q_s64, int64_t*, src);
0516 }
0517
0518 template <class A>
0519 XSIMD_INLINE batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept
0520 {
0521 return xsimd_aligned_load(vld1q_f32, float*, src);
0522 }
0523
0524 #undef xsimd_aligned_load
0525
0526 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
0527 XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0528 {
0529 return vld1q_u8((uint8_t*)src);
0530 }
0531
0532 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
0533 XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0534 {
0535 return vld1q_s8((int8_t*)src);
0536 }
0537
0538 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
0539 XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0540 {
0541 return vld1q_u16((uint16_t*)src);
0542 }
0543 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
0544 XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0545 {
0546 return vld1q_s16((int16_t*)src);
0547 }
0548 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0549 XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0550 {
0551 return vld1q_u32((uint32_t*)src);
0552 }
0553 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0554 XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0555 {
0556 return vld1q_s32((int32_t*)src);
0557 }
0558 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0559 XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0560 {
0561 return vld1q_u64((uint64_t*)src);
0562 }
0563 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0564 XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
0565 {
0566 return vld1q_s64((int64_t*)src);
0567 }
0568
0569 template <class A>
0570 XSIMD_INLINE batch<float, A> load_unaligned(float const* src, convert<float>, requires_arch<neon>) noexcept
0571 {
0572 return vld1q_f32(src);
0573 }

// store

0579 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
0580 XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0581 {
0582 vst1q_u8((uint8_t*)dst, src);
0583 }
0584
0585 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
0586 XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0587 {
0588 vst1q_s8((int8_t*)dst, src);
0589 }
0590
0591 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
0592 XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0593 {
0594 vst1q_u16((uint16_t*)dst, src);
0595 }
0596
0597 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
0598 XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0599 {
0600 vst1q_s16((int16_t*)dst, src);
0601 }
0602
0603 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0604 XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0605 {
0606 vst1q_u32((uint32_t*)dst, src);
0607 }
0608
0609 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0610 XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0611 {
0612 vst1q_s32((int32_t*)dst, src);
0613 }
0614
0615 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0616 XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0617 {
0618 vst1q_u64((uint64_t*)dst, src);
0619 }
0620
0621 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0622 XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0623 {
0624 vst1q_s64((int64_t*)dst, src);
0625 }
0626
0627 template <class A>
0628 XSIMD_INLINE void store_aligned(float* dst, batch<float, A> const& src, requires_arch<neon>) noexcept
0629 {
0630 vst1q_f32(dst, src);
0631 }
0632
0633 template <class A, class T>
0634 XSIMD_INLINE void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
0635 {
0636 store_aligned<A>(dst, src, A {});
0637 }

// load_complex

0643 template <class A>
0644 XSIMD_INLINE batch<std::complex<float>, A> load_complex_aligned(std::complex<float> const* mem, convert<std::complex<float>>, requires_arch<neon>) noexcept
0645 {
0646 using real_batch = batch<float, A>;
0647 const float* buf = reinterpret_cast<const float*>(mem);
0648 float32x4x2_t tmp = vld2q_f32(buf);
0649 real_batch real = tmp.val[0],
0650 imag = tmp.val[1];
0651 return batch<std::complex<float>, A> { real, imag };
0652 }
0653
0654 template <class A>
0655 XSIMD_INLINE batch<std::complex<float>, A> load_complex_unaligned(std::complex<float> const* mem, convert<std::complex<float>> cvt, requires_arch<neon>) noexcept
0656 {
0657 return load_complex_aligned<A>(mem, cvt, A {});
0658 }

// store_complex

0664 template <class A>
0665 XSIMD_INLINE void store_complex_aligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
0666 {
0667 float32x4x2_t tmp;
0668 tmp.val[0] = src.real();
0669 tmp.val[1] = src.imag();
0670 float* buf = reinterpret_cast<float*>(dst);
0671 vst2q_f32(buf, tmp);
0672 }
0673
0674 template <class A>
0675 XSIMD_INLINE void store_complex_unaligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
0676 {
0677 store_complex_aligned(dst, src, A {});
0678 }

// neg

0684 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
0685 XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0686 {
0687 return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(rhs)));
0688 }
0689
0690 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
0691 XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0692 {
0693 return vnegq_s8(rhs);
0694 }
0695
0696 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
0697 XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0698 {
0699 return vreinterpretq_u16_s16(vnegq_s16(vreinterpretq_s16_u16(rhs)));
0700 }
0701
0702 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
0703 XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0704 {
0705 return vnegq_s16(rhs);
0706 }
0707
0708 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0709 XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0710 {
0711 return vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(rhs)));
0712 }
0713
0714 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0715 XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0716 {
0717 return vnegq_s32(rhs);
0718 }
0719
0720 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
0721 XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0722 {
0723 return batch<T, A> { -rhs.get(0), -rhs.get(1) };
0724 }
0725
0726 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
0727 XSIMD_INLINE batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
0728 {
0729 return batch<T, A> { -rhs.get(0), -rhs.get(1) };
0730 }
0731
0732 template <class A>
0733 XSIMD_INLINE batch<float, A> neg(batch<float, A> const& rhs, requires_arch<neon>) noexcept
0734 {
0735 return vnegq_f32(rhs);
0736 }

// add

0742 WRAP_BINARY_INT(vaddq, detail::identity_return_type)
0743 WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type)
0744
0745 template <class A, class T, detail::enable_neon_type_t<T> = 0>
0746 XSIMD_INLINE batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0747 {
0748 using register_type = typename batch<T, A>::register_type;
0749 const detail::neon_dispatcher::binary dispatcher = {
0750 std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16,
0751 wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64,
0752 wrap::vaddq_f32)
0753 };
0754 return dispatcher.apply(register_type(lhs), register_type(rhs));
0755 }

// avg

0761 WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type)
0762
0763 template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type>
0764 XSIMD_INLINE batch<T, A> avg(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0765 {
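// vhaddq computes (a + b) >> 1 element-wise; the intermediate sum is widened
// internally, so it cannot overflow.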
0766 using register_type = typename batch<T, A>::register_type;
0767 const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
0768 std::make_tuple(wrap::vhaddq_u8, wrap::vhaddq_u16, wrap::vhaddq_u32)
0769 };
0770 return dispatcher.apply(register_type(lhs), register_type(rhs));
0771 }

// avgr

0777 WRAP_BINARY_UINT_EXCLUDING_64(vrhaddq, detail::identity_return_type)
0778
0779 template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type>
0780 XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0781 {
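// vrhaddq is the rounding variant: (a + b + 1) >> 1 element-wise.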
0782 using register_type = typename batch<T, A>::register_type;
0783 const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
0784 std::make_tuple(wrap::vrhaddq_u8, wrap::vrhaddq_u16, wrap::vrhaddq_u32)
0785 };
0786 return dispatcher.apply(register_type(lhs), register_type(rhs));
0787 }

// sadd

0793 WRAP_BINARY_INT(vqaddq, detail::identity_return_type)
0794
0795 template <class A, class T, detail::enable_neon_type_t<T> = 0>
0796 XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0797 {
0798 using register_type = typename batch<T, A>::register_type;
0799 const detail::neon_dispatcher::binary dispatcher = {
0800 std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16,
0801 wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64,
0802 wrap::vaddq_f32)
0803 };
0804 return dispatcher.apply(register_type(lhs), register_type(rhs));
0805 }

// sub

0811 WRAP_BINARY_INT(vsubq, detail::identity_return_type)
0812 WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type)
0813
0814 template <class A, class T, detail::enable_neon_type_t<T> = 0>
0815 XSIMD_INLINE batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0816 {
0817 using register_type = typename batch<T, A>::register_type;
0818 const detail::neon_dispatcher::binary dispatcher = {
0819 std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16,
0820 wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64,
0821 wrap::vsubq_f32)
0822 };
0823 return dispatcher.apply(register_type(lhs), register_type(rhs));
0824 }

// ssub

0830 WRAP_BINARY_INT(vqsubq, detail::identity_return_type)
0831
0832 template <class A, class T, detail::enable_neon_type_t<T> = 0>
0833 XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0834 {
0835 using register_type = typename batch<T, A>::register_type;
0836 const detail::neon_dispatcher::binary dispatcher = {
0837 std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16,
0838 wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64,
0839 wrap::vsubq_f32)
0840 };
0841 return dispatcher.apply(register_type(lhs), register_type(rhs));
0842 }

// mul

0848 WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type)
0849 WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type)
0850
0851 template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
0852 XSIMD_INLINE batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0853 {
0854 using register_type = typename batch<T, A>::register_type;
0855 const detail::excluding_int64_dispatcher::binary dispatcher = {
0856 std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16,
0857 wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32)
0858 };
0859 return dispatcher.apply(register_type(lhs), register_type(rhs));
0860 }

// div

0866 #if defined(XSIMD_FAST_INTEGER_DIVISION)
0867 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
0868 XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0869 {
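// Fast but potentially inexact: operands are routed through float32, whose
// 24-bit mantissa cannot represent every 32-bit integer.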
0870 return vcvtq_s32_f32(vcvtq_f32_s32(lhs) / vcvtq_f32_s32(rhs));
0871 }
0872
0873 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
0874 XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0875 {
0876 return vcvtq_u32_f32(vcvtq_f32_u32(lhs) / vcvtq_f32_u32(rhs));
0877 }
0878 #endif
0879
0880 template <class A>
0881 XSIMD_INLINE batch<float, A> div(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
0882 {
// Get an initial estimate of 1/rhs.
float32x4_t rcp = reciprocal(rhs);

// Refine the estimate with two Newton-Raphson steps (vrecpsq_f32 returns the
// correction factor); one step may be enough for lower accuracy requirements.
rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp);
rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp);

// Finally, lhs / rhs == lhs * (1 / rhs).
return vmulq_f32(lhs, rcp);
0895 }

// eq

0901 WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type)
0902 WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type)
0903
0904 template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
0905 XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0906 {
0907 using register_type = typename batch<T, A>::register_type;
0908 const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
0909 std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16,
0910 wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32)
0911 };
0912 return dispatcher.apply(register_type(lhs), register_type(rhs));
0913 }
0914
0915 template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
0916 XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
0917 {
0918 using register_type = typename batch_bool<T, A>::register_type;
0919 using dispatcher_type = detail::neon_comp_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary;
0920 const dispatcher_type dispatcher = {
0921 std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32)
0922 };
0923 return dispatcher.apply(register_type(lhs), register_type(rhs));
0924 }
0925
0926 template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
0927 XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0928 {
0929 return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
0930 }
0931
0932 template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
0933 XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
0934 {
0935 return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
0936 }

// fast_cast

0942 namespace detail
0943 {
0944 template <class A>
0945 XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
0946 {
0947 return vcvtq_f32_s32(self);
0948 }
0949
0950 template <class A>
0951 XSIMD_INLINE batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
0952 {
0953 return vcvtq_f32_u32(self);
0954 }
0955
0956 template <class A>
0957 XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<neon>) noexcept
0958 {
0959 return vcvtq_s32_f32(self);
0960 }
0961
0962 template <class A>
0963 XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<neon>) noexcept
0964 {
0965 return vcvtq_u32_f32(self);
0966 }
0967
0968 }

// lt

0974 WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type)
0975 WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type)
0976
0977 template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
0978 XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0979 {
0980 using register_type = typename batch<T, A>::register_type;
0981 const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
0982 std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16,
0983 wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32)
0984 };
0985 return dispatcher.apply(register_type(lhs), register_type(rhs));
0986 }
0987
0988 template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
0989 XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
0990 {
0991 return batch_bool<T, A>({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) });
0992 }

// le

0998 WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type)
0999 WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type)
1000
1001 template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1002 XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1003 {
1004 using register_type = typename batch<T, A>::register_type;
1005 const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
1006 std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16,
1007 wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32)
1008 };
1009 return dispatcher.apply(register_type(lhs), register_type(rhs));
1010 }
1011
1012 template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1013 XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1014 {
1015 return batch_bool<T, A>({ lhs.get(0) <= rhs.get(0), lhs.get(1) <= rhs.get(1) });
1016 }

// gt

1022 WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type)
1023 WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type)
1024
1025 template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1026 XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1027 {
1028 using register_type = typename batch<T, A>::register_type;
1029 const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
1030 std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16,
1031 wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32)
1032 };
1033 return dispatcher.apply(register_type(lhs), register_type(rhs));
1034 }
1035
1036 template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1037 XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1038 {
1039 return batch_bool<T, A>({ lhs.get(0) > rhs.get(0), lhs.get(1) > rhs.get(1) });
1040 }

// ge

1046 WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type)
1047 WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type)
1048
1049 template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1050 XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1051 {
1052 using register_type = typename batch<T, A>::register_type;
1053 const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
1054 std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16,
1055 wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32)
1056 };
1057 return dispatcher.apply(register_type(lhs), register_type(rhs));
1058 }
1059
1060 template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1061 XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1062 {
1063 return batch_bool<T, A>({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) });
1064 }

// batch_bool_cast

1070 template <class A, class T_out, class T_in>
1071 XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon>) noexcept
1072 {
1073 using register_type = typename batch_bool<T_out, A>::register_type;
1074 return register_type(self);
1075 }

// bitwise_and

1081 WRAP_BINARY_INT(vandq, detail::identity_return_type)
1082
1083 namespace detail
1084 {
1085 XSIMD_INLINE float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) noexcept
1086 {
1087 return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs),
1088 vreinterpretq_u32_f32(rhs)));
1089 }
1090
1091 template <class V>
1092 V bitwise_and_neon(V const& lhs, V const& rhs)
1093 {
1094 const neon_dispatcher::binary dispatcher = {
1095 std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16,
1096 wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64,
1097 bitwise_and_f32)
1098 };
1099 return dispatcher.apply(lhs, rhs);
1100 }
1101 }
1102
1103 template <class A, class T, detail::enable_neon_type_t<T> = 0>
1104 XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1105 {
1106 using register_type = typename batch<T, A>::register_type;
1107 return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
1108 }
1109
1110 template <class A, class T, detail::enable_neon_type_t<T> = 0>
1111 XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1112 {
1113 using register_type = typename batch_bool<T, A>::register_type;
1114 return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
1115 }

// bitwise_or

1121 WRAP_BINARY_INT(vorrq, detail::identity_return_type)
1122
1123 namespace detail
1124 {
1125 XSIMD_INLINE float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept
1126 {
1127 return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs),
1128 vreinterpretq_u32_f32(rhs)));
1129 }
1130
1131 template <class V>
1132 XSIMD_INLINE V bitwise_or_neon(V const& lhs, V const& rhs) noexcept
1133 {
1134 const neon_dispatcher::binary dispatcher = {
1135 std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16,
1136 wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64,
1137 bitwise_or_f32)
1138 };
1139 return dispatcher.apply(lhs, rhs);
1140 }
1141 }
1142
1143 template <class A, class T, detail::enable_neon_type_t<T> = 0>
1144 XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1145 {
1146 using register_type = typename batch<T, A>::register_type;
1147 return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
1148 }
1149
1150 template <class A, class T, detail::enable_neon_type_t<T> = 0>
1151 XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1152 {
1153 using register_type = typename batch_bool<T, A>::register_type;
1154 return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
1155 }

// bitwise_xor

1161 WRAP_BINARY_INT(veorq, detail::identity_return_type)
1162
1163 namespace detail
1164 {
1165 XSIMD_INLINE float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept
1166 {
1167 return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(lhs),
1168 vreinterpretq_u32_f32(rhs)));
1169 }
1170
1171 template <class V>
1172 XSIMD_INLINE V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept
1173 {
1174 const neon_dispatcher::binary dispatcher = {
1175 std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16,
1176 wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64,
1177 bitwise_xor_f32)
1178 };
1179 return dispatcher.apply(lhs, rhs);
1180 }
1181 }
1182
1183 template <class A, class T, detail::enable_neon_type_t<T> = 0>
1184 XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1185 {
1186 using register_type = typename batch<T, A>::register_type;
1187 return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
1188 }
1189
1190 template <class A, class T, detail::enable_neon_type_t<T> = 0>
1191 XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1192 {
1193 using register_type = typename batch_bool<T, A>::register_type;
1194 return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
1195 }

// neq

1201 template <class A, class T>
1202 XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1203 {
1204 return bitwise_xor(lhs, rhs, A {});
1205 }

// bitwise_not

1211 WRAP_UNARY_INT_EXCLUDING_64(vmvnq)
1212
1213 namespace detail
1214 {
1215 XSIMD_INLINE int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
1216 {
1217 return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
1218 }
1219
1220 XSIMD_INLINE uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
1221 {
1222 return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
1223 }
1224
1225 XSIMD_INLINE float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
1226 {
1227 return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
1228 }
1229
1230 template <class V>
1231 XSIMD_INLINE V bitwise_not_neon(V const& arg) noexcept
1232 {
1233 const neon_dispatcher::unary dispatcher = {
1234 std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
1235 wrap::vmvnq_u32, wrap::vmvnq_s32,
1236 bitwise_not_u64, bitwise_not_s64,
1237 bitwise_not_f32)
1238 };
1239 return dispatcher.apply(arg);
1240 }
1241 }
1242
1243 template <class A, class T, detail::enable_neon_type_t<T> = 0>
1244 XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept
1245 {
1246 using register_type = typename batch<T, A>::register_type;
1247 return detail::bitwise_not_neon(register_type(arg));
1248 }
1249
1250 template <class A, class T, detail::enable_neon_type_t<T> = 0>
1251 XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
1252 {
1253 using register_type = typename batch_bool<T, A>::register_type;
1254 return detail::bitwise_not_neon(register_type(arg));
1255 }

// bitwise_andnot

1261 WRAP_BINARY_INT(vbicq, detail::identity_return_type)
1262
1263 namespace detail
1264 {
1265 XSIMD_INLINE float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept
1266 {
1267 return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs)));
1268 }
1269
1270 template <class V>
1271 XSIMD_INLINE V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept
1272 {
1273 const detail::neon_dispatcher::binary dispatcher = {
1274 std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16,
1275 wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64,
1276 bitwise_andnot_f32)
1277 };
1278 return dispatcher.apply(lhs, rhs);
1279 }
1280 }
1281
1282 template <class A, class T, detail::enable_neon_type_t<T> = 0>
1283 XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1284 {
1285 using register_type = typename batch<T, A>::register_type;
1286 return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
1287 }
1288
1289 template <class A, class T, detail::enable_neon_type_t<T> = 0>
1290 XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
1291 {
1292 using register_type = typename batch_bool<T, A>::register_type;
1293 return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
1294 }

// min

1300 WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type)
1301 WRAP_BINARY_FLOAT(vminq, detail::identity_return_type)
1302
1303 template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1304 XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1305 {
1306 using register_type = typename batch<T, A>::register_type;
1307 const detail::excluding_int64_dispatcher::binary dispatcher = {
1308 std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16,
1309 wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32)
1310 };
1311 return dispatcher.apply(register_type(lhs), register_type(rhs));
1312 }
1313
1314 template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1315 XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1316 {
1317 return { std::min(lhs.get(0), rhs.get(0)), std::min(lhs.get(1), rhs.get(1)) };
1318 }

// max

1324 WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type)
1325 WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type)
1326
1327 template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1328 XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1329 {
1330 using register_type = typename batch<T, A>::register_type;
1331 const detail::excluding_int64_dispatcher::binary dispatcher = {
1332 std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16,
1333 wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32)
1334 };
1335 return dispatcher.apply(register_type(lhs), register_type(rhs));
1336 }
1337
1338 template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1339 XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1340 {
1341 return { std::max(lhs.get(0), rhs.get(0)), std::max(lhs.get(1), rhs.get(1)) };
1342 }

// abs

1348 namespace wrap
1349 {
1350 XSIMD_INLINE int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); }
1351 XSIMD_INLINE int16x8_t vabsq_s16(int16x8_t a) noexcept { return ::vabsq_s16(a); }
1352 XSIMD_INLINE int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); }
1353 }
1354 WRAP_UNARY_FLOAT(vabsq)
1355
1356 namespace detail
1357 {
1358 XSIMD_INLINE uint8x16_t abs_u8(uint8x16_t arg) noexcept
1359 {
1360 return arg;
1361 }
1362
1363 XSIMD_INLINE uint16x8_t abs_u16(uint16x8_t arg) noexcept
1364 {
1365 return arg;
1366 }
1367
1368 XSIMD_INLINE uint32x4_t abs_u32(uint32x4_t arg) noexcept
1369 {
1370 return arg;
1371 }
1372 }
1373
1374 template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
1375 XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<neon>) noexcept
1376 {
1377 using register_type = typename batch<T, A>::register_type;
1378 const detail::excluding_int64_dispatcher::unary dispatcher = {
1379 std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16,
1380 detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32)
1381 };
1382 return dispatcher.apply(register_type(arg));
1383 }

// rsqrt

1389 template <class A>
1390 XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
1391 {
1392 return vrsqrteq_f32(arg);
1393 }

// sqrt

1399 template <class A>
1400 XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
1401 {
1402 batch<float, A> sqrt_reciprocal = vrsqrteq_f32(arg);
1403
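// One Newton-Raphson step on the reciprocal square root estimate, then a
// second step folded into the multiplication by arg; the final select maps
// an input of 0 to 0 instead of the NaN produced by 0 * inf.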
1404 sqrt_reciprocal = sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
1405 batch<float, A> sqrt_approx = arg * sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
1406 batch<float, A> zero(0.f);
1407 return select(arg == zero, zero, sqrt_approx);
1408 }

// fma

1414 #ifdef __ARM_FEATURE_FMA
1415 template <class A>
1416 XSIMD_INLINE batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
1417 {
1418 return vfmaq_f32(z, x, y);
1419 }
1420
1421 template <class A>
1422 XSIMD_INLINE batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
1423 {
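// Computes x * y - z by fusing with the negated addend:
// vfmaq_f32(-z, x, y) == x * y - z.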
1424 return vfmaq_f32(-z, x, y);
1425 }
1426 #endif

// haddp

1432 template <class A>
1433 XSIMD_INLINE batch<float, A> haddp(const batch<float, A>* row, requires_arch<neon>) noexcept
1434 {
// row = (a, b, c, d)
float32x2_t tmp1, tmp2, tmp3;
// tmp1 = (a0 + a1, a2 + a3)
tmp1 = vpadd_f32(vget_low_f32(row[0]), vget_high_f32(row[0]));
// tmp2 = (b0 + b1, b2 + b3)
tmp2 = vpadd_f32(vget_low_f32(row[1]), vget_high_f32(row[1]));
// tmp1 = (a0 + a1 + a2 + a3, b0 + b1 + b2 + b3)
tmp1 = vpadd_f32(tmp1, tmp2);
// tmp2 = (c0 + c1, c2 + c3)
tmp2 = vpadd_f32(vget_low_f32(row[2]), vget_high_f32(row[2]));
// tmp3 = (d0 + d1, d2 + d3)
tmp3 = vpadd_f32(vget_low_f32(row[3]), vget_high_f32(row[3]));
// tmp2 = (c0 + c1 + c2 + c3, d0 + d1 + d2 + d3)
tmp2 = vpadd_f32(tmp2, tmp3);
// result = (sum(a), sum(b), sum(c), sum(d))
return vcombine_f32(tmp1, tmp2);
1451 }

// reciprocal

1457 template <class A>
1458 XSIMD_INLINE batch<float, A>
1459 reciprocal(const batch<float, A>& x,
1460 kernel::requires_arch<neon>) noexcept
1461 {
1462 return vrecpeq_f32(x);
1463 }

// insert

1469 template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 1> = 0>
1470 XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1471 {
1472 return vsetq_lane_u8(val, self, I);
1473 }
1474
1475 template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 1> = 0>
1476 XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1477 {
1478 return vsetq_lane_s8(val, self, I);
1479 }
1480
1481 template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 2> = 0>
1482 XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1483 {
1484 return vsetq_lane_u16(val, self, I);
1485 }
1486
1487 template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 2> = 0>
XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1489 {
1490 return vsetq_lane_s16(val, self, I);
1491 }
1492
1493 template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 4> = 0>
1494 XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1495 {
1496 return vsetq_lane_u32(val, self, I);
1497 }
1498
1499 template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 4> = 0>
1500 XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1501 {
1502 return vsetq_lane_s32(val, self, I);
1503 }
1504
1505 template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 8> = 0>
1506 XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1507 {
1508 return vsetq_lane_u64(val, self, I);
1509 }
1510
1511 template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 8> = 0>
1512 XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
1513 {
1514 return vsetq_lane_s64(val, self, I);
1515 }
1516
1517 template <class A, size_t I>
1518 XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<neon>) noexcept
1519 {
1520 return vsetq_lane_f32(val, self, I);
1521 }

// nearbyint_as_int

1527 template <class A>
1528 XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
1529 requires_arch<neon>) noexcept
1530 {
// Round to nearest with ties to even; plain ARMv7 NEON has no intrinsic for
// this conversion, so it is emulated below: r_normal truncates self +/- 0.5
// (half carries the sign of self), r_trunc truncates self, r_even is the
// even integer adjacent to the truncation, and the final select returns the
// even result only when the fractional part is exactly one half.
1574 const auto signmask = vdupq_n_u32(0x80000000);
1575 const auto half = vbslq_f32(signmask, self,
1576 vdupq_n_f32(0.5f));
1577 const auto r_normal = vcvtq_s32_f32(vaddq_f32(
1578 self, half));
1579 const auto r_trunc = vcvtq_s32_f32(self);
1580 const auto plusone = vreinterpretq_s32_u32(vshrq_n_u32(
1581 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31));
1582 const auto r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
1583 vdupq_n_s32(1));
1584 const auto delta = vsubq_f32(
1585 self,
1586 vcvtq_f32_s32(r_trunc));
1587 const auto is_delta_half = vceqq_f32(delta, half);
1588 return vbslq_s32(is_delta_half, r_even, r_normal);
1589 }

// reduce_add

1595 namespace detail
1596 {
1597 template <class T, class A, class V>
1598 XSIMD_INLINE T sum_batch(V const& arg) noexcept
1599 {
1600 T res = T(0);
1601 for (std::size_t i = 0; i < batch<T, A>::size; ++i)
1602 {
1603 res += arg[i];
1604 }
1605 return res;
1606 }
1607 }
1608
1609 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1610 XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1611 {
1612 uint8x8_t tmp = vpadd_u8(vget_low_u8(arg), vget_high_u8(arg));
1613 tmp = vpadd_u8(tmp, tmp);
1614 tmp = vpadd_u8(tmp, tmp);
1615 tmp = vpadd_u8(tmp, tmp);
1616 return vget_lane_u8(tmp, 0);
1617 }
1618
1619 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1620 XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1621 {
1622 int8x8_t tmp = vpadd_s8(vget_low_s8(arg), vget_high_s8(arg));
1623 tmp = vpadd_s8(tmp, tmp);
1624 tmp = vpadd_s8(tmp, tmp);
1625 tmp = vpadd_s8(tmp, tmp);
1626 return vget_lane_s8(tmp, 0);
1627 }
1628
1629 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1630 XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1631 {
1632 uint16x4_t tmp = vpadd_u16(vget_low_u16(arg), vget_high_u16(arg));
1633 tmp = vpadd_u16(tmp, tmp);
1634 tmp = vpadd_u16(tmp, tmp);
1635 return vget_lane_u16(tmp, 0);
1636 }
1637
1638 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1639 XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1640 {
1641 int16x4_t tmp = vpadd_s16(vget_low_s16(arg), vget_high_s16(arg));
1642 tmp = vpadd_s16(tmp, tmp);
1643 tmp = vpadd_s16(tmp, tmp);
1644 return vget_lane_s16(tmp, 0);
1645 }
1646
1647 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1648 XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1649 {
1650 uint32x2_t tmp = vpadd_u32(vget_low_u32(arg), vget_high_u32(arg));
1651 tmp = vpadd_u32(tmp, tmp);
1652 return vget_lane_u32(tmp, 0);
1653 }
1654
1655 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1656 XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1657 {
1658 int32x2_t tmp = vpadd_s32(vget_low_s32(arg), vget_high_s32(arg));
1659 tmp = vpadd_s32(tmp, tmp);
1660 return vget_lane_s32(tmp, 0);
1661 }
1662
1663 template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
1664 XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
1665 {
1666 return arg.get(0) + arg.get(1);
1667 }
1668
1669 template <class A>
1670 XSIMD_INLINE float reduce_add(batch<float, A> const& arg, requires_arch<neon>) noexcept
1671 {
1672 float32x2_t tmp = vpadd_f32(vget_low_f32(arg), vget_high_f32(arg));
1673 tmp = vpadd_f32(tmp, tmp);
1674 return vget_lane_f32(tmp, 0);
1675 }

// select

1695 namespace wrap
1696 {
1697 XSIMD_INLINE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); }
1698 XSIMD_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return ::vbslq_s8(a, b, c); }
1699 XSIMD_INLINE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return ::vbslq_u16(a, b, c); }
1700 XSIMD_INLINE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return ::vbslq_s16(a, b, c); }
1701 XSIMD_INLINE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return ::vbslq_u32(a, b, c); }
1702 XSIMD_INLINE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return ::vbslq_s32(a, b, c); }
1703 XSIMD_INLINE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return ::vbslq_u64(a, b, c); }
1704 XSIMD_INLINE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return ::vbslq_s64(a, b, c); }
1705 XSIMD_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return ::vbslq_f32(a, b, c); }
1706 }
1707
1708 namespace detail
1709 {
1710 template <class... T>
1711 struct neon_select_dispatcher_impl
1712 {
1713 using container_type = std::tuple<T (*)(comp_return_type<T>, T, T)...>;
1714 const container_type m_func;
1715
1716 template <class U>
1717 U apply(comp_return_type<U> cond, U lhs, U rhs) const noexcept
1718 {
1719 using func_type = U (*)(comp_return_type<U>, U, U);
1720 auto func = xsimd::detail::get<func_type>(m_func);
1721 return func(cond, lhs, rhs);
1722 }
1723 };
1724
1725 using neon_select_dispatcher = neon_select_dispatcher_impl<uint8x16_t, int8x16_t,
1726 uint16x8_t, int16x8_t,
1727 uint32x4_t, int32x4_t,
1728 uint64x2_t, int64x2_t,
1729 float32x4_t>;
1730 }
1731
1732 template <class A, class T, detail::enable_neon_type_t<T> = 0>
1733 XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<neon>) noexcept
1734 {
1735 using bool_register_type = typename batch_bool<T, A>::register_type;
1736 using register_type = typename batch<T, A>::register_type;
1737 const detail::neon_select_dispatcher dispatcher = {
1738 std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8, wrap::vbslq_u16, wrap::vbslq_s16,
1739 wrap::vbslq_u32, wrap::vbslq_s32, wrap::vbslq_u64, wrap::vbslq_s64,
1740 wrap::vbslq_f32)
1741 };
1742 return dispatcher.apply(bool_register_type(cond), register_type(a), register_type(b));
1743 }
1744
1745 template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0>
1746 XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept
1747 {
1748 return select(batch_bool<T, A> { b... }, true_br, false_br, neon {});
1749 }
1750
1751
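// select() routes through a tuple of function pointers (neon_select_dispatcher)
// so a single generic overload can pick the vbslq_* intrinsic matching the
// register type; the wrap:: shims exist because the raw intrinsics may be
// macros or builtins whose address cannot be taken portably. Hedged usage
// sketch through the public API (illustrative only):
//
//   // per-lane maximum of two float batches, lowered to vbslq_f32
//   auto lane_max = xsimd::select(a > b, a, b);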
1752
1753
1754 template <class A>
1755 XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<neon>) noexcept
1756 {
1757 assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
1758 (void)matrix_end;
1759 auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1760 auto t01 = vtrnq_f32(r0, r1);
1761 auto t23 = vtrnq_f32(r2, r3);
1762 matrix_begin[0] = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0]));
1763 matrix_begin[1] = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1]));
1764 matrix_begin[2] = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0]));
1765 matrix_begin[3] = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1]));
1766 }
1767 template <class A>
1768 XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<neon>) noexcept
1769 {
1770 assert((matrix_end - matrix_begin == batch<uint32_t, A>::size) && "correctly sized matrix");
1771 (void)matrix_end;
1772 auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1773 auto t01 = vtrnq_u32(r0, r1);
1774 auto t23 = vtrnq_u32(r2, r3);
1775 matrix_begin[0] = vcombine_u32(vget_low_u32(t01.val[0]), vget_low_u32(t23.val[0]));
1776 matrix_begin[1] = vcombine_u32(vget_low_u32(t01.val[1]), vget_low_u32(t23.val[1]));
1777 matrix_begin[2] = vcombine_u32(vget_high_u32(t01.val[0]), vget_high_u32(t23.val[0]));
1778 matrix_begin[3] = vcombine_u32(vget_high_u32(t01.val[1]), vget_high_u32(t23.val[1]));
1779 }
1780 template <class A>
1781 XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<neon>) noexcept
1782 {
1783 assert((matrix_end - matrix_begin == batch<int32_t, A>::size) && "correctly sized matrix");
1784 (void)matrix_end;
1785 auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1786 auto t01 = vtrnq_s32(r0, r1);
1787 auto t23 = vtrnq_s32(r2, r3);
1788 matrix_begin[0] = vcombine_s32(vget_low_s32(t01.val[0]), vget_low_s32(t23.val[0]));
1789 matrix_begin[1] = vcombine_s32(vget_low_s32(t01.val[1]), vget_low_s32(t23.val[1]));
1790 matrix_begin[2] = vcombine_s32(vget_high_s32(t01.val[0]), vget_high_s32(t23.val[0]));
1791 matrix_begin[3] = vcombine_s32(vget_high_s32(t01.val[1]), vget_high_s32(t23.val[1]));
1792 }
1793
1794 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1795 XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
1796 {
1797 assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
1798 (void)matrix_end;
1799 auto r0 = matrix_begin[0], r1 = matrix_begin[1];
1800 matrix_begin[0] = vcombine_u64(vget_low_u64(r0), vget_low_u64(r1));
1801 matrix_begin[1] = vcombine_u64(vget_high_u64(r0), vget_high_u64(r1));
1802 }
1803
1804 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1805 XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
1806 {
1807 assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
1808 (void)matrix_end;
1809 auto r0 = matrix_begin[0], r1 = matrix_begin[1];
1810 matrix_begin[0] = vcombine_s64(vget_low_s64(r0), vget_low_s64(r1));
1811 matrix_begin[1] = vcombine_s64(vget_high_s64(r0), vget_high_s64(r1));
1812 }
1813
1814
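// The transpose kernels above work in place on exactly batch<T, A>::size rows:
// the 32-bit variants combine vtrnq_* 2x2 transposes with vget_low/high and
// vcombine to build the full 4x4 transpose, while the 64-bit variants simply
// recombine the low and high halves of the two rows. Hedged usage sketch via
// the public front end (illustrative only):
//
//   xsimd::batch<float, xsimd::neon> rows[4] = { /* 4 rows of 4 floats */ };
//   xsimd::transpose(std::begin(rows), std::end(rows)); // rows now hold columns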
1815
1816
1817
1818 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1819 XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1820 {
1821 uint8x8x2_t tmp = vzip_u8(vget_low_u8(lhs), vget_low_u8(rhs));
1822 return vcombine_u8(tmp.val[0], tmp.val[1]);
1823 }
1824
1825 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1826 XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1827 {
1828 int8x8x2_t tmp = vzip_s8(vget_low_s8(lhs), vget_low_s8(rhs));
1829 return vcombine_s8(tmp.val[0], tmp.val[1]);
1830 }
1831
1832 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1833 XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1834 {
1835 uint16x4x2_t tmp = vzip_u16(vget_low_u16(lhs), vget_low_u16(rhs));
1836 return vcombine_u16(tmp.val[0], tmp.val[1]);
1837 }
1838
1839 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1840 XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1841 {
1842 int16x4x2_t tmp = vzip_s16(vget_low_s16(lhs), vget_low_s16(rhs));
1843 return vcombine_s16(tmp.val[0], tmp.val[1]);
1844 }
1845
1846 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1847 XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1848 {
1849 uint32x2x2_t tmp = vzip_u32(vget_low_u32(lhs), vget_low_u32(rhs));
1850 return vcombine_u32(tmp.val[0], tmp.val[1]);
1851 }
1852
1853 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1854 XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1855 {
1856 int32x2x2_t tmp = vzip_s32(vget_low_s32(lhs), vget_low_s32(rhs));
1857 return vcombine_s32(tmp.val[0], tmp.val[1]);
1858 }
1859
1860 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1861 XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1862 {
1863 return vcombine_u64(vget_low_u64(lhs), vget_low_u64(rhs));
1864 }
1865
1866 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1867 XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1868 {
1869 return vcombine_s64(vget_low_s64(lhs), vget_low_s64(rhs));
1870 }
1871
1872 template <class A>
1873 XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
1874 {
1875 float32x2x2_t tmp = vzip_f32(vget_low_f32(lhs), vget_low_f32(rhs));
1876 return vcombine_f32(tmp.val[0], tmp.val[1]);
1877 }
1878
1879
1880
1881
1882
1883 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1884 XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1885 {
1886 uint8x8x2_t tmp = vzip_u8(vget_high_u8(lhs), vget_high_u8(rhs));
1887 return vcombine_u8(tmp.val[0], tmp.val[1]);
1888 }
1889
1890 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1891 XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1892 {
1893 int8x8x2_t tmp = vzip_s8(vget_high_s8(lhs), vget_high_s8(rhs));
1894 return vcombine_s8(tmp.val[0], tmp.val[1]);
1895 }
1896
1897 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1898 XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1899 {
1900 uint16x4x2_t tmp = vzip_u16(vget_high_u16(lhs), vget_high_u16(rhs));
1901 return vcombine_u16(tmp.val[0], tmp.val[1]);
1902 }
1903
1904 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1905 XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1906 {
1907 int16x4x2_t tmp = vzip_s16(vget_high_s16(lhs), vget_high_s16(rhs));
1908 return vcombine_s16(tmp.val[0], tmp.val[1]);
1909 }
1910
1911 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1912 XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1913 {
1914 uint32x2x2_t tmp = vzip_u32(vget_high_u32(lhs), vget_high_u32(rhs));
1915 return vcombine_u32(tmp.val[0], tmp.val[1]);
1916 }
1917
1918 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1919 XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1920 {
1921 int32x2x2_t tmp = vzip_s32(vget_high_s32(lhs), vget_high_s32(rhs));
1922 return vcombine_s32(tmp.val[0], tmp.val[1]);
1923 }
1924
1925 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
1926 XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1927 {
1928 return vcombine_u64(vget_high_u64(lhs), vget_high_u64(rhs));
1929 }
1930
1931 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
1932 XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
1933 {
1934 return vcombine_s64(vget_high_s64(lhs), vget_high_s64(rhs));
1935 }
1936
1937 template <class A>
1938 XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
1939 {
1940 float32x2x2_t tmp = vzip_f32(vget_high_f32(lhs), vget_high_f32(rhs));
1941 return vcombine_f32(tmp.val[0], tmp.val[1]);
1942 }
1943
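// zip_lo interleaves the low halves of its operands and zip_hi the high halves,
// e.g. for 32-bit lanes lhs = {a0, a1, a2, a3} and rhs = {b0, b1, b2, b3}:
//   zip_lo(lhs, rhs) == {a0, b0, a1, b1}
//   zip_hi(lhs, rhs) == {a2, b2, a3, b3}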
1944
1945
1946
1947
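// vextq_* takes its lane offset as an immediate, so the runtime index passed to
// extract_pair is lowered through a compile-time index_sequence: each overload
// below compares n against one candidate I and either issues vextq with that
// immediate or recurses on the remaining candidates; the empty-sequence
// overload is only reached for an out-of-range index and asserts.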
1948 namespace detail
1949 {
1950 template <class A, class T>
1951 XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const&, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
1952 {
1953 assert(false && "extract_pair out of bounds");
1954 return batch<T, A> {};
1955 }
1956
1957 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
1958 XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1959 {
1960 if (n == I)
1961 {
1962 return vextq_u8(rhs, lhs, I);
1963 }
1964 else
1965 {
1966 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1967 }
1968 }
1969
1970 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 1> = 0>
1971 XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1972 {
1973 if (n == I)
1974 {
1975 return vextq_s8(rhs, lhs, I);
1976 }
1977 else
1978 {
1979 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1980 }
1981 }
1982
1983 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
1984 XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1985 {
1986 if (n == I)
1987 {
1988 return vextq_u16(rhs, lhs, I);
1989 }
1990 else
1991 {
1992 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
1993 }
1994 }
1995
1996 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 2> = 0>
1997 XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
1998 {
1999 if (n == I)
2000 {
2001 return vextq_s16(rhs, lhs, I);
2002 }
2003 else
2004 {
2005 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2006 }
2007 }
2008
2009 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
2010 XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2011 {
2012 if (n == I)
2013 {
2014 return vextq_u32(rhs, lhs, I);
2015 }
2016 else
2017 {
2018 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2019 }
2020 }
2021
2022 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 4> = 0>
2023 XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2024 {
2025 if (n == I)
2026 {
2027 return vextq_s32(rhs, lhs, I);
2028 }
2029 else
2030 {
2031 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2032 }
2033 }
2034
2035 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
2036 XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2037 {
2038 if (n == I)
2039 {
2040 return vextq_u64(rhs, lhs, I);
2041 }
2042 else
2043 {
2044 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2045 }
2046 }
2047
2048 template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 8> = 0>
2049 XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2050 {
2051 if (n == I)
2052 {
2053 return vextq_s64(rhs, lhs, I);
2054 }
2055 else
2056 {
2057 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2058 }
2059 }
2060
2061 template <class A, size_t I, size_t... Is>
2062 XSIMD_INLINE batch<float, A> extract_pair(batch<float, A> const& lhs, batch<float, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
2063 {
2064 if (n == I)
2065 {
2066 return vextq_f32(rhs, lhs, I);
2067 }
2068 else
2069 {
2070 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2071 }
2072 }
2073
2074 template <class A, class T, size_t... Is>
2075 XSIMD_INLINE batch<T, A> extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
2076 {
2077 if (n == 0)
2078 {
2079 return rhs;
2080 }
2081 else
2082 {
2083 return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
2084 }
2085 }
2086 }
2087
2088 template <class A, class T>
2089 XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<neon>) noexcept
2090 {
2091 constexpr std::size_t size = batch<T, A>::size;
2092 assert(n < size && "index in bounds");
2093 return detail::extract_pair_impl(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>());
2094 }
2095
2096
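// Semantics of the kernel above, in lane terms: for n in [0, size),
// extract_pair(lhs, rhs, n)[i] == rhs[i + n] when i + n < size, and
// lhs[i + n - size] otherwise, i.e. it slides a window of size lanes across
// the concatenation rhs:lhs starting at lane n (n == 0 yields rhs unchanged).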
2097
2098
2099
2100 namespace detail
2101 {
2102 template <class A, class T>
2103 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const&, int, ::xsimd::detail::int_sequence<>) noexcept
2104 {
2105 assert(false && "bitwise_lshift out of bounds");
2106 return batch<T, A> {};
2107 }
2108
2109 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
2110 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2111 {
2112 if (n == I)
2113 {
2114 return vshlq_n_u8(lhs, I);
2115 }
2116 else
2117 {
2118 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2119 }
2120 }
2121
2122 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0>
2123 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2124 {
2125 if (n == I)
2126 {
2127 return vshlq_n_s8(lhs, I);
2128 }
2129 else
2130 {
2131 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2132 }
2133 }
2134
2135 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
2136 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2137 {
2138 if (n == I)
2139 {
2140 return vshlq_n_u16(lhs, I);
2141 }
2142 else
2143 {
2144 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2145 }
2146 }
2147
2148 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0>
2149 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2150 {
2151 if (n == I)
2152 {
2153 return vshlq_n_s16(lhs, I);
2154 }
2155 else
2156 {
2157 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2158 }
2159 }
2160
2161 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
2162 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2163 {
2164 if (n == I)
2165 {
2166 return vshlq_n_u32(lhs, I);
2167 }
2168 else
2169 {
2170 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2171 }
2172 }
2173
2174 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0>
2175 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2176 {
2177 if (n == I)
2178 {
2179 return vshlq_n_s32(lhs, I);
2180 }
2181 else
2182 {
2183 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2184 }
2185 }
2186
2187 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
2188 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2189 {
2190 if (n == I)
2191 {
2192 return vshlq_n_u64(lhs, I);
2193 }
2194 else
2195 {
2196 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2197 }
2198 }
2199
2200 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0>
2201 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2202 {
2203 if (n == I)
2204 {
2205 return vshlq_n_s64(lhs, I);
2206 }
2207 else
2208 {
2209 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2210 }
2211 }
2212
2213 template <class A, class T, int... Is>
2214 XSIMD_INLINE batch<T, A> bitwise_lshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
2215 {
2216 if (n == 0)
2217 {
2218 return lhs;
2219 }
2220 else
2221 {
2222 return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2223 }
2224 }
2225 }
2226
2227 template <class A, class T>
2228 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
2229 {
2230 constexpr int size = sizeof(typename batch<T, A>::value_type) * 8;
2231 assert(0 <= n && n < size && "shift amount in bounds");
2232 return detail::bitwise_lshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>());
2233 }
2234
2235 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
2236 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2237 {
2238 return vshlq_u8(lhs, rhs);
2239 }
2240
2241 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
2242 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2243 {
2244 return vshlq_s8(lhs, rhs);
2245 }
2246
2247 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
2248 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2249 {
2250 return vshlq_u16(lhs, rhs);
2251 }
2252
2253 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
2254 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2255 {
2256 return vshlq_s16(lhs, rhs);
2257 }
2258
2259 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
2260 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2261 {
2262 return vshlq_u32(lhs, rhs);
2263 }
2264
2265 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
2266 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2267 {
2268 return vshlq_s32(lhs, rhs);
2269 }
2270
2271 template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
2272 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2273 {
2274 return vshlq_u64(lhs, rhs);
2275 }
2276
2277 template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
2278 XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2279 {
2280 return vshlq_s64(lhs, rhs);
2281 }
2282
2283
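// Two families of left shifts are provided above: the (batch, int) overload
// lowers a runtime shift amount to the immediate forms vshlq_n_* through the
// same int_sequence dispatch used elsewhere in this file, while the
// (batch, batch) overloads use vshlq_* with a per-lane signed shift-count
// vector. Hedged usage sketch through the public operators (illustrative
// only):
//
//   xsimd::batch<uint32_t, xsimd::neon> x(0x1u);
//   auto a = x << 3; // uniform shift, taken by the immediate path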
2284
2285
2286
2287 namespace detail
2288 {
2289 template <class A, class T>
2290 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const&, int, ::xsimd::detail::int_sequence<>) noexcept
2291 {
2292 assert(false && "bitwise_rshift out of bounds");
2293 return batch<T, A> {};
2294 }
2295
2296 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
2297 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2298 {
2299 if (n == I)
2300 {
2301 return vshrq_n_u8(lhs, I);
2302 }
2303 else
2304 {
2305 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2306 }
2307 }
2308
2309 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0>
2310 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2311 {
2312 if (n == I)
2313 {
2314 return vshrq_n_s8(lhs, I);
2315 }
2316 else
2317 {
2318 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2319 }
2320 }
2321
2322 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
2323 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2324 {
2325 if (n == I)
2326 {
2327 return vshrq_n_u16(lhs, I);
2328 }
2329 else
2330 {
2331 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2332 }
2333 }
2334
2335 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0>
2336 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2337 {
2338 if (n == I)
2339 {
2340 return vshrq_n_s16(lhs, I);
2341 }
2342 else
2343 {
2344 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2345 }
2346 }
2347
2348 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
2349 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2350 {
2351 if (n == I)
2352 {
2353 return vshrq_n_u32(lhs, I);
2354 }
2355 else
2356 {
2357 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2358 }
2359 }
2360
2361 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0>
2362 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2363 {
2364 if (n == I)
2365 {
2366 return vshrq_n_s32(lhs, I);
2367 }
2368 else
2369 {
2370 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2371 }
2372 }
2373
2374 template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
2375 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2376 {
2377 if (n == I)
2378 {
2379 return vshrq_n_u64(lhs, I);
2380 }
2381 else
2382 {
2383 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2384 }
2385 }
2386
2387 template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0>
2388 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
2389 {
2390 if (n == I)
2391 {
2392 return vshrq_n_s64(lhs, I);
2393 }
2394 else
2395 {
2396 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2397 }
2398 }
2399
2400 template <class A, class T, int... Is>
2401 XSIMD_INLINE batch<T, A> bitwise_rshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
2402 {
2403 if (n == 0)
2404 {
2405 return lhs;
2406 }
2407 else
2408 {
2409 return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
2410 }
2411 }
2412 }
2413
2414 template <class A, class T>
2415 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
2416 {
2417 constexpr int size = sizeof(typename batch<T, A>::value_type) * 8;
2418 assert(0 <= n && n < size && "shift amount in bounds");
2419 return detail::bitwise_rshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>());
2420 }
2421
2422 template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
2423 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2424 {
2425 return vshlq_u8(lhs, vnegq_s8(rhs));
2426 }
2427
2428 template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
2429 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2430 {
2431 return vshlq_s8(lhs, vnegq_s8(rhs));
2432 }
2433
2434 template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
2435 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2436 {
2437 return vshlq_u16(lhs, vnegq_s16(rhs));
2438 }
2439
2440 template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
2441 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2442 {
2443 return vshlq_s16(lhs, vnegq_s16(rhs));
2444 }
2445
2446 template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
2447 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
2448 {
2449 return vshlq_u32(lhs, vnegq_s32(rhs));
2450 }
2451
2452 template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
2453 XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
2454 {
2455 return vshlq_s32(lhs, vnegq_s32(rhs));
2456 }
2457
2458
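// Right shifts mirror the left-shift kernels: unsigned batches use the logical
// forms (vshrq_n_u* for immediates, vshlq_u* with negated counts for per-lane
// shifts), while signed batches use the arithmetic forms (vshrq_n_s*,
// vshlq_s*); NEON's variable vshl shifts right when the count is negative,
// which is why the counts are passed through vnegq_* above.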
2459
2460
2461
2462
2463
2464 template <class A, class T, detail::enable_sized_t<T, 8> = 0>
2465 XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2466 {
2467 uint64x1_t tmp = vand_u64(vget_low_u64(arg), vget_high_u64(arg));
2468 return vget_lane_u64(tmp, 0) == ~0ULL;
2469 }
2470
2471 template <class A, class T, detail::enable_sized_t<T, 1> = 0>
2472 XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2473 {
2474 return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {});
2475 }
2476
2477 template <class A, class T, detail::enable_sized_t<T, 2> = 0>
2478 XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2479 {
2480 return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {});
2481 }
2482
2483 template <class A, class T, detail::enable_sized_t<T, 4> = 0>
2484 XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2485 {
2486 return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {});
2487 }
2488
2489
2490
2491
2492
2493 template <class A, class T, detail::enable_sized_t<T, 8> = 0>
2494 XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2495 {
2496 uint32x2_t tmp = vqmovn_u64(arg);
2497 return vget_lane_u64(vreinterpret_u64_u32(tmp), 0) != 0;
2498 }
2499
2500 template <class A, class T, detail::enable_sized_t<T, 1> = 0>
2501 XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2502 {
2503 return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {});
2504 }
2505
2506 template <class A, class T, detail::enable_sized_t<T, 2> = 0>
2507 XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2508 {
2509 return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {});
2510 }
2511
2512 template <class A, class T, detail::enable_sized_t<T, 4> = 0>
2513 XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
2514 {
2515 return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {});
2516 }
2517
2518
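// On NEON a batch_bool lane is either all ones or all zeros, so every element
// width can be reinterpreted as two uint64 lanes: all() ANDs the two halves and
// checks for ~0, while any() narrows with saturation (vqmovn_u64) so any set
// lane survives, then checks for a non-zero result. Hedged usage sketch
// (illustrative only):
//
//   if (xsimd::all(a == b)) { /* every lane equal */ }
//   if (xsimd::any(a < b))  { /* at least one lane smaller */ }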
2519
2520
2521
2522 #define WRAP_CAST(SUFFIX, TYPE) \
2523 namespace wrap \
2524 { \
2525 XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) noexcept \
2526 { \
2527 return ::vreinterpretq_##SUFFIX##_u8(a); \
2528 } \
2529 XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) noexcept \
2530 { \
2531 return ::vreinterpretq_##SUFFIX##_s8(a); \
2532 } \
2533 XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) noexcept \
2534 { \
2535 return ::vreinterpretq_##SUFFIX##_u16(a); \
2536 } \
2537 XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) noexcept \
2538 { \
2539 return ::vreinterpretq_##SUFFIX##_s16(a); \
2540 } \
2541 XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) noexcept \
2542 { \
2543 return ::vreinterpretq_##SUFFIX##_u32(a); \
2544 } \
2545 XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) noexcept \
2546 { \
2547 return ::vreinterpretq_##SUFFIX##_s32(a); \
2548 } \
2549 XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) noexcept \
2550 { \
2551 return ::vreinterpretq_##SUFFIX##_u64(a); \
2552 } \
2553 XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) noexcept \
2554 { \
2555 return ::vreinterpretq_##SUFFIX##_s64(a); \
2556 } \
2557 XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) noexcept \
2558 { \
2559 return ::vreinterpretq_##SUFFIX##_f32(a); \
2560 } \
2561 }
2562
2563 WRAP_CAST(u8, uint8x16_t)
2564 WRAP_CAST(s8, int8x16_t)
2565 WRAP_CAST(u16, uint16x8_t)
2566 WRAP_CAST(s16, int16x8_t)
2567 WRAP_CAST(u32, uint32x4_t)
2568 WRAP_CAST(s32, int32x4_t)
2569 WRAP_CAST(u64, uint64x2_t)
2570 WRAP_CAST(s64, int64x2_t)
2571 WRAP_CAST(f32, float32x4_t)
2572
2573 #undef WRAP_CAST
2574
2575 namespace detail
2576 {
2577 template <class R, class... T>
2578 struct bitwise_caster_impl
2579 {
2580 using container_type = std::tuple<R (*)(T)...>;
2581 container_type m_func;
2582
2583 template <class U>
2584 R apply(U rhs) const noexcept
2585 {
2586 using func_type = R (*)(U);
2587 auto func = xsimd::detail::get<func_type>(m_func);
2588 return func(rhs);
2589 }
2590 };
2591
2592 template <class R, class... T>
2593 XSIMD_INLINE const bitwise_caster_impl<R, T...> make_bitwise_caster_impl(R (*... arg)(T)) noexcept
2594 {
2595 return { std::make_tuple(arg...) };
2596 }
2597
2598 template <class... T>
2599 struct type_list
2600 {
2601 };
2602
2603 template <class RTL, class TTL>
2604 struct bitwise_caster;
2605
2606 template <class... R, class... T>
2607 struct bitwise_caster<type_list<R...>, type_list<T...>>
2608 {
2609 using container_type = std::tuple<bitwise_caster_impl<R, T...>...>;
2610 container_type m_caster;
2611
2612 template <class V, class U>
2613 V apply(U rhs) const noexcept
2614 {
2615 using caster_type = bitwise_caster_impl<V, T...>;
2616 auto caster = xsimd::detail::get<caster_type>(m_caster);
2617 return caster.apply(rhs);
2618 }
2619 };
2620
2621 template <class... T>
2622 using bitwise_caster_t = bitwise_caster<type_list<T...>, type_list<T...>>;
2623
2624 using neon_bitwise_caster = bitwise_caster_t<uint8x16_t, int8x16_t,
2625 uint16x8_t, int16x8_t,
2626 uint32x4_t, int32x4_t,
2627 uint64x2_t, int64x2_t,
2628 float32x4_t>;
2629 }
2630
2631 template <class A, class T, class R>
2632 XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<neon>) noexcept
2633 {
2634 const detail::neon_bitwise_caster caster = {
2635 std::make_tuple(
2636 detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16,
2637 wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64,
2638 wrap::vreinterpretq_u8_f32),
2639 detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16,
2640 wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64,
2641 wrap::vreinterpretq_s8_f32),
2642 detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16,
2643 wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64,
2644 wrap::vreinterpretq_u16_f32),
2645 detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16,
2646 wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64,
2647 wrap::vreinterpretq_s16_f32),
2648 detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16,
2649 wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64,
2650 wrap::vreinterpretq_u32_f32),
2651 detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16,
2652 wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64,
2653 wrap::vreinterpretq_s32_f32),
2654 detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16,
2655 wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64,
2656 wrap::vreinterpretq_u64_f32),
2657 detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16,
2658 wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64,
2659 wrap::vreinterpretq_s64_f32),
2660 detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16,
2661 wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64,
2662 wrap::vreinterpretq_f32_f32))
2663 };
2664 using src_register_type = typename batch<T, A>::register_type;
2665 using dst_register_type = typename batch<R, A>::register_type;
2666 return caster.apply<dst_register_type>(src_register_type(arg));
2667 }
2668
2669
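// bitwise_cast builds a 9x9 table of vreinterpretq_* wrappers (one
// bitwise_caster_impl per destination register type) and selects the entry
// from the source and destination register types; the cast is a pure
// reinterpretation of the bits, never a value conversion. Hedged usage sketch
// through the public front end (illustrative only):
//
//   xsimd::batch<float, xsimd::neon> f(1.5f);
//   auto u = xsimd::bitwise_cast<uint32_t>(f); // same 128 bits, uint32_t lanes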
2670
2671
2672
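// isnan relies on the IEEE 754 rule that a NaN never compares equal to itself,
// so the only lanes for which (arg == arg) is false are the NaN lanes.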
2673 template <class A>
2674 XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& arg, requires_arch<neon>) noexcept
2675 {
2676 return !(arg == arg);
2677 }
2678
2679
2680 namespace detail
2681 {
2682 template <size_t N>
2683 struct slider_left
2684 {
2685 template <class A, class T>
2686 XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
2687 {
2688 const auto left = vdupq_n_u8(0);
2689 const auto right = bitwise_cast<uint8_t>(x).data;
2690 const batch<uint8_t, A> res(vextq_u8(left, right, 16 - N));
2691 return bitwise_cast<T>(res);
2692 }
2693 };
2694
2695 template <>
2696 struct slider_left<0>
2697 {
2698 template <class A, class T>
2699 XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
2700 {
2701 return x;
2702 }
2703 };
2704 }
2705
2706 template <size_t N, class A, class T>
2707 XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<neon>) noexcept
2708 {
2709 return detail::slider_left<N> {}(x, A {});
2710 }
2711
2712
2713 namespace detail
2714 {
2715 template <size_t N>
2716 struct slider_right
2717 {
2718 template <class A, class T>
2719 XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
2720 {
2721 const auto left = bitwise_cast<uint8_t>(x).data;
2722 const auto right = vdupq_n_u8(0);
2723 const batch<uint8_t, A> res(vextq_u8(left, right, N));
2724 return bitwise_cast<T>(res);
2725 }
2726 };
2727
2728 template <>
2729 struct slider_right<16>
2730 {
2731 template <class A, class T>
2732 XSIMD_INLINE batch<T, A> operator()(batch<T, A> const&, requires_arch<neon>) noexcept
2733 {
2734 return batch<T, A> {};
2735 }
2736 };
2737 }
2738
2739 template <size_t N, class A, class T>
2740 XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<neon>) noexcept
2741 {
2742 return detail::slider_right<N> {}(x, A {});
2743 }
2744
2745
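// slide_left<N> and slide_right<N> shift the whole 128-bit register by N bytes
// and fill with zeros, by bitwise-casting to uint8 lanes and applying vextq_u8
// against a zero vector: after slide_left<N>, byte i of the result is byte
// i - N of the input (zero for i < N); slide_right<N> moves bytes the other
// way. Note that N counts bytes, not lanes. Hedged usage sketch (illustrative
// only):
//
//   // shift a float batch by one lane's worth of bytes
//   auto shifted = xsimd::slide_left<sizeof(float)>(v);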
2746
2747
2748 namespace wrap
2749 {
2750 template <size_t N>
2751 XSIMD_INLINE uint8x16_t rotate_left_u8(uint8x16_t a, uint8x16_t b) noexcept { return vextq_u8(a, b, N); }
2752 template <size_t N>
2753 XSIMD_INLINE int8x16_t rotate_left_s8(int8x16_t a, int8x16_t b) noexcept { return vextq_s8(a, b, N); }
2754 template <size_t N>
2755 XSIMD_INLINE uint16x8_t rotate_left_u16(uint16x8_t a, uint16x8_t b) noexcept { return vextq_u16(a, b, N); }
2756 template <size_t N>
2757 XSIMD_INLINE int16x8_t rotate_left_s16(int16x8_t a, int16x8_t b) noexcept { return vextq_s16(a, b, N); }
2758 template <size_t N>
2759 XSIMD_INLINE uint32x4_t rotate_left_u32(uint32x4_t a, uint32x4_t b) noexcept { return vextq_u32(a, b, N); }
2760 template <size_t N>
2761 XSIMD_INLINE int32x4_t rotate_left_s32(int32x4_t a, int32x4_t b) noexcept { return vextq_s32(a, b, N); }
2762 template <size_t N>
2763 XSIMD_INLINE uint64x2_t rotate_left_u64(uint64x2_t a, uint64x2_t b) noexcept { return vextq_u64(a, b, N); }
2764 template <size_t N>
2765 XSIMD_INLINE int64x2_t rotate_left_s64(int64x2_t a, int64x2_t b) noexcept { return vextq_s64(a, b, N); }
2766 template <size_t N>
2767 XSIMD_INLINE float32x4_t rotate_left_f32(float32x4_t a, float32x4_t b) noexcept { return vextq_f32(a, b, N); }
2768 }
2769
2770 template <size_t N, class A, class T, detail::enable_neon_type_t<T> = 0>
2771 XSIMD_INLINE batch<T, A> rotate_left(batch<T, A> const& a, requires_arch<neon>) noexcept
2772 {
2773 using register_type = typename batch<T, A>::register_type;
2774 const detail::neon_dispatcher::binary dispatcher = {
2775 std::make_tuple(wrap::rotate_left_u8<N>, wrap::rotate_left_s8<N>, wrap::rotate_left_u16<N>, wrap::rotate_left_s16<N>,
2776 wrap::rotate_left_u32<N>, wrap::rotate_left_s32<N>, wrap::rotate_left_u64<N>, wrap::rotate_left_s64<N>,
2777 wrap::rotate_left_f32<N>)
2778 };
2779 return dispatcher.apply(register_type(a), register_type(a));
2780 }
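// rotate_left<N> reuses the binary neon_dispatcher with the same register
// passed as both vextq operands, so the lanes wrap around instead of being
// zero-filled; N is counted in lanes of the element type.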
2781 }
2782
2783 template <typename T, class A, T... Values>
2784 struct batch_constant;
2785
2786 namespace kernel
2787 {
2788
2789
2790
2791
2792 template <class A, class T, class I, I... idx>
2793 XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self,
2794 batch_constant<I, A, idx...>,
2795 requires_arch<neon>) noexcept
2796 {
2797 static_assert(batch<T, A>::size == sizeof...(idx), "valid swizzle indices");
2798 std::array<T, batch<T, A>::size> data;
2799 self.store_aligned(data.data());
2800 return set(batch<T, A>(), A(), data[idx]...);
2801 }
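// This generic swizzle is a portable fallback: it spills the batch to a
// temporary buffer and rebuilds it lane by lane with set(); more specialised
// architectures may provide faster, table-lookup based overloads instead.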
2802 }
2803
2804 }
2805
2806 #undef WRAP_BINARY_INT_EXCLUDING_64
2807 #undef WRAP_BINARY_INT
2808 #undef WRAP_BINARY_FLOAT
2809 #undef WRAP_UNARY_INT_EXCLUDING_64
2810 #undef WRAP_UNARY_INT
2811 #undef WRAP_UNARY_FLOAT
2812
2813 #endif