/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 * Copyright (c) Yibo Cai                                                   *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SVE_HPP
#define XSIMD_SVE_HPP

#include <cassert>
#include <complex>
#include <type_traits>

#include "../types/xsimd_sve_register.hpp"

namespace xsimd
{
    template <typename T, class A, T... Values>
    struct batch_constant;

    namespace kernel
    {
        namespace detail
        {
            using xsimd::index;
            using xsimd::types::detail::sve_vector_type;

            // predicate creation
            XSIMD_INLINE svbool_t sve_ptrue_impl(index<1>) noexcept { return svptrue_b8(); }
            XSIMD_INLINE svbool_t sve_ptrue_impl(index<2>) noexcept { return svptrue_b16(); }
            XSIMD_INLINE svbool_t sve_ptrue_impl(index<4>) noexcept { return svptrue_b32(); }
            XSIMD_INLINE svbool_t sve_ptrue_impl(index<8>) noexcept { return svptrue_b64(); }

            template <class T>
            XSIMD_INLINE svbool_t sve_ptrue() noexcept { return sve_ptrue_impl(index<sizeof(T)> {}); }

            // count active lanes in a predicate
            XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); }
            XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); }
            XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); }
            XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); }

            template <class T>
            XSIMD_INLINE uint64_t sve_pcount(svbool_t p) noexcept { return sve_pcount_impl(p, index<sizeof(T)> {}); }

            // enable for signed integers
            template <class T>
            using sve_enable_signed_int_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, int>::type;

            // enable for unsigned integers
            template <class T>
            using sve_enable_unsigned_int_t = typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, int>::type;

            // enable for floating points
            template <class T>
            using sve_enable_floating_point_t = typename std::enable_if<std::is_floating_point<T>::value, int>::type;

            // enable for signed integers or floating points
            // (std::is_signed is true for floating-point types as well, so this
            // single trait covers both families)
            template <class T>
            using sve_enable_signed_int_or_floating_point_t = typename std::enable_if<std::is_signed<T>::value, int>::type;

            // enable for all SVE supported types
            template <class T>
            using sve_enable_all_t = typename std::enable_if<std::is_arithmetic<T>::value, int>::type;
        } // namespace detail

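        // Illustrative sketch (not part of the original header): every kernel
        // below fetches an all-true predicate sized for its element type via
        // detail::sve_ptrue<T>(), and sve_pcount<T>() counts active lanes.
        // Hypothetical caller-side code, assuming an SVE-enabled build:
        //
        //     svbool_t p32 = detail::sve_ptrue<float>();       // svptrue_b32()
        //     svbool_t p64 = detail::sve_ptrue<double>();      // svptrue_b64()
        //     uint64_t lanes = detail::sve_pcount<float>(p32); // 32-bit lane count
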
        /********
         * Load *
         ********/

        namespace detail
        {
            // "char" is not allowed in SVE load/store operations
            using sve_fix_char_t_impl = typename std::conditional<std::is_signed<char>::value, int8_t, uint8_t>::type;

            template <class T>
            using sve_fix_char_t = typename std::conditional<std::is_same<char, typename std::decay<T>::type>::value,
                                                             sve_fix_char_t_impl, T>::type;
        }

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<sve>) noexcept
        {
            return svld1(detail::sve_ptrue<T>(), reinterpret_cast<detail::sve_fix_char_t<T> const*>(src));
        }

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<sve>) noexcept
        {
            return load_aligned<A>(src, convert<T>(), sve {});
        }

        // load_complex
        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
        XSIMD_INLINE batch<std::complex<T>, A> load_complex_aligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
        {
            const T* buf = reinterpret_cast<const T*>(mem);
            const auto tmp = svld2(detail::sve_ptrue<T>(), buf);
            const auto real = svget2(tmp, 0);
            const auto imag = svget2(tmp, 1);
            return batch<std::complex<T>, A> { real, imag };
        }

        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
        XSIMD_INLINE batch<std::complex<T>, A> load_complex_unaligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
        {
            return load_complex_aligned<A>(mem, convert<std::complex<T>> {}, sve {});
        }

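        // Illustrative sketch (not part of the original header): svld2 performs
        // the deinterleave, splitting (re, im, re, im, ...) storage into two
        // vectors in a single load. Hypothetical caller-side code, assuming an
        // SVE-enabled build; `mem` is a made-up array name:
        //
        //     std::complex<float> mem[batch<std::complex<float>, sve>::size];
        //     auto z = batch<std::complex<float>, sve>::load_aligned(mem);
        //     // z.real() == {mem[0].real(), mem[1].real(), ...}
        //     // z.imag() == {mem[0].imag(), mem[1].imag(), ...}
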
        /*********
         * Store *
         *********/

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<sve>) noexcept
        {
            svst1(detail::sve_ptrue<T>(), reinterpret_cast<detail::sve_fix_char_t<T>*>(dst), src);
        }

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<sve>) noexcept
        {
            store_aligned<A>(dst, src, sve {});
        }

        // store_complex
        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
        XSIMD_INLINE void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
        {
            using v2type = typename std::conditional<(sizeof(T) == 4), svfloat32x2_t, svfloat64x2_t>::type;
            v2type tmp {};
            tmp = svset2(tmp, 0, src.real());
            tmp = svset2(tmp, 1, src.imag());
            T* buf = reinterpret_cast<T*>(dst);
            svst2(detail::sve_ptrue<T>(), buf, tmp);
        }

        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
        XSIMD_INLINE void store_complex_unaligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
        {
            store_complex_aligned(dst, src, sve {});
        }

        /******************
         * scatter/gather *
         ******************/

        namespace detail
        {
            template <class T, class U>
            using sve_enable_sg_t = typename std::enable_if<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>::type;
        }

        // scatter
        template <class A, class T, class U, detail::sve_enable_sg_t<T, U> = 0>
        XSIMD_INLINE void scatter(batch<T, A> const& src, T* dst, batch<U, A> const& index, kernel::requires_arch<sve>) noexcept
        {
            svst1_scatter_index(detail::sve_ptrue<T>(), dst, index.data, src.data);
        }

        // gather
        template <class A, class T, class U, detail::sve_enable_sg_t<T, U> = 0>
        XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index, kernel::requires_arch<sve>) noexcept
        {
            return svld1_gather_index(detail::sve_ptrue<T>(), src, index.data);
        }

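        // Illustrative sketch (not part of the original header): both kernels
        // use *index* addressing (base + index * sizeof(T)) and are only enabled
        // for 4- and 8-byte elements with a same-width index type. Hypothetical
        // caller-side code, assuming an SVE-enabled build; `src`, `dst` and
        // `idx` are made-up names:
        //
        //     float src[64], dst[64];
        //     batch<int32_t, sve> idx = ...;  // one index per lane
        //     auto g = gather(batch<float, sve> {}, src, idx, sve {}); // g[i] = src[idx[i]]
        //     scatter(g, dst, idx, sve {});                            // dst[idx[i]] = g[i]
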
        /********************
         * Scalar to vector *
         ********************/

        // broadcast
        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
        {
            return svdup_n_u8(uint8_t(arg));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
        {
            return svdup_n_s8(int8_t(arg));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
        {
            return svdup_n_u16(uint16_t(arg));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
        {
            return svdup_n_s16(int16_t(arg));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
        {
            return svdup_n_u32(uint32_t(arg));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
        {
            return svdup_n_s32(int32_t(arg));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
        {
            return svdup_n_u64(uint64_t(arg));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
        {
            return svdup_n_s64(int64_t(arg));
        }

        template <class A>
        XSIMD_INLINE batch<float, A> broadcast(float arg, requires_arch<sve>) noexcept
        {
            return svdup_n_f32(arg);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> broadcast(double arg, requires_arch<sve>) noexcept
        {
            return svdup_n_f64(arg);
        }

        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<sve>) noexcept
        {
            return broadcast<sve>(val, sve {});
        }

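        // Illustrative sketch (not part of the original header): each overload
        // above maps one element width to one svdup_n_* intrinsic. Hypothetical
        // caller-side code, assuming an SVE-enabled build:
        //
        //     auto ones  = broadcast<sve>(int32_t(1), sve {}); // svdup_n_s32(1)
        //     auto halfs = broadcast<sve>(0.5f, sve {});       // svdup_n_f32(0.5f)
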
        /**************
         * Arithmetic *
         **************/

        // add
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svadd_x(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // sadd
        template <class A, class T, detail::enable_integral_t<T> = 0>
        XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svqadd(lhs, rhs);
        }

        // sub
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svsub_x(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // ssub
        template <class A, class T, detail::enable_integral_t<T> = 0>
        XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svqsub(lhs, rhs);
        }

        // mul
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svmul_x(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // div
        template <class A, class T, typename std::enable_if<(sizeof(T) >= 4), int>::type = 0>
        XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svdiv_x(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // max
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svmax_x(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // min
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svmin_x(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // neg
        template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svreinterpret_u8(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s8(arg)));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svreinterpret_u16(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s16(arg)));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svreinterpret_u32(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s32(arg)));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svreinterpret_u64(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s64(arg)));
        }

        template <class A, class T, detail::sve_enable_signed_int_or_floating_point_t<T> = 0>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svneg_x(detail::sve_ptrue<T>(), arg);
        }

        // abs
        template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
        XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return arg;
        }

        template <class A, class T, detail::sve_enable_signed_int_or_floating_point_t<T> = 0>
        XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svabs_x(detail::sve_ptrue<T>(), arg);
        }

        // fma: x * y + z
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
        {
            return svmad_x(detail::sve_ptrue<T>(), x, y, z);
        }

        // fnma: z - x * y
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
        {
            return svmsb_x(detail::sve_ptrue<T>(), x, y, z);
        }

        // fms: x * y - z
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
        {
            return -fnma(x, y, z, sve {});
        }

        // fnms: - x * y - z
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
        {
            return -fma(x, y, z, sve {});
        }

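        // The sign conventions above, restated as scalar identities
        // (illustrative note, not part of the original header):
        //
        //     fma(x, y, z)  ==  x * y + z    // svmad
        //     fnma(x, y, z) ==  z - x * y    // svmsb
        //     fms(x, y, z)  ==  x * y - z    // -(z - x * y)
        //     fnms(x, y, z) == -x * y - z    // -(x * y + z)
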
        /**********************
         * Logical operations *
         **********************/

        // bitwise_and
        template <class A, class T, detail::enable_integral_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svand_x(detail::sve_ptrue<T>(), lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
        {
            const auto lhs_bits = svreinterpret_u32(lhs);
            const auto rhs_bits = svreinterpret_u32(rhs);
            const auto result_bits = svand_x(detail::sve_ptrue<float>(), lhs_bits, rhs_bits);
            return svreinterpret_f32(result_bits);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
        {
            const auto lhs_bits = svreinterpret_u64(lhs);
            const auto rhs_bits = svreinterpret_u64(rhs);
            const auto result_bits = svand_x(detail::sve_ptrue<double>(), lhs_bits, rhs_bits);
            return svreinterpret_f64(result_bits);
        }

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svand_z(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // bitwise_andnot
        template <class A, class T, detail::enable_integral_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svbic_x(detail::sve_ptrue<T>(), lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
        {
            const auto lhs_bits = svreinterpret_u32(lhs);
            const auto rhs_bits = svreinterpret_u32(rhs);
            const auto result_bits = svbic_x(detail::sve_ptrue<float>(), lhs_bits, rhs_bits);
            return svreinterpret_f32(result_bits);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
        {
            const auto lhs_bits = svreinterpret_u64(lhs);
            const auto rhs_bits = svreinterpret_u64(rhs);
            const auto result_bits = svbic_x(detail::sve_ptrue<double>(), lhs_bits, rhs_bits);
            return svreinterpret_f64(result_bits);
        }

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svbic_z(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // bitwise_or
        template <class A, class T, detail::enable_integral_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svorr_x(detail::sve_ptrue<T>(), lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
        {
            const auto lhs_bits = svreinterpret_u32(lhs);
            const auto rhs_bits = svreinterpret_u32(rhs);
            const auto result_bits = svorr_x(detail::sve_ptrue<float>(), lhs_bits, rhs_bits);
            return svreinterpret_f32(result_bits);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
        {
            const auto lhs_bits = svreinterpret_u64(lhs);
            const auto rhs_bits = svreinterpret_u64(rhs);
            const auto result_bits = svorr_x(detail::sve_ptrue<double>(), lhs_bits, rhs_bits);
            return svreinterpret_f64(result_bits);
        }

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svorr_z(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // bitwise_xor
        template <class A, class T, detail::enable_integral_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return sveor_x(detail::sve_ptrue<T>(), lhs, rhs);
        }

        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
        {
            const auto lhs_bits = svreinterpret_u32(lhs);
            const auto rhs_bits = svreinterpret_u32(rhs);
            const auto result_bits = sveor_x(detail::sve_ptrue<float>(), lhs_bits, rhs_bits);
            return svreinterpret_f32(result_bits);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
        {
            const auto lhs_bits = svreinterpret_u64(lhs);
            const auto rhs_bits = svreinterpret_u64(rhs);
            const auto result_bits = sveor_x(detail::sve_ptrue<double>(), lhs_bits, rhs_bits);
            return svreinterpret_f64(result_bits);
        }

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return sveor_z(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // bitwise_not
        template <class A, class T, detail::enable_integral_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svnot_x(detail::sve_ptrue<T>(), arg);
        }

        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& arg, requires_arch<sve>) noexcept
        {
            const auto arg_bits = svreinterpret_u32(arg);
            const auto result_bits = svnot_x(detail::sve_ptrue<float>(), arg_bits);
            return svreinterpret_f32(result_bits);
        }

        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& arg, requires_arch<sve>) noexcept
        {
            const auto arg_bits = svreinterpret_u64(arg);
            const auto result_bits = svnot_x(detail::sve_ptrue<double>(), arg_bits);
            return svreinterpret_f64(result_bits);
        }

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svnot_z(detail::sve_ptrue<T>(), arg);
        }

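        // Illustrative sketch (not part of the original header): SVE has no
        // bitwise intrinsics on floating-point vectors, so the float/double
        // overloads above round-trip through same-width unsigned integers. A
        // hypothetical sign-bit clear via bitwise_andnot (lhs & ~rhs), assuming
        // an SVE-enabled build:
        //
        //     auto sign = broadcast<sve>(-0.0f, sve {});   // only the sign bit set
        //     auto mag  = bitwise_andnot(x, sign, sve {}); // |x| lane-wise
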
        /**********
         * Shifts *
         **********/

        namespace detail
        {
            template <class A, class T, class U>
            XSIMD_INLINE batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<1>) noexcept
            {
                return svreinterpret_u8(arg);
            }

            template <class A, class T, class U>
            XSIMD_INLINE batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<2>) noexcept
            {
                return svreinterpret_u16(arg);
            }

            template <class A, class T, class U>
            XSIMD_INLINE batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<4>) noexcept
            {
                return svreinterpret_u32(arg);
            }

            template <class A, class T, class U>
            XSIMD_INLINE batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<8>) noexcept
            {
                return svreinterpret_u64(arg);
            }

            template <class A, class T, class U = as_unsigned_integer_t<T>>
            XSIMD_INLINE batch<U, A> sve_to_unsigned_batch(batch<T, A> const& arg) noexcept
            {
                return sve_to_unsigned_batch_impl<A, T, U>(arg, index<sizeof(T)> {});
            }
        } // namespace detail

        // bitwise_lshift
        template <class A, class T, detail::enable_integral_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
        {
            constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8;
            assert(0 <= n && static_cast<std::size_t>(n) < size && "index in bounds");
            return svlsl_x(detail::sve_ptrue<T>(), arg, n);
        }

        template <class A, class T, detail::enable_integral_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svlsl_x(detail::sve_ptrue<T>(), lhs, detail::sve_to_unsigned_batch<A, T>(rhs));
        }

        // bitwise_rshift
        template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
        {
            constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8;
            assert(0 <= n && static_cast<std::size_t>(n) < size && "index in bounds");
            return svlsr_x(detail::sve_ptrue<T>(), arg, static_cast<T>(n));
        }

        template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svlsr_x(detail::sve_ptrue<T>(), lhs, rhs);
        }

        template <class A, class T, detail::sve_enable_signed_int_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
        {
            constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8;
            assert(0 <= n && static_cast<std::size_t>(n) < size && "index in bounds");
            return svasr_x(detail::sve_ptrue<T>(), arg, static_cast<as_unsigned_integer_t<T>>(n));
        }

        template <class A, class T, detail::sve_enable_signed_int_t<T> = 0>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svasr_x(detail::sve_ptrue<T>(), lhs, detail::sve_to_unsigned_batch<A, T>(rhs));
        }

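        // Illustrative sketch (not part of the original header): the SVE shift
        // intrinsics take unsigned per-lane amounts, hence the reinterpret
        // helper above for the vector-count overloads. Hypothetical caller-side
        // code, assuming an SVE-enabled build:
        //
        //     batch<int32_t, sve> v = ..., n = ...;
        //     auto a = bitwise_lshift(v, 3, sve {}); // v << 3 (asserts 0 <= 3 < 32)
        //     auto b = bitwise_rshift(v, n, sve {}); // per-lane arithmetic shift
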
        /**************
         * Reductions *
         **************/

        // reduce_add
        template <class A, class T, class V = typename batch<T, A>::value_type, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE V reduce_add(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            // sve integer reduction results are promoted to 64 bits
            return static_cast<V>(svaddv(detail::sve_ptrue<T>(), arg));
        }

        // reduce_max
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE T reduce_max(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svmaxv(detail::sve_ptrue<T>(), arg);
        }

        // reduce_min
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE T reduce_min(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svminv(detail::sve_ptrue<T>(), arg);
        }

        // haddp
        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
        XSIMD_INLINE batch<T, A> haddp(const batch<T, A>* row, requires_arch<sve>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            T sums[size];
            for (std::size_t i = 0; i < size; ++i)
            {
                sums[i] = reduce_add(row[i], sve {});
            }
            return svld1(detail::sve_ptrue<T>(), sums);
        }

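        // Illustrative sketch (not part of the original header): svaddv promotes
        // integer sums to 64 bits, which the static_cast above narrows back to
        // the batch's value_type; haddp reduces one input row per output lane.
        // Hypothetical caller-side code, assuming an SVE-enabled build:
        //
        //     batch<int8_t, sve> v = ...;
        //     int8_t s = reduce_add(v, sve {}); // 64-bit svaddv result, narrowed
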
        /***************
         * Comparisons *
         ***************/

        // eq
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svcmpeq(detail::sve_ptrue<T>(), lhs, rhs);
        }

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            const auto neq_result = sveor_z(detail::sve_ptrue<T>(), lhs, rhs);
            return svnot_z(detail::sve_ptrue<T>(), neq_result);
        }

        // neq
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svcmpne(detail::sve_ptrue<T>(), lhs, rhs);
        }

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return sveor_z(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // lt
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svcmplt(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // le
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svcmple(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // gt
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svcmpgt(detail::sve_ptrue<T>(), lhs, rhs);
        }

        // ge
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svcmpge(detail::sve_ptrue<T>(), lhs, rhs);
        }

        /***************
         * Permutation *
         ***************/

        // rotate_left
        template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> rotate_left(batch<T, A> const& a, requires_arch<sve>) noexcept
        {
            return svext(a, a, N);
        }

        // swizzle (dynamic)
        template <class A, class T, class I>
        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& arg, batch<I, A> indices, requires_arch<sve>) noexcept
        {
            return svtbl(arg, indices);
        }

        template <class A, class T, class I>
        XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self,
                                                       batch<I, A> indices,
                                                       requires_arch<sve>) noexcept
        {
            const auto real = swizzle(self.real(), indices, sve {});
            const auto imag = swizzle(self.imag(), indices, sve {});
            return batch<std::complex<T>>(real, imag);
        }

        // swizzle (static)
        template <class A, class T, class I, I... idx>
        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<I, A, idx...> indices, requires_arch<sve>) noexcept
        {
            static_assert(batch<T, A>::size == sizeof...(idx), "invalid swizzle indices");
            return swizzle(arg, indices.as_batch(), sve {});
        }

        template <class A, class T, class I, I... idx>
        XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& arg,
                                                       batch_constant<I, A, idx...> indices,
                                                       requires_arch<sve>) noexcept
        {
            static_assert(batch<std::complex<T>, A>::size == sizeof...(idx), "invalid swizzle indices");
            return swizzle(arg, indices.as_batch(), sve {});
        }

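        // Illustrative sketch (not part of the original header): svtbl picks
        // arg[indices[i]] for each lane (out-of-range indices produce zero
        // lanes). A hypothetical lane reversal, assuming an SVE-enabled build
        // with four 32-bit lanes (a 128-bit vector):
        //
        //     batch<uint32_t, sve> idx { 3u, 2u, 1u, 0u };
        //     auto reversed = swizzle(v, idx, sve {});
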
        /*************
         * Selection *
         *************/

        // extract_pair
        namespace detail
        {
            template <class A, class T>
            XSIMD_INLINE batch<T, A> sve_extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
            {
                assert(false && "extract_pair out of bounds");
                return batch<T, A> {};
            }

            template <class A, class T, size_t I, size_t... Is>
            XSIMD_INLINE batch<T, A> sve_extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
            {
                if (n == I)
                {
                    return svext(rhs, lhs, I);
                }
                else
                {
                    return sve_extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
                }
            }

            template <class A, class T, size_t... Is>
            XSIMD_INLINE batch<T, A> sve_extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
            {
                if (n == 0)
                {
                    return rhs;
                }
                else
                {
                    return sve_extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
                }
            }
        }

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<sve>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(n < size && "index in bounds");
            return detail::sve_extract_pair_impl(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>());
        }

        // select
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<sve>) noexcept
        {
            return svsel(cond, a, b);
        }

        template <class A, class T, bool... b>
        XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sve>) noexcept
        {
            return select(batch_bool<T, A> { b... }, true_br, false_br, sve {});
        }

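        // Illustrative sketch (not part of the original header): svsel keeps
        // lanes of the first operand where the predicate is true. A hypothetical
        // clamp of negative lanes to zero, assuming an SVE-enabled build:
        //
        //     auto zero = broadcast<sve>(0.0f, sve {});
        //     auto clamped = select(gt(x, zero, sve {}), x, zero, sve {}); // max(x, 0)
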
        // zip_lo
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svzip1(lhs, rhs);
        }

        // zip_hi
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
        {
            return svzip2(lhs, rhs);
        }

        /*****************************
         * Floating-point arithmetic *
         *****************************/

        // rsqrt
        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
        XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svrsqrte(arg);
        }

        // sqrt
        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
        XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svsqrt_x(detail::sve_ptrue<T>(), arg);
        }

        // reciprocal
        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
        XSIMD_INLINE batch<T, A> reciprocal(const batch<T, A>& arg, requires_arch<sve>) noexcept
        {
            return svrecpe(arg);
        }

        /******************************
         * Floating-point conversions *
         ******************************/

        // fast_cast
        namespace detail
        {
            template <class A, class T, detail::enable_sized_integral_t<T, 4> = 0>
            XSIMD_INLINE batch<float, A> fast_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
            {
                return svcvt_f32_x(detail::sve_ptrue<T>(), arg);
            }

            template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
            XSIMD_INLINE batch<double, A> fast_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
            {
                return svcvt_f64_x(detail::sve_ptrue<T>(), arg);
            }

            template <class A>
            XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& arg, batch<int32_t, A> const&, requires_arch<sve>) noexcept
            {
                return svcvt_s32_x(detail::sve_ptrue<float>(), arg);
            }

            template <class A>
            XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& arg, batch<uint32_t, A> const&, requires_arch<sve>) noexcept
            {
                return svcvt_u32_x(detail::sve_ptrue<float>(), arg);
            }

            template <class A>
            XSIMD_INLINE batch<int64_t, A> fast_cast(batch<double, A> const& arg, batch<int64_t, A> const&, requires_arch<sve>) noexcept
            {
                return svcvt_s64_x(detail::sve_ptrue<double>(), arg);
            }

            template <class A>
            XSIMD_INLINE batch<uint64_t, A> fast_cast(batch<double, A> const& arg, batch<uint64_t, A> const&, requires_arch<sve>) noexcept
            {
                return svcvt_u64_x(detail::sve_ptrue<double>(), arg);
            }
        }

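        // Illustrative note (not part of the original header): fast_cast only
        // converts between integers and floating point of the same width
        // (s32/u32 <-> f32, s64/u64 <-> f64); the float-to-int direction
        // truncates toward zero, per the svcvt_* intrinsics.
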
        /*********
         * Miscs *
         *********/

        // set
        template <class A, class T, class... Args>
        XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sve>, Args... args) noexcept
        {
            return detail::sve_vector_type<T> { args... };
        }

        template <class A, class T, class... Args>
        XSIMD_INLINE batch<std::complex<T>, A> set(batch<std::complex<T>, A> const&, requires_arch<sve>,
                                                   Args... args_complex) noexcept
        {
            return batch<std::complex<T>>(detail::sve_vector_type<T> { args_complex.real()... },
                                          detail::sve_vector_type<T> { args_complex.imag()... });
        }

        template <class A, class T, class... Args>
        XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sve>, Args... args) noexcept
        {
            using U = as_unsigned_integer_t<T>;
            const auto values = detail::sve_vector_type<U> { static_cast<U>(args)... };
            const auto zero = broadcast<A, U>(static_cast<U>(0), sve {});
            return svcmpne(detail::sve_ptrue<T>(), values, zero);
        }

        // insert
        namespace detail
        {
            // generate index sequence (iota)
            XSIMD_INLINE svuint8_t sve_iota_impl(index<1>) noexcept { return svindex_u8(0, 1); }
            XSIMD_INLINE svuint16_t sve_iota_impl(index<2>) noexcept { return svindex_u16(0, 1); }
            XSIMD_INLINE svuint32_t sve_iota_impl(index<4>) noexcept { return svindex_u32(0, 1); }
            XSIMD_INLINE svuint64_t sve_iota_impl(index<8>) noexcept { return svindex_u64(0, 1); }

            template <class T, class V = sve_vector_type<as_unsigned_integer_t<T>>>
            XSIMD_INLINE V sve_iota() noexcept { return sve_iota_impl(index<sizeof(T)> {}); }
        } // namespace detail

        template <class A, class T, size_t I, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<sve>) noexcept
        {
            // create a predicate with only the I-th lane activated
            const auto iota = detail::sve_iota<T>();
            const auto index_predicate = svcmpeq(detail::sve_ptrue<T>(), iota, static_cast<as_unsigned_integer_t<T>>(I));
            return svsel(index_predicate, broadcast<A, T>(val, sve {}), arg);
        }

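        // Illustrative sketch (not part of the original header): the predicate
        // built above compares an iota vector {0, 1, 2, ...} with I, so exactly
        // one lane is active in the svsel. Hypothetical caller-side code,
        // assuming an SVE-enabled build:
        //
        //     auto w = insert(v, 42.0f, index<0> {}, sve {}); // w[0] = 42, rest = v
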
        // all
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return detail::sve_pcount<T>(arg) == batch_bool<T, A>::size;
        }

        // any
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svptest_any(arg, arg);
        }

        // bitwise_cast
        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 1> = 0>
        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
        {
            return svreinterpret_u8(arg);
        }

        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 1> = 0>
        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
        {
            return svreinterpret_s8(arg);
        }

        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 2> = 0>
        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
        {
            return svreinterpret_u16(arg);
        }

        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 2> = 0>
        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
        {
            return svreinterpret_s16(arg);
        }

        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 4> = 0>
        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
        {
            return svreinterpret_u32(arg);
        }

        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 4> = 0>
        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
        {
            return svreinterpret_s32(arg);
        }

        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 8> = 0>
        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
        {
            return svreinterpret_u64(arg);
        }

        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 8> = 0>
        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
        {
            return svreinterpret_s64(arg);
        }

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
        {
            return svreinterpret_f32(arg);
        }

        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
        {
            return svreinterpret_f64(arg);
        }

        // batch_bool_cast
        template <class A, class T_out, class T_in, detail::sve_enable_all_t<T_in> = 0>
        XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<sve>) noexcept
        {
            return arg.data;
        }

        // from_bool
        template <class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return select(arg, batch<T, A>(1), batch<T, A>(0));
        }

        // slide_left
        namespace detail
        {
            template <size_t N>
            struct sve_slider_left
            {
                template <class A, class T>
                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
                {
                    using u8_vector = batch<uint8_t, A>;
                    const auto left = svdup_n_u8(0);
                    const auto right = bitwise_cast(arg, u8_vector {}, sve {}).data;
                    const u8_vector result(svext(left, right, u8_vector::size - N));
                    return bitwise_cast(result, batch<T, A> {}, sve {});
                }
            };

            template <>
            struct sve_slider_left<0>
            {
                template <class A, class T>
                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
                {
                    return arg;
                }
            };
        } // namespace detail

        template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return detail::sve_slider_left<N>()(arg);
        }

        // slide_right
        namespace detail
        {
            template <size_t N>
            struct sve_slider_right
            {
                template <class A, class T>
                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
                {
                    using u8_vector = batch<uint8_t, A>;
                    const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data;
                    const auto right = svdup_n_u8(0);
                    const u8_vector result(svext(left, right, N));
                    return bitwise_cast(result, batch<T, A> {}, sve {});
                }
            };

            template <>
            struct sve_slider_right<batch<uint8_t, sve>::size>
            {
                template <class A, class T>
                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const&) noexcept
                {
                    return batch<T, A> {};
                }
            };
        } // namespace detail

        template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return detail::sve_slider_right<N>()(arg);
        }

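        // Illustrative note (not part of the original header): both sliders
        // reinterpret the batch as bytes and shuffle with svext, shifting in
        // zeros, so N counts bytes rather than lanes; slide_left<0> and a
        // slide_right of the full register width are handled by the
        // specializations above.
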
        // isnan
        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
        XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return !(arg == arg);
        }

        // nearbyint
        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
        XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<sve>) noexcept
        {
            return svrintx_x(detail::sve_ptrue<T>(), arg);
        }

        // nearbyint_as_int
        template <class A>
        XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& arg, requires_arch<sve>) noexcept
        {
            const auto nearest = svrintx_x(detail::sve_ptrue<float>(), arg);
            return svcvt_s32_x(detail::sve_ptrue<float>(), nearest);
        }

        template <class A>
        XSIMD_INLINE batch<int64_t, A> nearbyint_as_int(batch<double, A> const& arg, requires_arch<sve>) noexcept
        {
            const auto nearest = svrintx_x(detail::sve_ptrue<double>(), arg);
            return svcvt_s64_x(detail::sve_ptrue<double>(), nearest);
        }

        // ldexp
        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
        XSIMD_INLINE batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& exp, requires_arch<sve>) noexcept
        {
            return svscale_x(detail::sve_ptrue<T>(), x, exp);
        }

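        // Illustrative note (not part of the original header): svscale computes
        // x * 2^exp per lane, matching the usual ldexp identity, e.g.
        // ldexp(3.0, 4) == 48.0 in every lane.
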
    } // namespace kernel
} // namespace xsimd

#endif