Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 09:11:35

0001 /***************************************************************************
0002 
0003  * Copyright (c) Rivos Inc.                                                 *
0004  *                                                                          *
0005  * Distributed under the terms of the BSD 3-Clause License.                 *
0006  *                                                                          *
0007  * The full license is in the file LICENSE, distributed with this software. *
0008  ****************************************************************************/
0009 
0010 #ifndef XSIMD_RVV_HPP
0011 #define XSIMD_RVV_HPP
0012 
0013 #include <complex>
0014 #include <type_traits>
0015 #include <utility>
0016 
0017 #include "../types/xsimd_rvv_register.hpp"
0018 #include "xsimd_constants.hpp"
0019 
// This set of macros allows the synthesis of identifiers using a template and
// variable macro arguments.  A single template can then be used by multiple
// macros, or multiple instances of a macro to define the same logic for
// different data types.
//
// First some logic to paste text together...
//
// (JOIN is two levels deep so that macro arguments are fully expanded
// before `##` pastes the resulting tokens together)
#define XSIMD_RVV_JOIN_(x, y) x##y
#define XSIMD_RVV_JOIN(x, y) XSIMD_RVV_JOIN_(x, y)
// Each PREFIX_x selector inserts either the (T)ype parameter, the (S)ize
// parameter, or the literal `m1` LMUL suffix at the current position of the
// identifier being synthesised; plain PREFIX inserts nothing extra.
#define XSIMD_RVV_PREFIX_T(T, S, then) XSIMD_RVV_JOIN(T, then)
#define XSIMD_RVV_PREFIX_S(T, S, then) XSIMD_RVV_JOIN(S, then)
#define XSIMD_RVV_PREFIX_M(T, S, then) XSIMD_RVV_JOIN(m1, then)
#define XSIMD_RVV_PREFIX(T, S, then) then
//
// XSIMD_RVV_IDENTIFIER accepts type and size parameters, and a template for
// the identifier.  The template is a comma-separated list of alternating
// literal and parameter segments.  Each parameter is appended to XSIMD_RVV_PREFIX to
// form a new macro name which decides which parameter should be inserted.
// Then a literal segment is inserted after that.  Empty literals are used to
// join two or more variables together.
//
// The hand-unrolled chain below supports templates of up to ten
// literal/parameter pairs; IDENTIFIER9 terminates the chain by keeping only
// its literal segment.
#define XSIMD_RVV_IDENTIFIER9(T, S, t, ...) t
#define XSIMD_RVV_IDENTIFIER8(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER9(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER7(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER8(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER6(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER7(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER5(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER6(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER4(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER5(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER3(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER4(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER2(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER3(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER1(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER2(T, S, __VA_ARGS__)))
#define XSIMD_RVV_IDENTIFIER0(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER1(T, S, __VA_ARGS__)))
//
// UNBRACKET and REPARSE force the preprocessor to handle expansion in a
// specific order.  XSIMD_RVV_UNBRACKET strips the parentheses from the template
// (which were necessary to keep the template as a single, named macro
// parameter up to this point).  XSIMD_RVV_ARG_LIST then forms the new parameter list
// to pass to XSIMD_RVV_IDENTIFIER0, with trailing commas to ensure the unrolled
// XSIMD_RVV_IDENTIFIER loop runs to completion adding empty strings.
//
// However XSIMD_RVV_IDENTIFIER0 is not expanded immediately because it does not
// match a function-like macro in this pass.  XSIMD_RVV_REPARSE forces another
// evaluation after the expansion of XSIMD_RVV_ARG_LIST, where XSIMD_RVV_IDENTIFIER0 will
// now match as a function-like macro, and the cycle of substitutions and
// insertions can begin.
//
#define XSIMD_RVV_REPARSE(v) (v)
#define XSIMD_RVV_UNBRACKET(...) __VA_ARGS__
#define XSIMD_RVV_ARG_LIST(T, S, name) (T, S, XSIMD_RVV_UNBRACKET name, , , , , , , , , , , , , , , , , , , , , )
#define XSIMD_RVV_IDENTIFIER(T, S, name) XSIMD_RVV_REPARSE(XSIMD_RVV_IDENTIFIER0 XSIMD_RVV_ARG_LIST(T, S, name))
//
// To avoid comma-counting bugs, replace the variable references with macros
// which include enough commas to keep proper phase, and then use no commas at
// all in the templates.
//
// e.g. the template (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM) synthesises
// identifiers of the shape __riscv_vle<S>_v_<T><S>m1.
#define XSIMD_RVV_T , _T,
#define XSIMD_RVV_S , _S,
#define XSIMD_RVV_M , _M,
#define XSIMD_RVV_TSM XSIMD_RVV_T XSIMD_RVV_S XSIMD_RVV_M
0078 
// XSIMD_RVV_OVERLOAD, below, expands to a head section, a number of body sections
// (depending on which types are supported), and a tail section.  Different
// variants of these sections are implemented with different suffixes on the
// three macro names XSIMD_RVV_WRAPPER_HEAD, XSIMD_RVV_WRAPPER, and XSIMD_RVV_WRAPPER_TAIL and
// specified as an argument to XSIMD_RVV_OVERLOAD (the empty string is the default,
// but still needs an extra comma to hold its place).
//
// The default XSIMD_RVV_WRAPPER_HEAD provides a class containing convenient names
// for the function signature argument(s) to XSIMD_RVV_OVERLOAD.  That signature can
// also reference the template argument T, because it's a text substitution
// into the template.
#define XSIMD_RVV_WRAPPER_HEAD(NAME, SIGNATURE, ...)                      \
    namespace NAME##_cruft                                                \
    {                                                                     \
        template <class T>                                                \
        struct ctx                                                        \
        {                                                                 \
            static constexpr size_t width = XSIMD_RVV_BITS;               \
            static constexpr size_t vl = width / (sizeof(T) * 8);         \
            using vec = rvv_reg_t<T, width>;                              \
            using uvec = rvv_reg_t<as_unsigned_relaxed_t<T>, width>;      \
            using svec = rvv_reg_t<as_signed_relaxed_t<T>, width>;        \
            using fvec = rvv_reg_t<as_float_relaxed_t<T>, width>;         \
            using bvec = rvv_bool_t<T, width>;                            \
            using scalar_vec = rvv_reg_t<T, types::detail::rvv_width_m1>; \
            using wide_vec = rvv_reg_t<T, width * 2>;                     \
            using narrow_vec = rvv_reg_t<T, width / 2>;                   \
            using type = SIGNATURE;                                       \
        };                                                                \
        template <class T>                                                \
        using sig_t = typename ctx<T>::type;                              \
        template <class K, class T>                                       \
        struct impl                                                       \
        {                                                                 \
            void operator()() const noexcept {};                          \
        };                                                                \
        template <class K>                                                \
        using impl_t = impl<K, sig_t<K>>;

// NOTE: the primary `impl` template above is a harmless no-op; it is the
// fallback the tail section inherits for any key type that no body section
// specialised.
//
// All head variants currently share one implementation; the distinct names
// exist only so that each XSIMD_RVV_OVERLOAD variant suffix resolves a
// matching head, body, and tail macro.
#define XSIMD_RVV_WRAPPER_HEAD_NOVL(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__)
#define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__)
#define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST_CUSTOM_ARGS(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__)
#define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST_CUSTOM_ARGS_NOVL(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__)
0122 
// The body of the wrapper defines a functor (because partial specialisation of
// functions is not legal) which forwards its arguments to the named intrinsic
// with a few manipulations.  In general, vector types are handled as
// rvv_reg_t<> and rely on the conversion operators in that class for
// compatibility with the intrinsics.
//
// The function signature is not mentioned here.  Instead it's provided in the
// tail code as the template argument for which this is a specialisation, which
// overcomes the problem of converting a function signature type to an argument
// list to pass to another function.
//
// Default variant: forwards all arguments unchanged and appends the vl
// (active vector length) operand that most RVV intrinsics expect last.
#define XSIMD_RVV_WRAPPER(KEY, CALLEE, ...)                   \
    template <class Ret, class... Args>                       \
    struct impl<KEY, Ret(Args...)>                            \
    {                                                         \
        using ctx = ctx<KEY>;                                 \
        constexpr Ret operator()(Args... args) const noexcept \
        {                                                     \
            return CALLEE(args..., ctx::vl);                  \
        };                                                    \
    };
// _NOVL: for intrinsics which take no trailing vl operand.
#define XSIMD_RVV_WRAPPER_NOVL(KEY, CALLEE, ...)              \
    template <class Ret, class... Args>                       \
    struct impl<KEY, Ret(Args...)>                            \
    {                                                         \
        constexpr Ret operator()(Args... args) const noexcept \
        {                                                     \
            return CALLEE(args...);                           \
        };                                                    \
    };
// _DROP_1ST: discards the first argument (a dummy passed only so the
// overload can be chosen) before calling the intrinsic, then appends vl.
#define XSIMD_RVV_WRAPPER_DROP_1ST(KEY, CALLEE, ...)                 \
    template <class Ret, class First, class... Args>                 \
    struct impl<KEY, Ret(First, Args...)>                            \
    {                                                                \
        using ctx = ctx<KEY>;                                        \
        constexpr Ret operator()(First, Args... args) const noexcept \
        {                                                            \
            return CALLEE(args..., ctx::vl);                         \
        };                                                           \
    };
// _DROP_1ST_CUSTOM_ARGS: drops the first (dummy) argument and passes the
// caller-supplied __VA_ARGS__ expression (which may itself reference
// args...) to the intrinsic, followed by vl.
#define XSIMD_RVV_WRAPPER_DROP_1ST_CUSTOM_ARGS(KEY, CALLEE, SIGNATURE, ...) \
    template <class Ret, class First, class... Args>                        \
    struct impl<KEY, Ret(First, Args...)>                                   \
    {                                                                       \
        using ctx = ctx<KEY>;                                               \
        constexpr Ret operator()(First, Args... args) const noexcept        \
        {                                                                   \
            return CALLEE(__VA_ARGS__, ctx::vl);                            \
        };                                                                  \
    };
// _DROP_1ST_CUSTOM_ARGS_NOVL: as above, but without the vl operand.
#define XSIMD_RVV_WRAPPER_DROP_1ST_CUSTOM_ARGS_NOVL(KEY, CALLEE, SIGNATURE, ...) \
    template <class Ret, class First, class... Args>                             \
    struct impl<KEY, Ret(First, Args...)>                                        \
    {                                                                            \
        constexpr Ret operator()(First, Args... args) const noexcept             \
        {                                                                        \
            return CALLEE(__VA_ARGS__);                                          \
        };                                                                       \
    };
0182 
// This part folds all the above templates down into a single functor instance
// with all the different function signatures available under the one name.
// Not all of the base classes necessarily contain useful code, but there's a
// default implementation so that filtering them out isn't really necessary.
// The using-declarations pull every base class's operator() into a single
// overload set on the resulting (anonymous-type) constexpr object NAME.
#define XSIMD_RVV_WRAPPER_TAIL(NAME, ...)                     \
    } /* namespace NAME##_cruft */                            \
    static constexpr struct : NAME##_cruft::impl_t<int8_t>,   \
                              NAME##_cruft::impl_t<uint8_t>,  \
                              NAME##_cruft::impl_t<int16_t>,  \
                              NAME##_cruft::impl_t<uint16_t>, \
                              NAME##_cruft::impl_t<int32_t>,  \
                              NAME##_cruft::impl_t<uint32_t>, \
                              NAME##_cruft::impl_t<int64_t>,  \
                              NAME##_cruft::impl_t<uint64_t>, \
                              NAME##_cruft::impl_t<float>,    \
                              NAME##_cruft::impl_t<double>    \
    {                                                         \
        using NAME##_cruft::impl_t<int8_t>::operator();       \
        using NAME##_cruft::impl_t<uint8_t>::operator();      \
        using NAME##_cruft::impl_t<int16_t>::operator();      \
        using NAME##_cruft::impl_t<uint16_t>::operator();     \
        using NAME##_cruft::impl_t<int32_t>::operator();      \
        using NAME##_cruft::impl_t<uint32_t>::operator();     \
        using NAME##_cruft::impl_t<int64_t>::operator();      \
        using NAME##_cruft::impl_t<uint64_t>::operator();     \
        using NAME##_cruft::impl_t<float>::operator();        \
        using NAME##_cruft::impl_t<double>::operator();       \
    } NAME {};
// Tail variants are all identical; the suffixes exist only so that all three
// sections of a given XSIMD_RVV_OVERLOAD variant resolve consistently.
#define XSIMD_RVV_WRAPPER_TAIL_NOVL(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__)
#define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__)
#define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST_CUSTOM_ARGS(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__)
#define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST_CUSTOM_ARGS_NOVL(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__)
0215 
// clang-format off

// Dispatch the head section to the requested variant suffix.
#define XSIMD_RVV_OVERLOAD_head(my_name, variant, ...) \
    XSIMD_RVV_WRAPPER_HEAD##variant(my_name, __VA_ARGS__)
// Stamp out one wrapper per element width for signed ints, unsigned ints,
// and floats respectively.  XSIMD_RVV_IDENTIFIER synthesises the per-width
// intrinsic name from the bracketed `name` template.
#define XSIMD_RVV_OVERLOAD_i(name, variant, ...)                                        \
    XSIMD_RVV_WRAPPER##variant(int8_t, XSIMD_RVV_IDENTIFIER(i, 8, name), __VA_ARGS__)   \
    XSIMD_RVV_WRAPPER##variant(int16_t, XSIMD_RVV_IDENTIFIER(i, 16, name), __VA_ARGS__) \
    XSIMD_RVV_WRAPPER##variant(int32_t, XSIMD_RVV_IDENTIFIER(i, 32, name), __VA_ARGS__) \
    XSIMD_RVV_WRAPPER##variant(int64_t, XSIMD_RVV_IDENTIFIER(i, 64, name), __VA_ARGS__)
#define XSIMD_RVV_OVERLOAD_u(name, variant, ...)                                         \
    XSIMD_RVV_WRAPPER##variant(uint8_t, XSIMD_RVV_IDENTIFIER(u, 8, name), __VA_ARGS__)   \
    XSIMD_RVV_WRAPPER##variant(uint16_t, XSIMD_RVV_IDENTIFIER(u, 16, name), __VA_ARGS__) \
    XSIMD_RVV_WRAPPER##variant(uint32_t, XSIMD_RVV_IDENTIFIER(u, 32, name), __VA_ARGS__) \
    XSIMD_RVV_WRAPPER##variant(uint64_t, XSIMD_RVV_IDENTIFIER(u, 64, name), __VA_ARGS__)
#define XSIMD_RVV_OVERLOAD_f(name, variant, ...)                                      \
    XSIMD_RVV_WRAPPER##variant(float, XSIMD_RVV_IDENTIFIER(f, 32, name), __VA_ARGS__) \
    XSIMD_RVV_WRAPPER##variant(double, XSIMD_RVV_IDENTIFIER(f, 64, name), __VA_ARGS__)
// Dispatch the tail section to the requested variant suffix.
#define XSIMD_RVV_OVERLOAD_tail(my_name, variant, ...) \
    XSIMD_RVV_WRAPPER_TAIL##variant(my_name, __VA_ARGS__)
0235 
// Use these to create function (actually functor, sorry) wrappers overloaded
// for whichever types are supported.  Being functors means they can't take a
// template argument (until C++14), so if a type can't be deduced then a junk
// value can be passed as the first argument and discarded by using the
// _DROP_1ST variant, instead.
//
// The wrappers use the rvv_reg_t<> types for template accessibility, and
// because some types (eg., vfloat64mf2_t) don't exist and need extra
// abstraction to emulate.
//
// In many cases the intrinsic names are different for signed, unsigned, or
// float variants, the macros OVERLOAD2 and OVERLOAD3 (depending on whether or
// not a float variant exists) take multiple intrinsic names and bring them
// together under a single overloaded identifier where they can be used within
// templates.
//
#define XSIMD_RVV_OVERLOAD2(my_name, name_i, name_u, variant, ...) \
    XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__)         \
    XSIMD_RVV_OVERLOAD_i(name_i, variant, __VA_ARGS__)             \
    XSIMD_RVV_OVERLOAD_u(name_u, variant, __VA_ARGS__)             \
    XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__)

#define XSIMD_RVV_OVERLOAD3(my_name, name_i, name_u, name_f, variant, ...) \
    XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__)                 \
    XSIMD_RVV_OVERLOAD_i(name_i, variant, __VA_ARGS__)                     \
    XSIMD_RVV_OVERLOAD_u(name_u, variant, __VA_ARGS__)                     \
    XSIMD_RVV_OVERLOAD_f(name_f, variant, __VA_ARGS__)                     \
    XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__)

// Shorthands for the common case where a single intrinsic name covers every
// lane type (OVERLOAD), or both integer signednesses (OVERLOAD_INTS).
#define XSIMD_RVV_OVERLOAD(my_name, name, ...) XSIMD_RVV_OVERLOAD3(my_name, name, name, name, __VA_ARGS__)
#define XSIMD_RVV_OVERLOAD_INTS(my_name, name, ...) XSIMD_RVV_OVERLOAD2(my_name, name, name, __VA_ARGS__)
0267 
// Single-category variants: overload sets covering only signed integers,
// only unsigned integers, or only floating-point element types.
#define XSIMD_RVV_OVERLOAD_SINTS(my_name, name, variant, ...) \
    XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__)    \
    XSIMD_RVV_OVERLOAD_i(name, variant, __VA_ARGS__)          \
    XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__)

#define XSIMD_RVV_OVERLOAD_UINTS(my_name, name, variant, ...) \
    XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__)    \
    XSIMD_RVV_OVERLOAD_u(name, variant, __VA_ARGS__)          \
    XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__)

#define XSIMD_RVV_OVERLOAD_FLOATS(my_name, name, variant, ...) \
    XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__)     \
    XSIMD_RVV_OVERLOAD_f(name, variant, __VA_ARGS__)           \
    XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__)

// clang-format on
0284 
namespace xsimd
{
    // Forward declaration only; the full definition lives elsewhere in xsimd.
    template <typename T, class A, T... Values>
    struct batch_constant;

    namespace kernel
    {
        namespace detail
        {
            // Local shorthands for the register-abstraction types, defaulting
            // the register width to the configured XSIMD_RVV_BITS.
            template <class T>
            using rvv_fix_char_t = types::detail::rvv_fix_char_t<T>;
            template <class T, size_t Width = XSIMD_RVV_BITS>
            using rvv_reg_t = types::detail::rvv_reg_t<T, Width>;
            template <class T, size_t Width = XSIMD_RVV_BITS>
            using rvv_bool_t = types::detail::rvv_bool_t<T, Width>;
0300 
0301             template <size_t>
0302             struct as_signed_relaxed;
0303             template <>
0304             struct as_signed_relaxed<1>
0305             {
0306                 using type = int8_t;
0307             };
0308             template <>
0309             struct as_signed_relaxed<2>
0310             {
0311                 using type = int16_t;
0312             };
0313             template <>
0314             struct as_signed_relaxed<4>
0315             {
0316                 using type = int32_t;
0317             };
0318             template <>
0319             struct as_signed_relaxed<8>
0320             {
0321                 using type = int64_t;
0322             };
0323             template <class T>
0324             using as_signed_relaxed_t = typename as_signed_relaxed<sizeof(T)>::type;
0325             template <size_t>
0326             struct as_unsigned_relaxed;
0327             template <>
0328             struct as_unsigned_relaxed<1>
0329             {
0330                 using type = uint8_t;
0331             };
0332             template <>
0333             struct as_unsigned_relaxed<2>
0334             {
0335                 using type = uint16_t;
0336             };
0337             template <>
0338             struct as_unsigned_relaxed<4>
0339             {
0340                 using type = uint32_t;
0341             };
0342             template <>
0343             struct as_unsigned_relaxed<8>
0344             {
0345                 using type = uint64_t;
0346             };
0347             template <class T>
0348             using as_unsigned_relaxed_t = typename as_unsigned_relaxed<sizeof(T)>::type;
0349             template <size_t>
0350             struct as_float_relaxed;
0351             template <>
0352             struct as_float_relaxed<1>
0353             {
0354                 using type = int8_t;
0355             };
0356             template <>
0357             struct as_float_relaxed<2>
0358             {
0359                 using type = int16_t;
0360             };
0361             template <>
0362             struct as_float_relaxed<4>
0363             {
0364                 using type = float;
0365             };
0366             template <>
0367             struct as_float_relaxed<8>
0368             {
0369                 using type = double;
0370             };
0371             template <class T>
0372             using as_float_relaxed_t = typename as_float_relaxed<sizeof(T)>::type;
0373 
            // Bit-cast a vector register to the same-width register type with
            // element type T (raw bits preserved; no value conversion).
            template <class T, class U>
            rvv_reg_t<T, U::width> rvvreinterpret(U const& arg) noexcept
            {
                return rvv_reg_t<T, U::width>(arg, types::detail::XSIMD_RVV_BITCAST);
            }
            // batch<U, A> overload: unwrap the underlying register value and
            // forward to the register-level bit-cast above.
            template <class T, class A, class U>
            rvv_reg_t<T, A::width> rvvreinterpret(batch<U, A> const& arg) noexcept
            {
                typename batch<U, A>::register_type r = arg;
                return rvvreinterpret<T>(r);
            }
0385 
            // Reinterpret a batch as its same-size unsigned-integer
            // counterpart (bit-cast, not a numeric conversion).
            template <class A, class T, class U = as_unsigned_integer_t<T>>
            XSIMD_INLINE batch<U, A> rvv_to_unsigned_batch(batch<T, A> const& arg) noexcept
            {
                return rvvreinterpret<U>(arg.data);
            }
0391 
            // rvvid(T{}) -> vector of lane indices 0, 1, 2, ... (__riscv_vid);
            // the T argument is a dummy used only to pick the overload.
            XSIMD_RVV_OVERLOAD(rvvid,
                               (__riscv_vid_v_u XSIMD_RVV_S XSIMD_RVV_M), _DROP_1ST, uvec(T))

            // Broadcast a scalar into every lane (__riscv_vmv_v_x for
            // integers, __riscv_vfmv_v_f for floats).
            XSIMD_RVV_OVERLOAD3(rvvmv_splat,
                                (__riscv_vmv_v_x_ XSIMD_RVV_TSM),
                                (__riscv_vmv_v_x_ XSIMD_RVV_TSM),
                                (__riscv_vfmv_v_f_ XSIMD_RVV_TSM), , vec(T))

            // Extract lane 0 as a scalar; these intrinsics take no vl operand.
            XSIMD_RVV_OVERLOAD3(rvvmv_lane0,
                                (__riscv_vmv_x),
                                (__riscv_vmv_x),
                                (__riscv_vfmv_f), _NOVL, T(vec))

            // Lane-wise select between two vectors under a predicate.
            XSIMD_RVV_OVERLOAD(rvvmerge, (__riscv_vmerge), , vec(vec, vec, bvec))
            // As above, but the selected-when-true operand is a scalar.
            XSIMD_RVV_OVERLOAD3(rvvmerge_splat,
                                (__riscv_vmerge),
                                (__riscv_vmerge),
                                (__riscv_vfmerge), , vec(vec, T, bvec))
0410 
0411             // count active lanes in a predicate
0412             XSIMD_RVV_OVERLOAD(rvvcpop, (__riscv_vcpop),
0413                                , size_t(bvec));
0414 
            // Build a predicate register from a bitmask (one bit per lane).
            // NOTE(review): both helpers just forward to the rvv_bool_t
            // constructor; pmask8 merely constrains the argument to 8 bits.
            template <class T, size_t Width>
            XSIMD_INLINE rvv_bool_t<T, Width> pmask8(uint8_t mask) noexcept
            {
                return rvv_bool_t<T, Width>(mask);
            }
            template <class T, size_t Width>
            XSIMD_INLINE rvv_bool_t<T, Width> pmask(uint64_t mask) noexcept
            {
                return rvv_bool_t<T, Width>(mask);
            }

            // Returns the lane-index vector transformed to
            // { (i << shift) + offset } per lane; a negative shift shifts
            // right instead.  shift and offset are compile-time template
            // arguments, so only one branch survives optimisation.
            template <class A, class T, size_t offset = 0, int shift = 0>
            XSIMD_INLINE rvv_reg_t<T, A::width> vindex() noexcept
            {
                auto index = rvvid(T {});
                if (shift < 0)
                    index = __riscv_vsrl(index, -shift, batch<T, A>::size);
                else
                    index = __riscv_vsll(index, shift, batch<T, A>::size);
                return __riscv_vadd(index, T(offset), batch<T, A>::size);
            }
0436 
            // enable for signed integers
            template <class T>
            using rvv_enable_signed_int_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, int>::type;

            // enable for unsigned integers
            template <class T>
            using rvv_enable_unsigned_int_t = typename std::enable_if<std::is_integral<T>::value && std::is_unsigned<T>::value, int>::type;

            // enable for floating points
            template <class T>
            using rvv_enable_floating_point_t = typename std::enable_if<std::is_floating_point<T>::value, int>::type;

            // enable for signed integers or floating points
            // (a single std::is_signed test suffices because is_signed is
            // also true for floating-point types)
            template <class T>
            using rvv_enable_signed_int_or_floating_point_t = typename std::enable_if<std::is_signed<T>::value, int>::type;

            // enable for all RVV supported types
            template <class T>
            using rvv_enable_all_t = typename std::enable_if<std::is_arithmetic<T>::value, int>::type;
0456         } // namespace detail
0457 
        /********************
         * Scalar to vector *
         ********************/

        namespace detail
        {
            // Splat a scalar into every lane of a register of the given width.
            template <class T, size_t Width>
            XSIMD_INLINE detail::rvv_reg_t<T, Width> broadcast(T arg) noexcept
            {
                // A bit of a dance, here, because rvvmv_splat has no other
                // argument from which to deduce type, and T=char is not
                // supported.
                detail::rvv_fix_char_t<T> arg_not_char(arg);
                const auto splat = detail::rvvmv_splat(arg_not_char);
                // Bit-cast the splatted register back to element type T.
                return detail::rvv_reg_t<T, Width>(splat.get_bytes(), types::detail::XSIMD_RVV_BITCAST);
            }
        }

        // broadcast
        template <class A, class T>
        XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<rvv>) noexcept
        {
            return detail::broadcast<T, A::width>(arg);
        }
0482 
        /*********
         * Load *
         *********/

        namespace detail
        {
            // Unit-stride load and store wrappers (__riscv_vle / __riscv_vse).
            XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*))
            XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec))
        }

        // Unit-stride load; rvv_fix_char_t maps char to an element type the
        // intrinsics accept.
        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<rvv>) noexcept
        {
            return detail::rvvle(reinterpret_cast<detail::rvv_fix_char_t<T> const*>(src));
        }

        // The unaligned path simply delegates to the aligned one — the same
        // load instruction serves both cases.
        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
        XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<rvv>) noexcept
        {
            return load_aligned<A>(src, convert<T>(), rvv {});
        }
0504 
        // load_complex
        namespace detail
        {
            // rvvabut: concatenate lo and hi into one double-width register.
            // For widths of at least one whole register (m1 and up) the two
            // halves are inserted into a register group with __riscv_vset.
            template <class T, size_t W, typename std::enable_if<W >= types::detail::rvv_width_m1, int>::type = 0>
            XSIMD_INLINE rvv_reg_t<T, W * 2> rvvabut(rvv_reg_t<T, W> const& lo, rvv_reg_t<T, W> const& hi) noexcept
            {
                // tmp starts indeterminate; both halves are overwritten by
                // the two vset calls below.
                typename rvv_reg_t<T, W * 2>::register_type tmp;
                tmp = __riscv_vset(tmp, 0, lo);
                return __riscv_vset(tmp, 1, hi);
            }

            // Sub-register widths: slide hi up into the top half of lo.
            template <class T, size_t W, typename std::enable_if<W<types::detail::rvv_width_m1, int>::type = 0> XSIMD_INLINE rvv_reg_t<T, W * 2> rvvabut(rvv_reg_t<T, W> const& lo, rvv_reg_t<T, W> const& hi) noexcept
            {
                return __riscv_vslideup(lo, hi, lo.vl, lo.vl * 2);
            }

            // Extract half 0 / half 1 of a register group; the trailing 0/1
            // is the custom index argument appended after args... .
            XSIMD_RVV_OVERLOAD(rvvget_lo_, (__riscv_vget_ XSIMD_RVV_TSM), _DROP_1ST_CUSTOM_ARGS_NOVL, vec(T, wide_vec), args..., 0)
            XSIMD_RVV_OVERLOAD(rvvget_hi_, (__riscv_vget_ XSIMD_RVV_TSM), _DROP_1ST_CUSTOM_ARGS_NOVL, vec(T, wide_vec), args..., 1)

            // Lower/upper halves of a double-width value, register-group case.
            template <class T, size_t W, typename std::enable_if<W >= types::detail::rvv_width_m1, int>::type = 0>
            rvv_reg_t<T, W> rvvget_lo(rvv_reg_t<T, W * 2> const& vv) noexcept
            {
                typename rvv_reg_t<T, W>::register_type tmp = rvvget_lo_(T {}, vv);
                return tmp;
            }
            template <class T, size_t W, typename std::enable_if<W >= types::detail::rvv_width_m1, int>::type = 0>
            rvv_reg_t<T, W> rvvget_hi(rvv_reg_t<T, W * 2> const& vv) noexcept
            {
                typename rvv_reg_t<T, W>::register_type tmp = rvvget_hi_(T {}, vv);
                return tmp;
            }
            // Sub-register case: the lower half is just a narrowing view; the
            // upper half is slid down into position.
            template <class T, size_t W, typename std::enable_if<W<types::detail::rvv_width_m1, int>::type = 0> rvv_reg_t<T, W> rvvget_lo(rvv_reg_t<T, W * 2> const& vv) noexcept
            {
                typename rvv_reg_t<T, W>::register_type tmp = vv;
                return tmp;
            }
            template <class T, size_t W, typename std::enable_if<W<types::detail::rvv_width_m1, int>::type = 0> rvv_reg_t<T, W> rvvget_hi(rvv_reg_t<T, W * 2> const& vv) noexcept
            {
                return __riscv_vslidedown(vv, vv.vl / 2, vv.vl);
            }

            // De-interleave (real, imag) pairs spread across lo and hi:
            // gather the even lanes (indices 0,2,4,...) into the first half
            // of the result and the odd lanes (1,3,5,...) into the second,
            // then split the halves into the real and imaginary batches.
            template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
            XSIMD_INLINE batch<std::complex<T>, A> load_complex(batch<T, A> const& lo, batch<T, A> const& hi, requires_arch<rvv>) noexcept
            {
                const auto real_index = vindex<A, as_unsigned_integer_t<T>, 0, 1>();
                const auto imag_index = vindex<A, as_unsigned_integer_t<T>, 1, 1>();
                const auto index = rvvabut<as_unsigned_integer_t<T>, A::width>(real_index, imag_index);
                const auto input = rvvabut<T, A::width>(lo.data, hi.data);
                const rvv_reg_t<T, A::width * 2> result = __riscv_vrgather(input, index, index.vl);

                return { rvvget_lo<T, A::width>(result), rvvget_hi<T, A::width>(result) };
            }
        }
0558 
        /*********
         * Store *
         *********/

        // Unit-stride store; rvv_fix_char_t maps char to an element type the
        // intrinsics accept.
        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
        XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<rvv>) noexcept
        {
            detail::rvvse(reinterpret_cast<detail::rvv_fix_char_t<T>*>(dst), src);
        }

        // The unaligned path simply delegates to the aligned one — the same
        // store instruction serves both cases.
        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
        XSIMD_INLINE void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<rvv>) noexcept
        {
            store_aligned<A>(dst, src, rvv {});
        }
0574 
0575         /******************
0576          * scatter/gather *
0577          ******************/
0578 
        namespace detail
        {
            // scatter/gather are supported only for 32- and 64-bit elements
            // whose index type has the same width as the element type.
            template <class T, class U>
            using rvv_enable_sg_t = typename std::enable_if<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>::type;
            // Ordered indexed (byte-offset) load and store.
            XSIMD_RVV_OVERLOAD(rvvloxei, (__riscv_vloxei XSIMD_RVV_S), , vec(T const*, uvec))
            XSIMD_RVV_OVERLOAD(rvvsoxei, (__riscv_vsoxei XSIMD_RVV_S), , void(T*, uvec, vec))
            // Vector * scalar multiply; used to scale indices to byte offsets.
            XSIMD_RVV_OVERLOAD3(rvvmul_splat,
                                (__riscv_vmul),
                                (__riscv_vmul),
                                (__riscv_vfmul), , vec(vec, T))
        }
0590 
0591         // scatter
0592         template <class A, class T, class U, detail::rvv_enable_sg_t<T, U> = 0>
0593         XSIMD_INLINE void scatter(batch<T, A> const& vals, T* dst, batch<U, A> const& index, kernel::requires_arch<rvv>) noexcept
0594         {
0595             using UU = as_unsigned_integer_t<U>;
0596             const auto uindex = detail::rvv_to_unsigned_batch(index);
0597             auto* base = reinterpret_cast<detail::rvv_fix_char_t<T>*>(dst);
0598             // or rvvsuxei
0599             const auto bi = detail::rvvmul_splat(uindex, sizeof(T));
0600             detail::rvvsoxei(base, bi, vals);
0601         }
0602 
0603         // gather
0604         template <class A, class T, class U, detail::rvv_enable_sg_t<T, U> = 0>
0605         XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index, kernel::requires_arch<rvv>) noexcept
0606         {
0607             using UU = as_unsigned_integer_t<U>;
0608             const auto uindex = detail::rvv_to_unsigned_batch(index);
0609             auto const* base = reinterpret_cast<detail::rvv_fix_char_t<T> const*>(src);
0610             // or rvvluxei
0611             const auto bi = detail::rvvmul_splat(uindex, sizeof(T));
0612             return detail::rvvloxei(base, bi);
0613         }
0614 
0615         /**************
0616          * Arithmetic *
0617          **************/
0618 
        namespace detail
        {
            // Arithmetic primitives.  XSIMD_RVV_OVERLOAD3 takes the intrinsic
            // to use for (signed, unsigned, float); OVERLOAD2 covers the two
            // integer flavours only; OVERLOAD_FLOATS is float-only.
            XSIMD_RVV_OVERLOAD3(rvvadd,
                                (__riscv_vadd),
                                (__riscv_vadd),
                                (__riscv_vfadd), , vec(vec, vec))
            // Saturating add/sub exist only for integers.
            XSIMD_RVV_OVERLOAD2(rvvsadd,
                                (__riscv_vsadd),
                                (__riscv_vsaddu), , vec(vec, vec))
            XSIMD_RVV_OVERLOAD3(rvvsub,
                                (__riscv_vsub),
                                (__riscv_vsub),
                                (__riscv_vfsub), , vec(vec, vec))
            XSIMD_RVV_OVERLOAD2(rvvssub,
                                (__riscv_vssub),
                                (__riscv_vssubu), , vec(vec, vec))
            // Averaging add (integer only).
            XSIMD_RVV_OVERLOAD2(rvvaadd,
                                (__riscv_vaadd),
                                (__riscv_vaaddu), , vec(vec, vec))
            XSIMD_RVV_OVERLOAD3(rvvmul,
                                (__riscv_vmul),
                                (__riscv_vmul),
                                (__riscv_vfmul), , vec(vec, vec))
            XSIMD_RVV_OVERLOAD3(rvvdiv,
                                (__riscv_vdiv),
                                (__riscv_vdivu),
                                (__riscv_vfdiv), , vec(vec, vec))
            XSIMD_RVV_OVERLOAD3(rvvmax,
                                (__riscv_vmax),
                                (__riscv_vmaxu),
                                (__riscv_vfmax), , vec(vec, vec))
            XSIMD_RVV_OVERLOAD3(rvvmin,
                                (__riscv_vmin),
                                (__riscv_vminu),
                                (__riscv_vfmin), , vec(vec, vec))
            // Unsigned negation routes through (abort): the neg() kernel
            // reinterprets unsigned batches to signed first, so the unsigned
            // instantiation is never reached.
            XSIMD_RVV_OVERLOAD3(rvvneg,
                                (__riscv_vneg),
                                (abort),
                                (__riscv_vfneg), , vec(vec))
            XSIMD_RVV_OVERLOAD_FLOATS(rvvabs,
                                      (__riscv_vfabs), , vec(vec))
            // Fused multiply-add family; argument order follows the RVV
            // intrinsics (accumulator first for *acc forms).
            XSIMD_RVV_OVERLOAD3(rvvmacc,
                                (__riscv_vmacc),
                                (__riscv_vmacc),
                                (__riscv_vfmacc), , vec(vec, vec, vec))
            XSIMD_RVV_OVERLOAD3(rvvnmsac,
                                (__riscv_vnmsac),
                                (__riscv_vnmsac),
                                (__riscv_vfnmsac), , vec(vec, vec, vec))
            XSIMD_RVV_OVERLOAD3(rvvmadd,
                                (__riscv_vmadd),
                                (__riscv_vmadd),
                                (__riscv_vfmadd), , vec(vec, vec, vec))
            XSIMD_RVV_OVERLOAD3(rvvnmsub,
                                (__riscv_vnmsub),
                                (__riscv_vnmsub),
                                (__riscv_vfnmsub), , vec(vec, vec, vec))

// Generate comparison wrappers rvvms<op> / rvvms<op>_splat for each of
// eq/ne/lt/le/gt/ge: signed vms<op>, unsigned vms<op>u, float vmf<op>.
// vmseq/vmsne have no 'u' variant, hence the temporary aliases below.
#define RISCV_VMSXX(XX)                                      \
    XSIMD_RVV_OVERLOAD3(rvvms##XX,                           \
                        (__riscv_vms##XX),                   \
                        (__riscv_vms##XX##u),                \
                        (__riscv_vmf##XX), , bvec(vec, vec)) \
    XSIMD_RVV_OVERLOAD3(rvvms##XX##_splat,                   \
                        (__riscv_vms##XX),                   \
                        (__riscv_vms##XX##u),                \
                        (__riscv_vmf##XX), , bvec(vec, T))
#define __riscv_vmsequ __riscv_vmseq
#define __riscv_vmsneu __riscv_vmsne
            RISCV_VMSXX(eq)
            RISCV_VMSXX(ne)
            RISCV_VMSXX(lt)
            RISCV_VMSXX(le)
            RISCV_VMSXX(gt)
            RISCV_VMSXX(ge)
#undef __riscv_vmsequ
#undef __riscv_vmsneu
#undef RISCV_VMSXX
        } // namespace detail
0698 
0699         // add
0700         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0701         XSIMD_INLINE batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0702         {
0703             return detail::rvvadd(lhs, rhs);
0704         }
0705 
0706         // sadd
0707         template <class A, class T, detail::enable_integral_t<T> = 0>
0708         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0709         {
0710             return detail::rvvsadd(lhs, rhs);
0711         }
0712 
0713         // sub
0714         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0715         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0716         {
0717             return detail::rvvsub(lhs, rhs);
0718         }
0719 
0720         // ssub
0721         template <class A, class T, detail::enable_integral_t<T> = 0>
0722         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0723         {
0724             return detail::rvvssub(lhs, rhs);
0725         }
0726 
0727         // mul
0728         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0729         XSIMD_INLINE batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0730         {
0731             return detail::rvvmul(lhs, rhs);
0732         }
0733 
0734         // div
0735         template <class A, class T, typename detail::rvv_enable_all_t<T> = 0>
0736         XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0737         {
0738             return detail::rvvdiv(lhs, rhs);
0739         }
0740 
0741         // max
0742         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0743         XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0744         {
0745             return detail::rvvmax(lhs, rhs);
0746         }
0747 
0748         // min
0749         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0750         XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0751         {
0752             return detail::rvvmin(lhs, rhs);
0753         }
0754 
0755         // neg
0756         template <class A, class T, detail::rvv_enable_unsigned_int_t<T> = 0>
0757         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<rvv>) noexcept
0758         {
0759             using S = as_signed_integer_t<T>;
0760             const auto as_signed = detail::rvvreinterpret<S>(arg);
0761             const auto result = detail::rvvneg(as_signed);
0762             return detail::rvvreinterpret<T>(result);
0763         }
0764 
0765         template <class A, class T, detail::rvv_enable_signed_int_or_floating_point_t<T> = 0>
0766         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<rvv>) noexcept
0767         {
0768             return detail::rvvneg(arg);
0769         }
0770 
0771         // abs
0772         template <class A, class T, detail::rvv_enable_unsigned_int_t<T> = 0>
0773         XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<rvv>) noexcept
0774         {
0775             return arg;
0776         }
0777 
0778         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
0779         XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<rvv>) noexcept
0780         {
0781             return detail::rvvabs(arg);
0782         }
0783 
0784         // fma: x * y + z
0785         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0786         XSIMD_INLINE batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
0787         {
0788             // also detail::rvvmadd(x, y, z);
0789             return detail::rvvmacc(z, x, y);
0790         }
0791 
0792         // fnma: z - x * y
0793         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0794         XSIMD_INLINE batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
0795         {
0796             // also detail::rvvnmsub(x, y, z);
0797             return detail::rvvnmsac(z, x, y);
0798         }
0799 
0800         // fms: x * y - z
0801         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0802         XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
0803         {
0804             // also vfmsac(z, x, y), but lacking integer version
0805             // also vfmsub(x, y, z), but lacking integer version
0806             return -fnma(x, y, z);
0807         }
0808 
0809         // fnms: - x * y - z
0810         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0811         XSIMD_INLINE batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
0812         {
0813             // also vfnmacc(z, x, y), but lacking integer version
0814             // also vfnmadd(x, y, z), but lacking integer version
0815             return -fma(z, x, y);
0816         }
0817 
0818         /**********************
0819          * Logical operations *
0820          **********************/
0821 
        namespace detail
        {
            // Bitwise operations on integer vectors (floats go through an
            // unsigned reinterpret in the kernels below).
            XSIMD_RVV_OVERLOAD_INTS(rvvand, (__riscv_vand), , vec(vec, vec))
            XSIMD_RVV_OVERLOAD_INTS(rvvor, (__riscv_vor), , vec(vec, vec))
            XSIMD_RVV_OVERLOAD_INTS(rvvor_splat, (__riscv_vor), , vec(vec, T))
            XSIMD_RVV_OVERLOAD_INTS(rvvxor, (__riscv_vxor), , vec(vec, vec))
            XSIMD_RVV_OVERLOAD_INTS(rvvnot, (__riscv_vnot), , vec(vec))
            // Mask-register (batch_bool) logic; XSIMD_RVV_S supplies the
            // element-size suffix for the _mm_b intrinsics.
            XSIMD_RVV_OVERLOAD(rvvmand, (__riscv_vmand_mm_b XSIMD_RVV_S), , bvec(bvec, bvec))
            XSIMD_RVV_OVERLOAD(rvvmor, (__riscv_vmor_mm_b XSIMD_RVV_S), , bvec(bvec, bvec))
            XSIMD_RVV_OVERLOAD(rvvmxor, (__riscv_vmxor_mm_b XSIMD_RVV_S), , bvec(bvec, bvec))
            XSIMD_RVV_OVERLOAD(rvvmandn, (__riscv_vmandn_mm_b XSIMD_RVV_S), , bvec(bvec, bvec))
            XSIMD_RVV_OVERLOAD(rvvmnot, (__riscv_vmnot), , bvec(bvec))
        }
0835 
0836         // bitwise_and
0837         template <class A, class T, detail::enable_integral_t<T> = 0>
0838         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0839         {
0840             return detail::rvvand(lhs, rhs);
0841         }
0842 
0843         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
0844         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0845         {
0846             const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs);
0847             const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs);
0848             const auto result_bits = detail::rvvand(lhs_bits, rhs_bits);
0849             return detail::rvvreinterpret<T>(result_bits);
0850         }
0851 
0852         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0853         XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
0854         {
0855             return detail::rvvmand(lhs, rhs);
0856         }
0857 
0858         // bitwise_andnot
0859         template <class A, class T, detail::enable_integral_t<T> = 0>
0860         XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0861         {
0862             const auto not_rhs = detail::rvvnot(rhs);
0863             return detail::rvvand(lhs, not_rhs);
0864         }
0865 
0866         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
0867         XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0868         {
0869             const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs);
0870             const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs);
0871             const auto not_rhs = detail::rvvnot(rhs_bits);
0872             const auto result_bits = detail::rvvand(lhs_bits, not_rhs);
0873             return detail::rvvreinterpret<T>(result_bits);
0874         }
0875 
0876         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0877         XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
0878         {
0879             return detail::rvvmandn(lhs, rhs);
0880         }
0881 
0882         // bitwise_or
0883         template <class A, class T, detail::enable_integral_t<T> = 0>
0884         XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0885         {
0886             return detail::rvvor(lhs, rhs);
0887         }
0888 
0889         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
0890         XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0891         {
0892             const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs);
0893             const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs);
0894             const auto result_bits = detail::rvvor(lhs_bits, rhs_bits);
0895             return detail::rvvreinterpret<T>(result_bits);
0896         }
0897 
0898         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0899         XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
0900         {
0901             return detail::rvvmor(lhs, rhs);
0902         }
0903 
0904         // bitwise_xor
0905         template <class A, class T, detail::enable_integral_t<T> = 0>
0906         XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0907         {
0908             return detail::rvvxor(lhs, rhs);
0909         }
0910 
0911         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
0912         XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0913         {
0914             const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs);
0915             const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs);
0916             const auto result_bits = detail::rvvxor(lhs_bits, rhs_bits);
0917             return detail::rvvreinterpret<T>(result_bits);
0918         }
0919 
0920         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0921         XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
0922         {
0923             return detail::rvvmxor(lhs, rhs);
0924         }
0925 
0926         // bitwise_not
0927         template <class A, class T, detail::enable_integral_t<T> = 0>
0928         XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<rvv>) noexcept
0929         {
0930             return detail::rvvnot(arg);
0931         }
0932 
0933         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
0934         XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<rvv>) noexcept
0935         {
0936             const auto arg_bits = detail::rvv_to_unsigned_batch(arg);
0937             const auto result_bits = detail::rvvnot(arg_bits);
0938             return detail::rvvreinterpret<T>(result_bits);
0939         }
0940 
0941         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
0942         XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
0943         {
0944             return detail::rvvmnot(arg);
0945         }
0946 
0947         /**********
0948          * Shifts *
0949          **********/
0950 
        namespace detail
        {
            // Left shifts are sign-agnostic; right shifts pick arithmetic
            // (vsra) for signed and logical (vsrl) for unsigned lanes.
            // The _splat forms take a scalar shift count.
            XSIMD_RVV_OVERLOAD_INTS(rvvsll_splat, (__riscv_vsll), , vec(vec, size_t))
            XSIMD_RVV_OVERLOAD_INTS(rvvsll, (__riscv_vsll), , vec(vec, uvec))
            XSIMD_RVV_OVERLOAD2(rvvsr_splat,
                                (__riscv_vsra),
                                (__riscv_vsrl), , vec(vec, size_t))
            XSIMD_RVV_OVERLOAD2(rvvsr,
                                (__riscv_vsra),
                                (__riscv_vsrl), , vec(vec, uvec))
        } // namespace detail
0962 
0963         // bitwise_lshift
0964         template <class A, class T, detail::enable_integral_t<T> = 0>
0965         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& arg, int n, requires_arch<rvv>) noexcept
0966         {
0967             constexpr size_t size = sizeof(typename batch<T, A>::value_type) * 8;
0968             assert(0 <= n && static_cast<size_t>(n) < size && "index in bounds");
0969             return detail::rvvsll_splat(arg, n);
0970         }
0971 
0972         template <class A, class T, detail::enable_integral_t<T> = 0>
0973         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0974         {
0975             return detail::rvvsll(lhs, detail::rvv_to_unsigned_batch<A, T>(rhs));
0976         }
0977 
0978         // bitwise_rshift
0979         template <class A, class T, detail::enable_integral_t<T> = 0>
0980         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<rvv>) noexcept
0981         {
0982             constexpr size_t size = sizeof(typename batch<T, A>::value_type) * 8;
0983             assert(0 <= n && static_cast<size_t>(n) < size && "index in bounds");
0984             return detail::rvvsr_splat(arg, n);
0985         }
0986 
0987         template <class A, class T, detail::enable_integral_t<T> = 0>
0988         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
0989         {
0990             return detail::rvvsr(lhs, detail::rvv_to_unsigned_batch<A, T>(rhs));
0991         }
0992 
0993         /**************
0994          * Reductions *
0995          **************/
0996 
        namespace detail
        {
            // Tree reductions: result lane 0 of an m1-width register holds
            // the reduction of the input seeded with the scalar_vec operand.
            XSIMD_RVV_OVERLOAD3(rvvredsum,
                                (__riscv_vredsum),
                                (__riscv_vredsum),
                                (__riscv_vfredosum), // or __riscv_vfredusum
                                , scalar_vec(vec, scalar_vec))
            XSIMD_RVV_OVERLOAD3(rvvredmax,
                                (__riscv_vredmax),
                                (__riscv_vredmaxu),
                                (__riscv_vfredmax), , scalar_vec(vec, scalar_vec))
            XSIMD_RVV_OVERLOAD3(rvvredmin,
                                (__riscv_vredmin),
                                (__riscv_vredminu),
                                (__riscv_vfredmin), , scalar_vec(vec, scalar_vec))
            // Single-element slides (used by other kernels).
            XSIMD_RVV_OVERLOAD3(rvvslide1up,
                                (__riscv_vslide1up),
                                (__riscv_vslide1up),
                                (__riscv_vfslide1up), , vec(vec, vec))
            XSIMD_RVV_OVERLOAD3(rvvslide1down,
                                (__riscv_vslide1down),
                                (__riscv_vslide1down),
                                (__riscv_vfslide1down), , vec(vec, T))

            // Extract lane 0 of an m1-width reduction result as a scalar,
            // bit-casting the m1 register back to the arch's register width
            // first so rvvmv_lane0 can be applied.
            template <class A, class T>
            XSIMD_INLINE T reduce_scalar(rvv_reg_t<T, types::detail::rvv_width_m1> const& arg)
            {
                return detail::rvvmv_lane0(rvv_reg_t<T, A::width>(arg.get_bytes(), types::detail::XSIMD_RVV_BITCAST));
            }
        }
1027         // reduce_add
1028         template <class A, class T, class V = typename batch<T, A>::value_type, detail::rvv_enable_all_t<T> = 0>
1029         XSIMD_INLINE V reduce_add(batch<T, A> const& arg, requires_arch<rvv>) noexcept
1030         {
1031             const auto zero = detail::broadcast<T, types::detail::rvv_width_m1>(T(0));
1032             const auto r = detail::rvvredsum(arg, zero);
1033             return detail::reduce_scalar<A, T>(r);
1034         }
1035 
1036         // reduce_max
1037         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1038         XSIMD_INLINE T reduce_max(batch<T, A> const& arg, requires_arch<rvv>) noexcept
1039         {
1040             const auto lowest = detail::broadcast<T, types::detail::rvv_width_m1>(std::numeric_limits<T>::lowest());
1041             const auto r = detail::rvvredmax(arg, lowest);
1042             return detail::reduce_scalar<A, T>(r);
1043         }
1044 
1045         // reduce_min
1046         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1047         XSIMD_INLINE T reduce_min(batch<T, A> const& arg, requires_arch<rvv>) noexcept
1048         {
1049             const auto max = detail::broadcast<T, types::detail::rvv_width_m1>(std::numeric_limits<T>::max());
1050             const auto r = detail::rvvredmin(arg, max);
1051             return detail::reduce_scalar<A, T>(r);
1052         }
1053 
        // haddp: horizontal add across a row of batches — lane i of the
        // result is the sum of all lanes of row[i].  Each row is reduced to
        // a scalar, the scalars are spilled to a stack buffer, and the
        // buffer is reloaded as a batch.
        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
        XSIMD_INLINE batch<T, A> haddp(const batch<T, A>* row, requires_arch<rvv>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            T sums[size];
#pragma unroll size
            for (std::size_t i = 0; i < size; ++i)
            {
                sums[i] = reduce_add(row[i], rvv {});
            }
            // sums[] lives on the stack, so the aligned load is safe here
            // only because RVV loads have no alignment requirement.
            return load_aligned<A>(sums, convert<T>(), rvv {});
        }
1067 
1068         /***************
1069          * Comparisons *
1070          ***************/
1071 
1072         // eq
1073         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1074         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
1075         {
1076             return detail::rvvmseq(lhs, rhs);
1077         }
1078 
1079         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1080         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
1081         {
1082             const auto neq_result = detail::rvvmxor(lhs, rhs);
1083             return detail::rvvmnot(neq_result);
1084         }
1085 
1086         // neq
1087         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1088         XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
1089         {
1090             return detail::rvvmsne(lhs, rhs);
1091         }
1092 
1093         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1094         XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
1095         {
1096             return detail::rvvmxor(lhs, rhs);
1097         }
1098 
1099         // lt
1100         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1101         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
1102         {
1103             return detail::rvvmslt(lhs, rhs);
1104         }
1105 
1106         // le
1107         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1108         XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
1109         {
1110             return detail::rvvmsle(lhs, rhs);
1111         }
1112 
1113         // gt
1114         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1115         XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
1116         {
1117             return detail::rvvmsgt(lhs, rhs);
1118         }
1119 
1120         // ge
1121         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1122         XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
1123         {
1124             return detail::rvvmsge(lhs, rhs);
1125         }
1126 
1127         /*************
1128          * Selection *
1129          *************/
        namespace detail
        {
            // Tail-undisturbed vcompress: selected lanes are packed to the
            // front; unselected tail lanes keep the first operand's values.
            XSIMD_RVV_OVERLOAD(rvvcompress, (__riscv_vcompress_tu), , vec(vec, vec, bvec))
        }
1134         // compress
1135         template <class A, class T>
1136         XSIMD_INLINE batch<T, A> compress(batch<T, A> const& x, batch_bool<T, A> const& mask, requires_arch<rvv>) noexcept
1137         {
1138             auto zero = broadcast<A>(T(0), rvv {});
1139             return detail::rvvcompress(zero, x, mask);
1140         }
1141 
1142         /***************
1143          * Permutation *
1144          ***************/
        namespace detail
        {
            // Lane permutation and whole-register slides.
            XSIMD_RVV_OVERLOAD(rvvrgather, (__riscv_vrgather), , vec(vec, uvec))
            XSIMD_RVV_OVERLOAD(rvvslideup, (__riscv_vslideup), , vec(vec, vec, size_t))
            XSIMD_RVV_OVERLOAD(rvvslidedown, (__riscv_vslidedown), , vec(vec, size_t))
        }
1151 
1152         // swizzle
1153         template <class A, class T, class I, I... idx>
1154         XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<I, A, idx...>, requires_arch<rvv>) noexcept
1155         {
1156             static_assert(batch<T, A>::size == sizeof...(idx), "invalid swizzle indices");
1157             const batch<I, A> indices { idx... };
1158             return detail::rvvrgather(arg, indices);
1159         }
1160 
1161         template <class A, class T, class I, I... idx>
1162         XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self,
1163                                                        batch_constant<I, A, idx...>,
1164                                                        requires_arch<rvv>) noexcept
1165         {
1166             const auto real = swizzle(self.real(), batch_constant<I, A, idx...> {}, rvv {});
1167             const auto imag = swizzle(self.imag(), batch_constant<I, A, idx...> {}, rvv {});
1168             return batch<std::complex<T>>(real, imag);
1169         }
1170 
1171         /*************
1172          * Selection *
1173          *************/
1174 
1175         // extract_pair
1176 
1177         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1178         XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, size_t n, requires_arch<rvv>) noexcept
1179         {
1180             const auto tmp = detail::rvvslidedown(rhs, n);
1181             return detail::rvvslideup(tmp, lhs, lhs.size - n);
1182         }
1183 
1184         // select
1185         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1186         XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<rvv>) noexcept
1187         {
1188             return detail::rvvmerge(b, a, cond);
1189         }
1190 
1191         template <class A, class T, bool... b>
1192         XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<rvv>) noexcept
1193         {
1194             return select(batch_bool<T, A> { b... }, true_br, false_br, rvv {});
1195         }
1196 
1197         // zip_lo
1198         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1199         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
1200         {
1201             const auto index = detail::vindex<A, as_unsigned_integer_t<T>, 0, -1>();
1202             const auto mask = detail::pmask8<T, A::width>(0xaa);
1203             return detail::rvvmerge(detail::rvvrgather(lhs, index),
1204                                     detail::rvvrgather(rhs, index),
1205                                     mask);
1206         }
1207 
1208         // zip_hi
1209         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1210         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
1211         {
1212             const auto index = detail::vindex<A, as_unsigned_integer_t<T>, batch<T, A>::size / 2, -1>();
1213             const auto mask = detail::pmask8<T, A::width>(0xaa);
1214             return detail::rvvmerge(detail::rvvrgather(lhs, index),
1215                                     detail::rvvrgather(rhs, index),
1216                                     mask);
1217         }
1218 
1219         // store_complex
1220         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
1221         XSIMD_INLINE void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<rvv>) noexcept
1222         {
1223             const auto lo = zip_lo(src.real(), src.imag());
1224             const auto hi = zip_hi(src.real(), src.imag());
1225             T* buf = reinterpret_cast<T*>(dst);
1226             store_aligned(buf, lo, rvv {});
1227             store_aligned(buf + lo.size, hi, rvv {});
1228         }
1229 
1230         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
1231         XSIMD_INLINE void store_complex_unaligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<rvv>) noexcept
1232         {
1233             store_complex_aligned(dst, src, rvv {});
1234         }
1235 
1236         /*****************************
1237          * Floating-point arithmetic *
1238          *****************************/
1239 
        namespace detail
        {
            // Generate one overload per floating-point element type for the
            // unary RVV math intrinsics:
            //   rvvfsqrt   - full-precision square root (__riscv_vfsqrt)
            //   rvvfrec7   - reciprocal estimate, ~7 bits (__riscv_vfrec7)
            //   rvvfrsqrt7 - reciprocal-sqrt estimate, ~7 bits (__riscv_vfrsqrt7)
            XSIMD_RVV_OVERLOAD_FLOATS(rvvfsqrt, (__riscv_vfsqrt), , vec(vec))
            XSIMD_RVV_OVERLOAD_FLOATS(rvvfrec7, (__riscv_vfrec7), , vec(vec))
            XSIMD_RVV_OVERLOAD_FLOATS(rvvfrsqrt7, (__riscv_vfrsqrt7), , vec(vec))
        }
1246 
1247         // rsqrt
1248         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
1249         XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& arg, requires_arch<rvv>) noexcept
1250         {
1251             auto approx = detail::rvvfrsqrt7(arg);
1252             approx = approx * (1.5 - (0.5 * arg * approx * approx));
1253             return approx;
1254         }
1255 
1256         // sqrt
1257         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
1258         XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& arg, requires_arch<rvv>) noexcept
1259         {
1260             return detail::rvvfsqrt(arg);
1261         }
1262 
1263         // reciprocal
1264         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
1265         XSIMD_INLINE batch<T, A> reciprocal(const batch<T, A>& arg, requires_arch<rvv>) noexcept
1266         {
1267             return detail::rvvfrec7(arg);
1268         }
1269 
1270         /******************************
1271          * Floating-point conversions *
1272          ******************************/
1273 
        // fast_cast
        namespace detail
        {
            // Same-width float<->integer conversion intrinsics.  Each
            // XSIMD_RVV_OVERLOAD2 emits a signed and an unsigned flavour; the
            // _DROP_1ST variants take a dummy first argument whose only purpose
            // is to select the destination type (see calls below).
            XSIMD_RVV_OVERLOAD2(rvvfcvt_rtz, // truncating conversion, like C.
                                (__riscv_vfcvt_rtz_x),
                                (__riscv_vfcvt_rtz_xu), _DROP_1ST, vec(T, fvec))
            XSIMD_RVV_OVERLOAD2(rvvfcvt_rne, // round to nearest, ties to even
                                (__riscv_vfcvt_x),
                                (__riscv_vfcvt_xu), _DROP_1ST_CUSTOM_ARGS, vec(T, fvec), args..., __RISCV_FRM_RNE)
            XSIMD_RVV_OVERLOAD2(rvvfcvt_rmm, // round to nearest, ties to max magnitude
                                (__riscv_vfcvt_x),
                                (__riscv_vfcvt_xu), _DROP_1ST_CUSTOM_ARGS, vec(T, fvec), args..., __RISCV_FRM_RMM)
            XSIMD_RVV_OVERLOAD2(rvvfcvt, // round to current rounding mode.
                                (__riscv_vfcvt_x),
                                (__riscv_vfcvt_xu), _DROP_1ST, vec(T, fvec))
            XSIMD_RVV_OVERLOAD_INTS(rvvfcvt_f, (__riscv_vfcvt_f), , fvec(vec))

            // SFINAE guard: float T -> same-size non-float U.
            template <class T, class U>
            using rvv_enable_ftoi_t = typename std::enable_if<(sizeof(T) == sizeof(U) && std::is_floating_point<T>::value && !std::is_floating_point<U>::value), int>::type;
            // SFINAE guard: non-float T -> same-size float U.
            template <class T, class U>
            using rvv_enable_itof_t = typename std::enable_if<(sizeof(T) == sizeof(U) && !std::is_floating_point<T>::value && std::is_floating_point<U>::value), int>::type;

            // float -> integer fast_cast: truncate toward zero, matching the
            // semantics of a C-style cast.
            template <class A, class T, class U, rvv_enable_ftoi_t<T, U> = 0>
            XSIMD_INLINE batch<U, A> fast_cast(batch<T, A> const& arg, batch<U, A> const&, requires_arch<rvv>) noexcept
            {
                return rvvfcvt_rtz(U {}, arg);
            }
            // integer -> float fast_cast (signedness picked by overload).
            template <class A, class T, class U, rvv_enable_itof_t<T, U> = 0>
            XSIMD_INLINE batch<U, A> fast_cast(batch<T, A> const& arg, batch<U, A> const&, requires_arch<rvv>) noexcept
            {
                return rvvfcvt_f(arg);
            }
        }
1307 
1308         /*********
1309          * Miscs *
1310          *********/
1311 
1312         // set
1313         template <class A, class T, class... Args>
1314         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<rvv>, Args... args) noexcept
1315         {
1316             const std::array<T, batch<T, A>::size> tmp { args... };
1317             return load_unaligned<A>(tmp.data(), convert<T>(), rvv {});
1318         }
1319 
1320         template <class A, class T, class... Args>
1321         XSIMD_INLINE batch<std::complex<T>, A> set(batch<std::complex<T>, A> const&, requires_arch<rvv>,
1322                                                    Args... args_complex) noexcept
1323         {
1324             return batch<std::complex<T>>(set(batch<T, rvv> {}, rvv {}, args_complex.real()...),
1325                                           set(batch<T, rvv> {}, rvv {}, args_complex.imag()...));
1326         }
1327 
1328         template <class A, class T, class... Args>
1329         XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<rvv>, Args... args) noexcept
1330         {
1331             using U = as_unsigned_integer_t<T>;
1332             const auto values = set(batch<U, rvv> {}, rvv {}, static_cast<U>(args)...);
1333             const auto zero = broadcast<A>(U(0), rvv {});
1334             detail::rvv_bool_t<T> result = detail::rvvmsne(values, zero);
1335             return result;
1336         }
1337 
1338         // insert
1339         template <class A, class T, size_t I, detail::rvv_enable_all_t<T> = 0>
1340         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<rvv>) noexcept
1341         {
1342             const auto mask = detail::pmask<T, A::width>(uint64_t(1) << I);
1343             return detail::rvvmerge_splat(arg, val, mask);
1344         }
1345 
1346         // get
1347         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1348         XSIMD_INLINE T get(batch<T, A> const& arg, size_t i, requires_arch<rvv>) noexcept
1349         {
1350             const auto tmp = detail::rvvslidedown(arg, i);
1351             return detail::rvvmv_lane0(tmp);
1352         }
1353 
1354         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1355         XSIMD_INLINE std::complex<T> get(batch<std::complex<T>, A> const& arg, size_t i, requires_arch<rvv>) noexcept
1356         {
1357             const auto tmpr = detail::rvvslidedown(arg.real(), i);
1358             const auto tmpi = detail::rvvslidedown(arg.imag(), i);
1359             return std::complex<T> { detail::rvvmv_lane0(tmpr), detail::rvvmv_lane0(tmpi) };
1360         }
1361 
1362         // all
1363         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1364         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
1365         {
1366             return detail::rvvcpop(arg) == batch_bool<T, A>::size;
1367         }
1368 
1369         // any
1370         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1371         XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
1372         {
1373             return detail::rvvcpop(arg) > 0;
1374         }
1375 
1376         // bitwise_cast
1377         template <class A, class T, class R, detail::rvv_enable_all_t<T> = 0, detail::rvv_enable_all_t<R> = 0>
1378         XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<rvv>) noexcept
1379         {
1380             return detail::rvv_reg_t<R, A::width>(arg.data.get_bytes(), types::detail::XSIMD_RVV_BITCAST);
1381         }
1382 
1383         // batch_bool_cast
1384         template <class A, class T_out, class T_in, detail::rvv_enable_all_t<T_in> = 0>
1385         XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<rvv>) noexcept
1386         {
1387             using intermediate_t = typename detail::rvv_bool_t<T_out>;
1388             return intermediate_t(arg.data);
1389         }
1390 
1391         // from_bool
1392         template <class A, class T, detail::rvv_enable_all_t<T> = 0>
1393         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
1394         {
1395             const auto zero = broadcast<A>(T(0), rvv {});
1396             return detail::rvvmerge_splat(zero, T(1), arg);
1397         }
1398 
        namespace detail
        {
            // Slide the byte image of a register down by `i` bytes, operating
            // at the effective register width `Width`.  The generic version
            // handles full m1-width registers; each specialisation first
            // truncates to the matching fractional-LMUL type (mf2/mf4/mf8) so
            // the slide's element count matches the active width, then widens
            // the result back to u8m1 for the caller.
            template <size_t Width>
            XSIMD_INLINE vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i)
            {
                return __riscv_vslidedown(arg, i, types::detail::rvv_width_m1 / 8);
            }
            template <>
            XSIMD_INLINE vuint8m1_t rvvslidedownbytes<types::detail::rvv_width_mf2>(vuint8m1_t arg, size_t i)
            {
                const auto bytes = __riscv_vlmul_trunc_u8mf2(arg);
                const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf2 / 8);
                return __riscv_vlmul_ext_u8m1(result);
            }
            template <>
            XSIMD_INLINE vuint8m1_t rvvslidedownbytes<types::detail::rvv_width_mf4>(vuint8m1_t arg, size_t i)
            {
                const auto bytes = __riscv_vlmul_trunc_u8mf4(arg);
                const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf4 / 8);
                return __riscv_vlmul_ext_u8m1(result);
            }
            template <>
            XSIMD_INLINE vuint8m1_t rvvslidedownbytes<types::detail::rvv_width_mf8>(vuint8m1_t arg, size_t i)
            {
                const auto bytes = __riscv_vlmul_trunc_u8mf8(arg);
                const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf8 / 8);
                return __riscv_vlmul_ext_u8m1(result);
            }
        }
1428 
1429         // slide_left
1430         template <size_t N, class A, class T, detail::rvv_enable_all_t<T> = 0>
1431         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<rvv>) noexcept
1432         {
1433             const auto zero = broadcast<A>(uint8_t(0), rvv {});
1434             const auto bytes = arg.data.get_bytes();
1435             return detail::rvvreinterpret<T>(detail::rvvslideup(zero, bytes, N));
1436         }
1437 
1438         // slide_right
1439         template <size_t N, class A, class T, detail::rvv_enable_all_t<T> = 0>
1440         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<rvv>) noexcept
1441         {
1442             using reg_t = detail::rvv_reg_t<T, A::width>;
1443             const auto bytes = arg.data.get_bytes();
1444             return reg_t(detail::rvvslidedownbytes<A::width>(bytes, N), types::detail::XSIMD_RVV_BITCAST);
1445         }
1446 
1447         // isnan
1448         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
1449         XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<rvv>) noexcept
1450         {
1451             return !(arg == arg);
1452         }
1453 
        namespace detail
        {
            // Signed integer type of the same width as T (e.g. float -> the
            // 32-bit signed integer type, double -> the 64-bit one).
            template <class T>
            using rvv_as_signed_integer_t = as_signed_integer_t<as_unsigned_integer_t<T>>;

            // Float -> same-width signed integer, rounding to nearest with
            // ties to even (RNE); used by nearbyint_as_int / nearbyint.
            template <class A, class T, class U = rvv_as_signed_integer_t<T>>
            XSIMD_INLINE batch<U, A> rvvfcvt_default(batch<T, A> const& arg) noexcept
            {
                return rvvfcvt_rne(U {}, arg);
            }

            // Float -> same-width signed integer, rounding to nearest with
            // ties away from zero (RMM, "max magnitude"); used by round().
            template <class A, class T, class U = rvv_as_signed_integer_t<T>>
            XSIMD_INLINE batch<U, A> rvvfcvt_afz(batch<T, A> const& arg) noexcept
            {
                return rvvfcvt_rmm(U {}, arg);
            }
        }
1471 
1472         // nearbyint_as_int
1473         template <class A, class T, class U = detail::rvv_as_signed_integer_t<T>>
1474         XSIMD_INLINE batch<U, A> nearbyint_as_int(batch<T, A> const& arg, requires_arch<rvv>) noexcept
1475         {
1476             // Reference rounds ties to nearest even
1477             return detail::rvvfcvt_default(arg);
1478         }
1479 
1480         // round
1481         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
1482         XSIMD_INLINE batch<T, A> round(batch<T, A> const& arg, requires_arch<rvv>) noexcept
1483         {
1484             // Round ties away from zero.
1485             const auto mask = abs(arg) < constants::maxflint<batch<T, A>>();
1486             return select(mask, to_float(detail::rvvfcvt_afz(arg)), arg, rvv {});
1487         }
1488 
1489         // nearbyint
1490         template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
1491         XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<rvv>) noexcept
1492         {
1493             // Round according to current rounding mode.
1494             const auto mask = abs(arg) < constants::maxflint<batch<T, A>>();
1495             return select(mask, to_float(detail::rvvfcvt_default(arg)), arg, rvv {});
1496         }
1497     } // namespace kernel
1498 } // namespace xsimd
1499 
1500 #endif