/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_EMULATED_HPP
#define XSIMD_EMULATED_HPP

#include <algorithm>
#include <array>
#include <complex>
#include <cstdint>
#include <cstring>
#include <limits>
#include <numeric>
#include <type_traits>

#include "../arch/xsimd_scalar.hpp"

#include "../types/xsimd_emulated_register.hpp"
#include "../types/xsimd_utils.hpp"
namespace xsimd
{
    template <typename T, class A, bool... Values>
    struct batch_bool_constant;

    template <class T_out, class T_in, class A>
    XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;

    template <typename T, class A, T... Values>
    struct batch_constant;

    namespace kernel
    {
        using namespace types;

        // fwd
        template <class A, class T, size_t I>
        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
        template <class A, typename T, typename ITy, ITy... Indices>
        XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;

        namespace detail
        {
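            // emulated_apply maps a scalar functor over the lanes of one or
            // more batches: the index_sequence overload expands the lane index
            // pack Is... so that the result array is built in a single
            // braced-init expression, one functor call per lane.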
            template <size_t I, class F, class... Bs>
            auto emulated_apply(F func, Bs const&... bs) -> decltype(func(bs.data[I]...))
            {
                return func(bs.data[I]...);
            }

            template <class F, class B, class... Bs, size_t... Is>
            auto emulated_apply(F func, ::xsimd::detail::index_sequence<Is...>, B const& b, Bs const&... bs) -> std::array<decltype(func(b.data[0], bs.data[0]...)), B::size>
            {
                return { emulated_apply<Is>(func, b, bs...)... };
            }

            template <class B, class F, class... Bs>
            auto emulated_apply(F func, B const& b, Bs const&... bs) -> std::array<decltype(func(b.data[0], bs.data[0]...)), B::size>
            {
                return emulated_apply(func, ::xsimd::detail::make_index_sequence<B::size>(), b, bs...);
            }
        }

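        // Every kernel below dispatches on requires_arch<emulated<N>>, where N
        // is the register width in bits. The default template argument
        // N = 8 * sizeof(T) * batch<T, A>::size recovers that width from the
        // batch type; e.g. a batch<float, emulated<128>> has 4 lanes.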
        // abs
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v)
                                          { return xsimd::abs(v); },
                                          self);
        }

        // add
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::add(v0, v1); },
                                          self, other);
        }

        // all
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            return std::all_of(self.data.begin(), self.data.end(), [](T v)
                               { return bool(v); });
        }

        // any
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            return std::any_of(self.data.begin(), self.data.end(), [](T v)
                               { return bool(v); });
        }

        // batch_bool_cast
        template <class A, class T_out, class T_in, size_t N = 8 * sizeof(T_in) * batch<T_in, A>::size>
        XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<emulated<N>>) noexcept
        {
            return { self.data };
        }

        // bitwise_and
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::bitwise_and(v0, v1); },
                                          self, other);
        }

        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](bool v0, bool v1)
                                          { return xsimd::bitwise_and(v0, v1); },
                                          self, other);
        }

        // bitwise_andnot
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::bitwise_andnot(v0, v1); },
                                          self, other);
        }

        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](bool v0, bool v1)
                                          { return xsimd::bitwise_andnot(v0, v1); },
                                          self, other);
        }

        // bitwise_lshift
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([other](T v)
                                          { return xsimd::bitwise_lshift(v, other); },
                                          self);
        }

        // bitwise_not
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v)
                                          { return xsimd::bitwise_not(v); },
                                          self);
        }

        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](bool v)
                                          { return xsimd::bitwise_not(v); },
                                          self);
        }

        // bitwise_or
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::bitwise_or(v0, v1); },
                                          self, other);
        }

        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](bool v0, bool v1)
                                          { return xsimd::bitwise_or(v0, v1); },
                                          self, other);
        }

        // bitwise_rshift
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([other](T v)
                                          { return xsimd::bitwise_rshift(v, other); },
                                          self);
        }

        // bitwise_xor
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::bitwise_xor(v0, v1); },
                                          self, other);
        }

        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](bool v0, bool v1)
                                          { return xsimd::bitwise_xor(v0, v1); },
                                          self, other);
        }

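        // bitwise_cast reinterprets the register bytes as another lane type.
        // memcpy is the portable way to type-pun here, and the byte count
        // size * sizeof(T_out) equals the register width N / 8 whatever the
        // lane types involved.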
        // bitwise_cast
        template <class A, class T_in, class T_out, size_t N = 8 * sizeof(T_in) * batch<T_in, A>::size>
        XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& self, batch<T_out, A> const&, requires_arch<emulated<N>>) noexcept
        {
            constexpr size_t size = batch<T_out, A>::size;
            std::array<T_out, size> result;
            char* raw_data = reinterpret_cast<char*>(result.data());
            const char* raw_input = reinterpret_cast<const char*>(self.data.data());
            memcpy(raw_data, raw_input, size * sizeof(T_out));
            return result;
        }

        // broadcast
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<emulated<N>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            std::array<T, size> r;
            std::fill(r.begin(), r.end(), val);
            return r;
        }

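        // Disabled draft of count(): a SWAR population count over the packed
        // lane mask (see the bithacks link in the body).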
#if 0
        // count
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE size_t count(batch_bool<T, A> const& x, requires_arch<emulated<N>>) noexcept
        {
            uint64_t m = x.mask();
            // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
            m = m - ((m >> 1) & (uint64_t) ~(uint64_t)0 / 3); // temp
            m = (m & (uint64_t) ~(uint64_t)0 / 15 * 3) + ((m >> 2) & (uint64_t) ~(uint64_t)0 / 15 * 3); // temp
            m = (m + (m >> 4)) & (uint64_t) ~(uint64_t)0 / 255 * 15; // temp
            return (m * ((uint64_t) ~(uint64_t)0 / 255)) >> (sizeof(uint64_t) - 1) * CHAR_BIT; // count
        }
#endif

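        // complex_low / complex_high rebuild the interleaved (real, imag)
        // memory layout from the separate real and imaginary registers:
        // complex_low covers the first half of the lanes, complex_high the
        // second half.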
        // store_complex
        namespace detail
        {
            // complex_low
            template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
            XSIMD_INLINE batch<T, A> complex_low(batch<std::complex<T>, A> const& self, requires_arch<emulated<N>>) noexcept
            {
                constexpr size_t size = batch<T, A>::size;
                std::array<T, size> result;
                for (size_t i = 0; i < size / 2; ++i)
                {
                    result[2 * i] = self.real().data[i];
                    result[1 + 2 * i] = self.imag().data[i];
                }
                return result;
            }
            // complex_high
            template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
            XSIMD_INLINE batch<T, A> complex_high(batch<std::complex<T>, A> const& self, requires_arch<emulated<N>>) noexcept
            {
                constexpr size_t size = batch<T, A>::size;
                std::array<T, size> result;
                for (size_t i = 0; i < size / 2; ++i)
                {
                    result[2 * i] = self.real().data[i + size / 2];
                    result[1 + 2 * i] = self.imag().data[i + size / 2];
                }
                return result;
            }
        }

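        // decr_if / incr_if rely on the bool -> T conversion of the mask
        // lanes: adding or subtracting the 0/1-valued mask adjusts exactly the
        // selected lanes by one.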
        // decr_if
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<emulated<N>>) noexcept
        {
            return self - batch<T, A>(mask.data);
        }

        // div
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::div(v0, v1); },
                                          self, other);
        }

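        // fast_cast converts lane-wise between same-width integer and
        // floating-point batches; in the emulated kernel this is a plain
        // per-lane C++ conversion.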
        // fast_cast
        namespace detail
        {
            template <class A, size_t N = 8 * sizeof(float) * batch<float, A>::size>
            XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<emulated<N>>) noexcept
            {
                return detail::emulated_apply([](int32_t v)
                                              { return float(v); },
                                              self);
            }

            template <class A, size_t N = 8 * sizeof(float) * batch<float, A>::size>
            XSIMD_INLINE batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<emulated<N>>) noexcept
            {
                return detail::emulated_apply([](uint32_t v)
                                              { return float(v); },
                                              self);
            }

            template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size>
            XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& self, batch<double, A> const&, requires_arch<emulated<N>>) noexcept
            {
                return detail::emulated_apply([](int64_t v)
                                              { return double(v); },
                                              self);
            }

            template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size>
            XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& self, batch<double, A> const&, requires_arch<emulated<N>>) noexcept
            {
                return detail::emulated_apply([](uint64_t v)
                                              { return double(v); },
                                              self);
            }

            template <class A, size_t N = 8 * sizeof(int32_t) * batch<int32_t, A>::size>
            XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<emulated<N>>) noexcept
            {
                return detail::emulated_apply([](float v)
                                              { return int32_t(v); },
                                              self);
            }

            template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size>
            XSIMD_INLINE batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<emulated<N>>) noexcept
            {
                return detail::emulated_apply([](double v)
                                              { return int64_t(v); },
                                              self);
            }
        }

        // eq
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch_bool<T, emulated<N>> eq(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::eq(v0, v1); },
                                          self, other);
        }

        template <class A, class T, size_t N = 8 * sizeof(T) * batch_bool<T, A>::size>
        XSIMD_INLINE batch_bool<T, emulated<N>> eq(batch_bool<T, emulated<N>> const& self, batch_bool<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](bool v0, bool v1)
                                          { return xsimd::eq(v0, v1); },
                                          self, other);
        }

        // from_bool
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](bool v)
                                          { return T(v); },
                                          self);
        }

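        // from_mask unpacks bit i of the 64-bit mask into lane i; it is the
        // inverse of mask() defined further below.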
        // from_mask
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<emulated<N>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            std::array<bool, size> vmask;
            for (size_t i = 0; i < size; ++i)
                vmask[i] = (mask >> i) & 1u;
            return vmask;
        }

        // ge
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch_bool<T, emulated<N>> ge(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::ge(v0, v1); },
                                          self, other);
        }

        // gt
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch_bool<T, emulated<N>> gt(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::gt(v0, v1); },
                                          self, other);
        }

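        // haddp computes one horizontal sum per input row: lane i of the
        // result is the sum of all lanes of row[i].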
        // haddp
        template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> haddp(batch<T, A> const* row, requires_arch<emulated<N>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            std::array<T, size> r;
            for (size_t i = 0; i < size; ++i)
                r[i] = std::accumulate(row[i].data.begin() + 1, row[i].data.end(), row[i].data.front());
            return r;
        }

        // incr_if
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<emulated<N>>) noexcept
        {
            return self + batch<T, A>(mask.data);
        }

        // insert
        template <class A, class T, size_t I, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<emulated<N>>) noexcept
        {
            batch<T, A> other = self;
            other.data[I] = val;
            return other;
        }

        // isnan
        template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size, class = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
        XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v)
                                          { return xsimd::isnan(v); },
                                          self);
        }

        // load_aligned
        template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<emulated<N>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            std::array<T, size> res;
            std::copy(mem, mem + size, res.begin());
            return res;
        }

        // load_unaligned
        template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<emulated<N>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            std::array<T, size> res;
            std::copy(mem, mem + size, res.begin());
            return res;
        }

        // load_complex
        namespace detail
        {
            template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
            XSIMD_INLINE batch<std::complex<T>, A> load_complex(batch<T, A> const& hi, batch<T, A> const& lo, requires_arch<emulated<N>>) noexcept
            {
                constexpr size_t size = batch<T, A>::size;
                std::array<T, size> real, imag;
                for (size_t i = 0; i < size / 2; ++i)
                {
                    real[i] = hi.data[2 * i];
                    imag[i] = hi.data[1 + 2 * i];
                }
                for (size_t i = 0; i < size / 2; ++i)
                {
                    real[size / 2 + i] = lo.data[2 * i];
                    imag[size / 2 + i] = lo.data[1 + 2 * i];
                }
                return { real, imag };
            }
        }

        // le
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch_bool<T, emulated<N>> le(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::le(v0, v1); },
                                          self, other);
        }

        // lt
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch_bool<T, emulated<N>> lt(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::lt(v0, v1); },
                                          self, other);
        }

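        // mask packs the lane booleans into the low bits of a uint64_t, lane i
        // going to bit i (the inverse of from_mask above).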
        // mask
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            uint64_t res = 0;
            for (size_t i = 0; i < size; ++i)
                // Shift a 64-bit value: a 32-bit 1u << i is undefined for i >= 32,
                // which can happen for batches of 64 or more lanes.
                res |= (self.data[i] ? uint64_t(1) : uint64_t(0)) << i;
            return res;
        }

        // max
        template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::max(v0, v1); },
                                          self, other);
        }

        // min
        template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::min(v0, v1); },
                                          self, other);
        }

        // mul
        template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::mul(v0, v1); },
                                          self, other);
        }

        // nearbyint_as_int
        template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<as_integer_t<T>, A> nearbyint_as_int(batch<T, A> const& self,
                                                                requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v)
                                          { return xsimd::nearbyint_as_int(v); },
                                          self);
        }

        // neg
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v)
                                          { return xsimd::neg(v); },
                                          self);
        }

        // neq
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::neq(v0, v1); },
                                          self, other);
        }

        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](bool v0, bool v1)
                                          { return xsimd::neq(v0, v1); },
                                          self, other);
        }

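        // The reductions below fold the lanes with std::accumulate, seeding
        // with the first lane and combining from the second one onwards.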
        // reduce_add
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            std::array<T, size> buffer;
            self.store_unaligned(buffer.data());
            return std::accumulate(buffer.begin() + 1, buffer.end(), *buffer.begin());
        }

        // reduce_max
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE T reduce_max(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y)
                                   { return xsimd::max(x, y); });
        }

        // reduce_min
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE T reduce_min(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y)
                                   { return xsimd::min(x, y); });
        }

        // rsqrt
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v)
                                          { return xsimd::rsqrt(v); },
                                          self);
        }

        // select
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](bool c, T t, T f)
                                          { return xsimd::select(c, t, f); },
                                          cond, true_br, false_br);
        }

        template <class A, class T, bool... Values>
        XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            static_assert(sizeof...(Values) == size, "consistent init");
            return select((batch_bool<T, A>)cond, true_br, false_br, emulated<8 * sizeof(T) * size> {});
        }

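        // shuffle picks lane i of the result from the concatenation [x, y]:
        // indices below size address x, indices in [size, 2 * size) address y.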
        // shuffle
        template <class A, typename T, class ITy, ITy... Is>
        XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Is...> mask, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            batch<ITy, A> bmask = mask;
            std::array<T, size> res;
            for (size_t i = 0; i < size; ++i)
                res[i] = bmask.data[i] < size ? x.data[bmask.data[i]] : y.data[bmask.data[i] - size];
            return res;
        }

        // sqrt
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v)
                                          { return xsimd::sqrt(v); },
                                          self);
        }

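        // slide_left / slide_right shift the register contents by M bytes (not
        // lanes), zero-filling the vacated bytes: slide_left moves bytes toward
        // higher lane indices, slide_right toward lower ones.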
        // slide_left
        template <size_t M, class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<emulated<N>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            std::array<T, size> result;
            char* raw_data = reinterpret_cast<char*>(result.data());
            memset(raw_data, 0, M);
            memcpy(raw_data + M, reinterpret_cast<const char*>(x.data.data()), sizeof(T) * result.size() - M);
            return result;
        }

        // slide_right
        template <size_t M, class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<emulated<N>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            std::array<T, size> result;
            char* raw_data = reinterpret_cast<char*>(result.data());
            memcpy(raw_data, reinterpret_cast<const char*>(x.data.data()) + M, sizeof(T) * result.size() - M);
            memset(raw_data + sizeof(T) * result.size() - M, 0, M);
            return result;
        }

        // sadd
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::sadd(v0, v1); },
                                          self, other);
        }

        // set
        template <class A, class T, size_t N, class... Values>
        XSIMD_INLINE batch<T, emulated<N>> set(batch<T, emulated<N>> const&, requires_arch<emulated<N>>, Values... values) noexcept
        {
            static_assert(sizeof...(Values) == batch<T, emulated<N>>::size, "consistent init");
            return { typename batch<T, emulated<N>>::register_type { static_cast<T>(values)... } };
        }

        template <class A, class T, size_t N, class... Values>
        XSIMD_INLINE batch_bool<T, emulated<N>> set(batch_bool<T, emulated<N>> const&, requires_arch<emulated<N>>, Values... values) noexcept
        {
            static_assert(sizeof...(Values) == batch<T, emulated<N>>::size, "consistent init");
            return { std::array<bool, sizeof...(Values)> { static_cast<bool>(values)... } };
        }

        // ssub
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::ssub(v0, v1); },
                                          self, other);
        }

        // store_aligned
        template <class A, class T, size_t N>
        XSIMD_INLINE void store_aligned(T* mem, batch<T, emulated<N>> const& self, requires_arch<emulated<N>>) noexcept
        {
            std::copy(self.data.begin(), self.data.end(), mem);
        }

        // store_unaligned
        template <class A, class T, size_t N>
        XSIMD_INLINE void store_unaligned(T* mem, batch<T, emulated<N>> const& self, requires_arch<emulated<N>>) noexcept
        {
            std::copy(self.data.begin(), self.data.end(), mem);
        }

        // sub
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            return detail::emulated_apply([](T v0, T v1)
                                          { return xsimd::sub(v0, v1); },
                                          self, other);
        }

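        // swizzle permutes the lanes of self according to the compile-time
        // index batch: lane i of the result is self.data[bmask.data[i]].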
        // swizzle
        template <class A, typename T, class ITy, ITy... Is>
        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<ITy, A, Is...> mask, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            batch<ITy, A> bmask = mask;
            std::array<T, size> res;
            for (size_t i = 0; i < size; ++i)
                res[i] = self.data[bmask.data[i]];
            return res;
        }

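        // zip_lo interleaves the low halves of the operands
        // (self[0], other[0], self[1], other[1], ...); zip_hi does the same
        // with the high halves, starting at lane size / 2.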
        // zip_hi
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            // Note: irregular behavior for odd batch sizes.
            std::array<T, size> res;
            if (size % 2)
            {
                for (size_t i = 0; i < size; ++i)
                    res[i] = (i % 2 ? self : other).data[size / 2 + i / 2];
            }
            else
            {
                for (size_t i = 0; i < size; ++i)
                    res[i] = (i % 2 ? other : self).data[size / 2 + i / 2];
            }
            return res;
        }

        // zip_lo
        template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            // Note: irregular behavior for odd batch sizes.
            std::array<T, size> res;
            for (size_t i = 0; i < size; ++i)
                res[i] = (i % 2 ? other : self).data[i / 2];
            return res;
        }
    }
}

#endif