EIC code displayed by LXR

0001 /***************************************************************************
0002  * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
0003  * Martin Renou                                                             *
0004  * Copyright (c) QuantStack                                                 *
0005  * Copyright (c) Serge Guelton                                              *
0006  * Copyright (c) Anutosh Bhat                                               *
0007  *                                                                          *
0008  * Distributed under the terms of the BSD 3-Clause License.                 *
0009  *                                                                          *
0010  * The full license is in the file LICENSE, distributed with this software. *
0011  ****************************************************************************/
0012 
0013 #ifndef XSIMD_WASM_HPP
0014 #define XSIMD_WASM_HPP
0015 
0016 #include <type_traits>
0017 
0018 #include "../types/xsimd_wasm_register.hpp"
0019 
0020 namespace xsimd
0021 {
0022     template <typename T, class A, bool... Values>
0023     struct batch_bool_constant;
0024 
0025     template <class T_out, class T_in, class A>
0026     XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
0027 
0028     template <typename T, class A, T... Values>
0029     struct batch_constant;
0030 
0031     namespace kernel
0032     {
0033         using namespace types;
0034 
0035         // fwd
0036         template <class A, class T, size_t I>
0037         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
0038         template <class A, typename T, typename ITy, ITy... Indices>
0039         XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
0040         template <class A, class T>
0041         XSIMD_INLINE batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
0042         template <class A, class T>
0043         XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<generic>) noexcept;
0044 
0045         // abs
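             // Integer kernels below dispatch on sizeof(T) to the intrinsic of the
             // matching lane width (8/16/32/64 bits); sizes with no WASM counterpart
             // fall through to the assert.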
0046         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
0047         XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<wasm>) noexcept
0048         {
0049             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0050             {
0051                 return wasm_i8x16_abs(self);
0052             }
0053             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0054             {
0055                 return wasm_i16x8_abs(self);
0056             }
0057             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0058             {
0059                 return wasm_i32x4_abs(self);
0060             }
0061             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0062             {
0063                 return wasm_i64x2_abs(self);
0064             }
0065             else
0066             {
0067                 assert(false && "unsupported arch/op combination");
0068                 return {};
0069             }
0070         }
0071 
0072         template <class A>
0073         XSIMD_INLINE batch<float, A> abs(batch<float, A> const& self, requires_arch<wasm>) noexcept
0074         {
0075             return wasm_f32x4_abs(self);
0076         }
0077 
0078         template <class A>
0079         XSIMD_INLINE batch<double, A> abs(batch<double, A> const& self, requires_arch<wasm>) noexcept
0080         {
0081             return wasm_f64x2_abs(self);
0082         }
0083 
0084         // add
0085         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0086         XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
0087         {
0088             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0089             {
0090                 return wasm_i8x16_add(self, other);
0091             }
0092             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0093             {
0094                 return wasm_i16x8_add(self, other);
0095             }
0096             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0097             {
0098                 return wasm_i32x4_add(self, other);
0099             }
0100             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0101             {
0102                 return wasm_i64x2_add(self, other);
0103             }
0104             else
0105             {
0106                 assert(false && "unsupported arch/op combination");
0107                 return {};
0108             }
0109         }
0110 
0111         template <class A>
0112         XSIMD_INLINE batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
0113         {
0114             return wasm_f32x4_add(self, other);
0115         }
0116 
0117         template <class A>
0118         XSIMD_INLINE batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
0119         {
0120             return wasm_f64x2_add(self, other);
0121         }
0122 
0123         // avgr
0124         template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
0125         XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
0126         {
0127             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0128             {
0129                 return wasm_u8x16_avgr(self, other);
0130             }
0131             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0132             {
0133                 return wasm_u16x8_avgr(self, other);
0134             }
0135             else
0136             {
0137                 return avgr(self, other, generic {});
0138             }
0139         }
0140 
0141         // avg
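             // avgr computes the rounded average (a + b + 1) >> 1; subtracting 1 whenever
             // a and b differ in their lowest bit yields the truncating average
             // (a + b) >> 1. The ((a ^ b) << k) >> k pair below isolates that low bit
             // per lane (logical shifts, since T is unsigned).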
0142         template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
0143         XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
0144         {
0145             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0146             {
0147                 auto adj = ((self ^ other) << 7) >> 7;
0148                 return avgr(self, other, A {}) - adj;
0149             }
0150             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0151             {
0152                 auto adj = ((self ^ other) << 15) >> 15;
0153                 return avgr(self, other, A {}) - adj;
0154             }
0155             else
0156             {
0157                 return avg(self, other, generic {});
0158             }
0159         }
0160 
0161         // all
0162         template <class A>
0163         XSIMD_INLINE bool all(batch_bool<float, A> const& self, requires_arch<wasm>) noexcept
0164         {
0165             return wasm_i32x4_bitmask(self) == 0x0F;
0166         }
0167         template <class A>
0168         XSIMD_INLINE bool all(batch_bool<double, A> const& self, requires_arch<wasm>) noexcept
0169         {
0170             return wasm_i64x2_bitmask(self) == 0x03;
0171         }
0172         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0173         XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
0174         {
0175             return wasm_i8x16_bitmask(self) == 0xFFFF;
0176         }
0177 
0178         // any
0179         template <class A>
0180         XSIMD_INLINE bool any(batch_bool<float, A> const& self, requires_arch<wasm>) noexcept
0181         {
0182             return wasm_i32x4_bitmask(self) != 0;
0183         }
0184         template <class A>
0185         XSIMD_INLINE bool any(batch_bool<double, A> const& self, requires_arch<wasm>) noexcept
0186         {
0187             return wasm_i64x2_bitmask(self) != 0;
0188         }
0189         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0190         XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
0191         {
0192             return wasm_i8x16_bitmask(self) != 0;
0193         }
0194 
0195         // batch_bool_cast
0196         template <class A, class T_out, class T_in>
0197         XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<wasm>) noexcept
0198         {
0199             return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
0200         }
0201 
0202         // bitwise_and
0203         template <class A, class T>
0204         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
0205         {
0206             return wasm_v128_and(self, other);
0207         }
0208 
0209         template <class A, class T>
0210         XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
0211         {
0212             return wasm_v128_and(self, other);
0213         }
0214 
0215         // bitwise_andnot
0216         template <class A, class T>
0217         XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
0218         {
0219             return wasm_v128_andnot(self, other);
0220         }
0221 
0222         template <class A, class T>
0223         XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
0224         {
0225             return wasm_v128_andnot(self, other);
0226         }
0227 
0228         // bitwise_cast
0229         template <class A, class T, class Tp>
0230         XSIMD_INLINE batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<wasm>) noexcept
0231         {
0232             return batch<Tp, A>(self.data);
0233         }
0234 
0235         // bitwise_or
0236         template <class A, class T>
0237         XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
0238         {
0239             return wasm_v128_or(self, other);
0240         }
0241 
0242         template <class A, class T>
0243         XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
0244         {
0245             return wasm_v128_or(self, other);
0246         }
0247 
0248         // bitwise_lshift
0249         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0250         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<wasm>) noexcept
0251         {
0252             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0253             {
0254                 return wasm_i8x16_shl(self, other);
0255             }
0256             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0257             {
0258                 return wasm_i16x8_shl(self, other);
0259             }
0260             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0261             {
0262                 return wasm_i32x4_shl(self, other);
0263             }
0264             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0265             {
0266                 return wasm_i64x2_shl(self, other);
0267             }
0268             else
0269             {
0270                 assert(false && "unsupported arch/op combination");
0271                 return {};
0272             }
0273         }
0274 
0275         // bitwise_rshift
0276         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0277         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<wasm>) noexcept
0278         {
0279             if (std::is_signed<T>::value)
0280             {
0281                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0282                 {
0283                     return wasm_i8x16_shr(self, other);
0284                 }
0285                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0286                 {
0287                     return wasm_i16x8_shr(self, other);
0288                 }
0289                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0290                 {
0291                     return wasm_i32x4_shr(self, other);
0292                 }
0293                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0294                 {
0295                     return wasm_i64x2_shr(self, other);
0296                 }
0297                 else
0298                 {
0299                     assert(false && "unsupported arch/op combination");
0300                     return {};
0301                 }
0302             }
0303             else
0304             {
0305                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0306                 {
0307                     return wasm_u8x16_shr(self, other);
0308                 }
0309                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0310                 {
0311                     return wasm_u16x8_shr(self, other);
0312                 }
0313                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0314                 {
0315                     return wasm_u32x4_shr(self, other);
0316                 }
0317                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0318                 {
0319                     return wasm_u64x2_shr(self, other);
0320                 }
0321                 else
0322                 {
0323                     assert(false && "unsupported arch/op combination");
0324                     return {};
0325                 }
0326             }
0327         }
0328 
0329         // bitwise_not
0330         template <class A, class T>
0331         XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<wasm>) noexcept
0332         {
0333             return wasm_v128_not(self);
0334         }
0335 
0336         template <class A, class T>
0337         XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
0338         {
0339             return wasm_v128_not(self);
0340         }
0341 
0342         // bitwise_xor
0343         template <class A, class T>
0344         XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
0345         {
0346             return wasm_v128_xor(self, other);
0347         }
0348 
0349         template <class A, class T>
0350         XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
0351         {
0352             return wasm_v128_xor(self, other);
0353         }
0354 
0355         // broadcast
0356         template <class A>
0357         XSIMD_INLINE batch<float, A> broadcast(float val, requires_arch<wasm>) noexcept
0358         {
0359             return wasm_f32x4_splat(val);
0360         }
0361         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0362         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<wasm>) noexcept
0363         {
0364             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0365             {
0366                 return wasm_i8x16_splat(val);
0367             }
0368             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0369             {
0370                 return wasm_i16x8_splat(val);
0371             }
0372             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0373             {
0374                 return wasm_i32x4_splat(val);
0375             }
0376             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0377             {
0378                 return wasm_i64x2_splat(val);
0379             }
0380             else
0381             {
0382                 assert(false && "unsupported arch/op combination");
0383                 return {};
0384             }
0385         }
0386         template <class A>
0387         XSIMD_INLINE batch<double, A> broadcast(double val, requires_arch<wasm>) noexcept
0388         {
0389             return wasm_f64x2_splat(val);
0390         }
0391 
0392         // ceil
0393         template <class A>
0394         XSIMD_INLINE batch<float, A> ceil(batch<float, A> const& self, requires_arch<wasm>) noexcept
0395         {
0396             return wasm_f32x4_ceil(self);
0397         }
0398         template <class A>
0399         XSIMD_INLINE batch<double, A> ceil(batch<double, A> const& self, requires_arch<wasm>) noexcept
0400         {
0401             return wasm_f64x2_ceil(self);
0402         }
0403 
0404         // div
0405         template <class A>
0406         XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
0407         {
0408             return wasm_f32x4_div(self, other);
0409         }
0410         template <class A>
0411         XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
0412         {
0413             return wasm_f64x2_div(self, other);
0414         }
0415 
0416         // eq
0417         template <class A>
0418         XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
0419         {
0420             return wasm_f32x4_eq(self, other);
0421         }
0422         template <class A>
0423         XSIMD_INLINE batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<wasm>) noexcept
0424         {
0425             return wasm_i32x4_eq(self, other);
0426         }
0427         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0428         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
0429         {
0430             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0431             {
0432                 return wasm_i8x16_eq(self, other);
0433             }
0434             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0435             {
0436                 return wasm_i16x8_eq(self, other);
0437             }
0438             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0439             {
0440                 return wasm_i32x4_eq(self, other);
0441             }
0442             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0443             {
0444                 return wasm_i64x2_eq(self, other);
0445             }
0446             else
0447             {
0448                 assert(false && "unsupported arch/op combination");
0449                 return {};
0450             }
0451         }
0452         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0453         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
0454         {
0455             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0456             {
0457                 return wasm_i8x16_eq(self, other);
0458             }
0459             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0460             {
0461                 return wasm_i16x8_eq(self, other);
0462             }
0463             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0464             {
0465                 return wasm_i32x4_eq(self, other);
0466             }
0467             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0468             {
0469                 return wasm_i64x2_eq(self, other);
0470             }
0471             else
0472             {
0473                 assert(false && "unsupported arch/op combination");
0474                 return {};
0475             }
0476         }
0477         template <class A>
0478         XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
0479         {
0480             return wasm_f64x2_eq(self, other);
0481         }
0482         template <class A>
0483         XSIMD_INLINE batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<wasm>) noexcept
0484         {
0485             return wasm_i64x2_eq(self, other);
0486         }
0487 
0488         // fast_cast
0489         namespace detail
0490         {
0491             template <class A>
0492             XSIMD_INLINE batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<wasm>) noexcept
0493             {
0494                 return wasm_f32x4_convert_i32x4(self);
0495             }
0496 
0497             template <class A>
0498             XSIMD_INLINE batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<wasm>) noexcept
0499             {
0500                 // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
0501                 // adapted to wasm
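                 // Roughly: the high 32 bits are packed into a double offset by 2^84 and
                 // the low 32 bits into one offset by 2^52; subtracting (2^84 + 2^52) and
                 // adding the two halves reconstructs the full 64-bit value.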
0502                 v128_t xH = wasm_u64x2_shr(x, 32);
0503                 xH = wasm_v128_or(xH, wasm_f64x2_splat(19342813113834066795298816.)); //  2^84
0504                 v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
0505                 v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); //  2^52
0506                 v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(19342813118337666422669312.)); //  2^84 + 2^52
0507                 return wasm_f64x2_add(f, xL);
0508             }
0509 
0510             template <class A>
0511             XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<wasm>) noexcept
0512             {
0513                 // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
0514                 // adapted to wasm
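                 // Roughly: the sign-extended top 16 bits are folded into a double offset
                 // by 3*2^67 and the low 48 bits into one offset by 2^52; subtracting
                 // (3*2^67 + 2^52) and adding the halves reconstructs the signed value.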
0515                 v128_t xH = wasm_i32x4_shr(x, 16);
0516                 xH = wasm_v128_and(xH, wasm_i16x8_make(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
0517                 xH = wasm_i64x2_add(xH, wasm_f64x2_splat(442721857769029238784.)); //  3*2^67
0518                 v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
0519                 v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); //  2^52
0520                 v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(442726361368656609280.)); //  3*2^67 + 2^52
0521                 return wasm_f64x2_add(f, xL);
0522             }
0523 
0524             template <class A>
0525             XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<wasm>) noexcept
0526             {
0527                 return wasm_i32x4_make(
0528                     static_cast<int32_t>(wasm_f32x4_extract_lane(self, 0)),
0529                     static_cast<int32_t>(wasm_f32x4_extract_lane(self, 1)),
0530                     static_cast<int32_t>(wasm_f32x4_extract_lane(self, 2)),
0531                     static_cast<int32_t>(wasm_f32x4_extract_lane(self, 3)));
0532             }
0533         }
0534 
0535         // floor
0536         template <class A>
0537         XSIMD_INLINE batch<float, A> floor(batch<float, A> const& self, requires_arch<wasm>) noexcept
0538         {
0539             return wasm_f32x4_floor(self);
0540         }
0541 
0542         template <class A>
0543         XSIMD_INLINE batch<double, A> floor(batch<double, A> const& self, requires_arch<wasm>) noexcept
0544         {
0545             return wasm_f64x2_floor(self);
0546         }
0547 
0548         // from_mask
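             // Each mask bit selects an all-ones or all-zeros lane; the lanes are
             // materialized from lookup tables indexed by the mask (4 bits at a time
             // for the narrower lane widths).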
0549         template <class A>
0550         XSIMD_INLINE batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<wasm>) noexcept
0551         {
0552             alignas(A::alignment()) static const uint32_t lut[][4] = {
0553                 { 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
0554                 { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 },
0555                 { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 },
0556                 { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 },
0557                 { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 },
0558                 { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 },
0559                 { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
0560                 { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
0561                 { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
0562                 { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF },
0563                 { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
0564                 { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
0565                 { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
0566                 { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
0567                 { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
0568                 { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
0569             };
0570             assert(!(mask & ~0xFul) && "inbound mask");
0571             return wasm_v128_load((const v128_t*)lut[mask]);
0572         }
0573         template <class A>
0574         XSIMD_INLINE batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<wasm>) noexcept
0575         {
0576             alignas(A::alignment()) static const uint64_t lut[][2] = {
0577                 { 0x0000000000000000ul, 0x0000000000000000ul },
0578                 { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
0579                 { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
0580                 { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
0581             };
0582             assert(!(mask & ~0x3ul) && "inbound mask");
0583             return wasm_v128_load((const v128_t*)lut[mask]);
0584         }
0585         template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0586         XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<wasm>) noexcept
0587         {
0588             alignas(A::alignment()) static const uint64_t lut64[] = {
0589                 0x0000000000000000,
0590                 0x000000000000FFFF,
0591                 0x00000000FFFF0000,
0592                 0x00000000FFFFFFFF,
0593                 0x0000FFFF00000000,
0594                 0x0000FFFF0000FFFF,
0595                 0x0000FFFFFFFF0000,
0596                 0x0000FFFFFFFFFFFF,
0597                 0xFFFF000000000000,
0598                 0xFFFF00000000FFFF,
0599                 0xFFFF0000FFFF0000,
0600                 0xFFFF0000FFFFFFFF,
0601                 0xFFFFFFFF00000000,
0602                 0xFFFFFFFF0000FFFF,
0603                 0xFFFFFFFFFFFF0000,
0604                 0xFFFFFFFFFFFFFFFF,
0605             };
0606             alignas(A::alignment()) static const uint32_t lut32[] = {
0607                 0x00000000,
0608                 0x000000FF,
0609                 0x0000FF00,
0610                 0x0000FFFF,
0611                 0x00FF0000,
0612                 0x00FF00FF,
0613                 0x00FFFF00,
0614                 0x00FFFFFF,
0615                 0xFF000000,
0616                 0xFF0000FF,
0617                 0xFF00FF00,
0618                 0xFF00FFFF,
0619                 0xFFFF0000,
0620                 0xFFFF00FF,
0621                 0xFFFFFF00,
0622                 0xFFFFFFFF,
0623             };
0624             alignas(A::alignment()) static const uint32_t lut16[][4] = {
0625                 { 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
0626                 { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 },
0627                 { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 },
0628                 { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 },
0629                 { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 },
0630                 { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 },
0631                 { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
0632                 { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
0633                 { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
0634                 { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF },
0635                 { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
0636                 { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
0637                 { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
0638                 { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
0639                 { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
0640                 { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
0641             };
0642             alignas(A::alignment()) static const uint64_t lut8[][2] = {
0643                 { 0x0000000000000000ul, 0x0000000000000000ul },
0644                 { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
0645                 { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
0646                 { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
0647             };
0648             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0649             {
0650                 assert(!(mask & ~0xFFFF) && "inbound mask");
0651                 return wasm_i32x4_make(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]);
0652             }
0653             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0654             {
0655                 assert(!(mask & ~0xFF) && "inbound mask");
0656                 return wasm_i64x2_make(lut64[mask & 0xF], lut64[mask >> 4]);
0657             }
0658             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0659             {
0660                 assert(!(mask & ~0xFul) && "inbound mask");
0661                 return wasm_v128_load((const v128_t*)lut16[mask]);
0662             }
0663             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0664             {
0665                 assert(!(mask & ~0x3ul) && "inbound mask");
0666                 return wasm_v128_load((const v128_t*)lut8[mask]);
0667             }
0668         }
0669 
0670         // ge
0671         template <class A>
0672         XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
0673         {
0674             return wasm_f32x4_ge(self, other);
0675         }
0676         template <class A>
0677         XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
0678         {
0679             return wasm_f64x2_ge(self, other);
0680         }
0681 
0682         // gt
0683         template <class A>
0684         XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
0685         {
0686             return wasm_f32x4_gt(self, other);
0687         }
0688         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0689         XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
0690         {
0691             if (std::is_signed<T>::value)
0692             {
0693                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0694                 {
0695                     return wasm_i8x16_gt(self, other);
0696                 }
0697                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0698                 {
0699                     return wasm_i16x8_gt(self, other);
0700                 }
0701                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0702                 {
0703                     return wasm_i32x4_gt(self, other);
0704                 }
0705                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0706                 {
0707                     return wasm_i64x2_gt(self, other);
0708                 }
0709                 else
0710                 {
0711                     assert(false && "unsupported arch/op combination");
0712                     return {};
0713                 }
0714             }
0715             else
0716             {
0717                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0718                 {
0719                     return wasm_u8x16_gt(self, other);
0720                 }
0721                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0722                 {
0723                     return wasm_u16x8_gt(self, other);
0724                 }
0725                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0726                 {
0727                     return wasm_u32x4_gt(self, other);
0728                 }
0729                 else
0730                 {
0731                     return gt(self, other, generic {});
0732                 }
0733             }
0734         }
0735 
0736         template <class A>
0737         XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
0738         {
0739             return wasm_f64x2_gt(self, other);
0740         }
0741 
0742         // haddp
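             // Packs the horizontal sums of four rows: lane i of the result is the sum
             // of all lanes of row[i].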
0743         template <class A>
0744         XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<wasm>) noexcept
0745         {
0746             v128_t tmp0 = wasm_i32x4_shuffle(row[0], row[1], 0, 4, 1, 5);
0747             v128_t tmp1 = wasm_i32x4_shuffle(row[0], row[1], 2, 6, 3, 7);
0748             v128_t tmp2 = wasm_i32x4_shuffle(row[2], row[3], 2, 6, 3, 7);
0749             tmp0 = wasm_f32x4_add(tmp0, tmp1);
0750             tmp1 = wasm_i32x4_shuffle(row[2], row[3], 0, 4, 1, 5);
0751             tmp1 = wasm_f32x4_add(tmp1, tmp2);
0752             tmp2 = wasm_i32x4_shuffle(tmp1, tmp0, 6, 7, 2, 3);
0753             tmp0 = wasm_i32x4_shuffle(tmp0, tmp1, 0, 1, 4, 5);
0754             return wasm_f32x4_add(tmp0, tmp2);
0755         }
0756         template <class A>
0757         XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<wasm>) noexcept
0758         {
0759             return wasm_f64x2_add(wasm_i64x2_shuffle(row[0], row[1], 0, 2),
0760                                   wasm_i64x2_shuffle(row[0], row[1], 1, 3));
0761         }
0762 
0763         // insert
0764         template <class A, size_t I>
0765         XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I> pos, requires_arch<wasm>) noexcept
0766         {
0767             return wasm_f32x4_replace_lane(self, pos, val);
0768         }
0769         template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0770         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<wasm>) noexcept
0771         {
0772             if (std::is_signed<T>::value)
0773             {
0774                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0775                 {
0776                     return wasm_i8x16_replace_lane(self, pos, val);
0777                 }
0778                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0779                 {
0780                     return wasm_i16x8_replace_lane(self, pos, val);
0781                 }
0782                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0783                 {
0784                     return wasm_i32x4_replace_lane(self, pos, val);
0785                 }
0786                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0787                 {
0788                     return wasm_i64x2_replace_lane(self, pos, val);
0789                 }
0790                 else
0791                 {
0792                     assert(false && "unsupported arch/op combination");
0793                     return {};
0794                 }
0795             }
0796             else
0797             {
0798                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0799                 {
0800                     return wasm_u8x16_replace_lane(self, pos, val);
0801                 }
0802                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0803                 {
0804                     return wasm_u16x8_replace_lane(self, pos, val);
0805                 }
0806                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0807                 {
0808                     return wasm_u32x4_replace_lane(self, pos, val);
0809                 }
0810                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0811                 {
0812                     return wasm_u64x2_replace_lane(self, pos, val);
0813                 }
0814                 else
0815                 {
0816                     assert(false && "unsupported arch/op combination");
0817                     return {};
0818                 }
0819             }
0820         }
0821 
0822         template <class A, size_t I>
0823         XSIMD_INLINE batch<double, A> insert(batch<double, A> const& self, double val, index<I> pos, requires_arch<wasm>) noexcept
0824         {
0825             return wasm_f64x2_replace_lane(self, pos, val);
0826         }
0827 
0828         // isnan
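             // NaN is the only value that compares not-equal to itself.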
0829         template <class A>
0830         XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<wasm>) noexcept
0831         {
0832             return wasm_f32x4_ne(self, self);
0833         }
0834         template <class A>
0835         XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<wasm>) noexcept
0836         {
0837             return wasm_f64x2_ne(self, self);
0838         }
0839 
0840         // le
0841         template <class A>
0842         XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
0843         {
0844             return wasm_f32x4_le(self, other);
0845         }
0846         template <class A>
0847         XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
0848         {
0849             return wasm_f64x2_le(self, other);
0850         }
0851 
0852         // load_aligned
0853         template <class A>
0854         XSIMD_INLINE batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<wasm>) noexcept
0855         {
0856             return wasm_v128_load(mem);
0857         }
0858         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0859         XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<wasm>) noexcept
0860         {
0861             return wasm_v128_load((v128_t const*)mem);
0862         }
0863         template <class A>
0864         XSIMD_INLINE batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<wasm>) noexcept
0865         {
0866             return wasm_v128_load(mem);
0867         }
0868 
0869         // load_complex
0870         namespace detail
0871         {
0872             template <class A>
0873             XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<wasm>) noexcept
0874             {
0875                 return { wasm_i32x4_shuffle(hi, lo, 0, 2, 4, 6), wasm_i32x4_shuffle(hi, lo, 1, 3, 5, 7) };
0876             }
0877             template <class A>
0878             XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<wasm>) noexcept
0879             {
0880                 return { wasm_i64x2_shuffle(hi, lo, 0, 2), wasm_i64x2_shuffle(hi, lo, 1, 3) };
0881             }
0882         }
0883 
0884         // load_unaligned
0885         template <class A>
0886         XSIMD_INLINE batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<wasm>) noexcept
0887         {
0888             return wasm_v128_load(mem);
0889         }
0890         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0891         XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<wasm>) noexcept
0892         {
0893             return wasm_v128_load((v128_t const*)mem);
0894         }
0895         template <class A>
0896         XSIMD_INLINE batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<wasm>) noexcept
0897         {
0898             return wasm_v128_load(mem);
0899         }
0900 
0901         // lt
0902         template <class A>
0903         XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
0904         {
0905             return wasm_f32x4_lt(self, other);
0906         }
0907         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0908         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
0909         {
0910             if (std::is_signed<T>::value)
0911             {
0912                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0913                 {
0914                     return wasm_i8x16_lt(self, other);
0915                 }
0916                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0917                 {
0918                     return wasm_i16x8_lt(self, other);
0919                 }
0920                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0921                 {
0922                     return wasm_i32x4_lt(self, other);
0923                 }
0924                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0925                 {
0926                     return wasm_i64x2_lt(self, other);
0927                 }
0928                 else
0929                 {
0930                     assert(false && "unsupported arch/op combination");
0931                     return {};
0932                 }
0933             }
0934             else
0935             {
0936                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0937                 {
0938                     return wasm_u8x16_lt(self, other);
0939                 }
0940                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0941                 {
0942                     return wasm_u16x8_lt(self, other);
0943                 }
0944                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0945                 {
0946                     return wasm_u32x4_lt(self, other);
0947                 }
0948                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0949                 {
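                         // WASM SIMD has no unsigned 64-bit compare: bias both operands by
                         // 2^63 (xor with INT64_MIN) to turn it into a signed compare, compute
                         // the sign of the signed difference as (a & ~b) | ((a - b) & ~(a ^ b)),
                         // then broadcast each 64-bit lane's sign bit across the lane.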
0950                     auto xself = wasm_v128_xor(self, wasm_i64x2_splat(std::numeric_limits<int64_t>::lowest()));
0951                     auto xother = wasm_v128_xor(other, wasm_i64x2_splat(std::numeric_limits<int64_t>::lowest()));
0952                     v128_t tmp1 = wasm_i64x2_sub(xself, xother);
0953                     v128_t tmp2 = wasm_v128_xor(xself, xother);
0954                     v128_t tmp3 = wasm_v128_andnot(xself, xother);
0955                     v128_t tmp4 = wasm_v128_andnot(tmp1, tmp2);
0956                     v128_t tmp5 = wasm_v128_or(tmp3, tmp4);
0957                     v128_t tmp6 = wasm_i32x4_shr(tmp5, 31);
0958                     return wasm_i32x4_shuffle(tmp6, wasm_i32x4_splat(0), 1, 1, 3, 3);
0959                 }
0960                 else
0961                 {
0962                     assert(false && "unsupported arch/op combination");
0963                     return {};
0964                 }
0965             }
0966         }
0967 
0968         template <class A>
0969         XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
0970         {
0971             return wasm_f64x2_lt(self, other);
0972         }
0973 
0974         // mask
0975         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
0976         XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
0977         {
0978             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
0979             {
0980                 return wasm_i8x16_bitmask(self);
0981             }
0982             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
0983             {
0984                 return wasm_i16x8_bitmask(self);
0985             }
0986             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
0987             {
0988                 return wasm_i32x4_bitmask(self);
0989             }
0990             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
0991             {
0992                 return wasm_i64x2_bitmask(self);
0993             }
0994             else
0995             {
0996                 assert(false && "unsupported arch/op combination");
0997                 return {};
0998             }
0999         }
1000         template <class A>
1001         XSIMD_INLINE uint64_t mask(batch_bool<float, A> const& self, requires_arch<wasm>) noexcept
1002         {
1003             return wasm_i32x4_bitmask(self);
1004         }
1005 
1006         template <class A>
1007         XSIMD_INLINE uint64_t mask(batch_bool<double, A> const& self, requires_arch<wasm>) noexcept
1008         {
1009             return wasm_i64x2_bitmask(self);
1010         }
1011 
1012         // max
1013         template <class A>
1014         XSIMD_INLINE batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
1015         {
1016             return wasm_f32x4_pmax(self, other);
1017         }
1018         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1019         XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
1020         {
1021             return select(self > other, self, other);
1022         }
1023         template <class A>
1024         XSIMD_INLINE batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
1025         {
1026             return wasm_f64x2_pmax(self, other);
1027         }
1028 
1029         // min
1030         template <class A>
1031         XSIMD_INLINE batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
1032         {
1033             return wasm_f32x4_pmin(self, other);
1034         }
1035         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1036         XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
1037         {
1038             return select(self <= other, self, other);
1039         }
1040         template <class A>
1041         XSIMD_INLINE batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
1042         {
1043             return wasm_f64x2_pmin(self, other);
1044         }
1045 
1046         // mul
1047         template <class A>
1048         XSIMD_INLINE batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
1049         {
1050             return wasm_f32x4_mul(self, other);
1051         }
1052         template <class A>
1053         XSIMD_INLINE batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
1054         {
1055             return wasm_f64x2_mul(self, other);
1056         }
1057 
1058         // neg
1059         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1060         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<wasm>) noexcept
1061         {
1062             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1063             {
1064                 return wasm_i8x16_neg(self);
1065             }
1066             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1067             {
1068                 return wasm_i16x8_neg(self);
1069             }
1070             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1071             {
1072                 return wasm_i32x4_neg(self);
1073             }
1074             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1075             {
1076                 return wasm_i64x2_neg(self);
1077             }
1078             else
1079             {
1080                 assert(false && "unsupported arch/op combination");
1081                 return {};
1082             }
1083         }
1084 
1085         template <class A>
1086         XSIMD_INLINE batch<float, A> neg(batch<float, A> const& self, requires_arch<wasm>) noexcept
1087         {
1088             return wasm_f32x4_neg(self);
1089         }
1090 
1091         template <class A>
1092         XSIMD_INLINE batch<double, A> neg(batch<double, A> const& self, requires_arch<wasm>) noexcept
1093         {
1094             return wasm_f64x2_neg(self);
1095         }
1096 
1097         // neq
1098         template <class A>
1099         XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
1100         {
1101             return wasm_f32x4_ne(self, other);
1102         }
1103         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1104         XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
1105         {
1106             return ~(self == other);
1107         }
1108         template <class A>
1109         XSIMD_INLINE batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<wasm>) noexcept
1110         {
1111             return wasm_f32x4_ne(self, other);
1112         }
1113         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1114         XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<wasm>) noexcept
1115         {
1116             return ~(self == other);
1117         }
1118 
1119         template <class A>
1120         XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
1121         {
1122             return wasm_f64x2_ne(self, other);
1123         }
1124         template <class A>
1125         XSIMD_INLINE batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<wasm>) noexcept
1126         {
1127             return wasm_f64x2_ne(self, other);
1128         }
1129 
1130         // reciprocal
1131         template <class A>
1132         XSIMD_INLINE batch<float, A> reciprocal(batch<float, A> const& self, requires_arch<wasm>) noexcept
1133         {
1134             v128_t one = wasm_f32x4_splat(1.0f);
1135             return wasm_f32x4_div(one, self);
1136         }
1137         template <class A>
1138         XSIMD_INLINE batch<double, A> reciprocal(batch<double, A> const& self, requires_arch<wasm>) noexcept
1139         {
1140             v128_t one = wasm_f64x2_splat(1.0);
1141             return wasm_f64x2_div(one, self);
1142         }
1143 
1144         // reduce_add
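             // Fold the upper half onto the lower half, then add the two remaining lanes;
             // the total ends up in lane 0.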
1145         template <class A>
1146         XSIMD_INLINE float reduce_add(batch<float, A> const& self, requires_arch<wasm>) noexcept
1147         {
1148             v128_t tmp0 = wasm_f32x4_add(self, wasm_i32x4_shuffle(self, self, 6, 7, 2, 3));
1149             v128_t tmp1 = wasm_i32x4_shuffle(tmp0, tmp0, 1, 0, 4, 4);
1150             v128_t tmp2 = wasm_f32x4_add(tmp0, tmp1);
1151             v128_t tmp3 = wasm_i32x4_shuffle(tmp0, tmp2, 4, 1, 2, 3);
1152             return wasm_f32x4_extract_lane(tmp3, 0);
1153         }
1154         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1155         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<wasm>) noexcept
1156         {
1157             XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1158             {
1159                 v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0);
1160                 v128_t tmp1 = wasm_i32x4_add(self, tmp0);
1161                 v128_t tmp2 = wasm_i32x4_shuffle(tmp1, wasm_i32x4_splat(0), 1, 0, 0, 0);
1162                 v128_t tmp3 = wasm_i32x4_add(tmp1, tmp2);
1163                 return wasm_i32x4_extract_lane(tmp3, 0);
1164             }
1165             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1166             {
1167                 v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0);
1168                 v128_t tmp1 = wasm_i64x2_add(self, tmp0);
1169                 return wasm_i64x2_extract_lane(tmp1, 0);
1170             }
1171             else
1172             {
1173                 return hadd(self, generic {});
1174             }
1175         }
1176         template <class A>
1177         XSIMD_INLINE double reduce_add(batch<double, A> const& self, requires_arch<wasm>) noexcept
1178         {
1179             v128_t tmp0 = wasm_i64x2_shuffle(self, self, 1, 3);
1180             v128_t tmp1 = wasm_f64x2_add(self, tmp0);
1181             v128_t tmp2 = wasm_i64x2_shuffle(tmp0, tmp1, 2, 1);
1182             return wasm_f64x2_extract_lane(tmp2, 0);
1183         }
1184 
1185         // rsqrt
1186         template <class A>
1187         XSIMD_INLINE batch<float, A> rsqrt(batch<float, A> const& self, requires_arch<wasm>) noexcept
1188         {
1189             v128_t one = wasm_f32x4_splat(1.0f);
1190             return wasm_f32x4_div(one, wasm_f32x4_sqrt(self));
1191         }
1192         template <class A>
1193         XSIMD_INLINE batch<double, A> rsqrt(batch<double, A> const& self, requires_arch<wasm>) noexcept
1194         {
1195             v128_t one = wasm_f64x2_splat(1.0);
1196             return wasm_f64x2_div(one, wasm_f64x2_sqrt(self));
1197         }
1198 
1199         // slide_left
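             // Slides the lanes toward higher byte indices by N bytes, shifting in zeros:
             // the i8x16 shuffle reads the first N result bytes from a zero vector and the
             // rest from x; the (N & 0xF0) test maps shifts of 16 bytes or more to all zeros.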
1200         template <size_t N, class A, class T>
1201         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<wasm>) noexcept
1202         {
1203             return wasm_i8x16_shuffle(
1204                 wasm_i64x2_const(0, 0), x, ((N) & 0xF0) ? 0 : 16 - ((N) & 0xF),
1205                 ((N) & 0xF0) ? 0 : 17 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 18 - ((N) & 0xF),
1206                 ((N) & 0xF0) ? 0 : 19 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 20 - ((N) & 0xF),
1207                 ((N) & 0xF0) ? 0 : 21 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 22 - ((N) & 0xF),
1208                 ((N) & 0xF0) ? 0 : 23 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 24 - ((N) & 0xF),
1209                 ((N) & 0xF0) ? 0 : 25 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 26 - ((N) & 0xF),
1210                 ((N) & 0xF0) ? 0 : 27 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 28 - ((N) & 0xF),
1211                 ((N) & 0xF0) ? 0 : 29 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 30 - ((N) & 0xF),
1212                 ((N) & 0xF0) ? 0 : 31 - ((N) & 0xF));
1213         }
1214 
1215         // slide_right
1216         template <size_t N, class A, class T>
1217         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<wasm>) noexcept
1218         {
1219             return wasm_i8x16_shuffle(
1220                 x, wasm_i64x2_const(0, 0), ((N) & 0xF0) ? 16 : ((N) & 0xF) + 0,
1221                 ((N) & 0xF0) ? 16 : ((N) & 0xF) + 1, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 2,
1222                 ((N) & 0xF0) ? 16 : ((N) & 0xF) + 3, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 4,
1223                 ((N) & 0xF0) ? 16 : ((N) & 0xF) + 5, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 6,
1224                 ((N) & 0xF0) ? 16 : ((N) & 0xF) + 7, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 8,
1225                 ((N) & 0xF0) ? 16 : ((N) & 0xF) + 9, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 10,
1226                 ((N) & 0xF0) ? 16 : ((N) & 0xF) + 11, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 12,
1227                 ((N) & 0xF0) ? 16 : ((N) & 0xF) + 13, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 14,
1228                 ((N) & 0xF0) ? 16 : ((N) & 0xF) + 15);
1229         }
1230 
1231         // sadd
1232         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1233         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
1234         {
1235             if (std::is_signed<T>::value)
1236             {
1237                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1238                 {
1239                     return wasm_i8x16_add_sat(self, other);
1240                 }
1241                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1242                 {
1243                     return wasm_i16x8_add_sat(self, other);
1244                 }
1245                 else
1246                 {
1247                     return sadd(self, other, generic {});
1248                 }
1249             }
1250             else
1251             {
1252                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1253                 {
1254                     return wasm_u8x16_add_sat(self, other);
1255                 }
1256                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1257                 {
1258                     return wasm_u16x8_add_sat(self, other);
1259                 }
1260                 else
1261                 {
1262                     return sadd(self, other, generic {});
1263                 }
1264             }
1265         }
1266 
1267         // select
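             // Classic bitwise blend: (cond & true_br) | (false_br & ~cond), where
             // wasm_v128_andnot(a, b) computes a & ~b. This should be equivalent to
             // wasm_v128_bitselect(true_br, false_br, cond).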
1268         template <class A>
1269         XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<wasm>) noexcept
1270         {
1271             return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond));
1272         }
1273 
1274         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1275         XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<wasm>) noexcept
1276         {
1277             return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond));
1278         }
1279         template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1280         XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<wasm>) noexcept
1281         {
1282             return select(batch_bool<T, A> { Values... }, true_br, false_br, wasm {});
1283         }
1284         template <class A>
1285         XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<wasm>) noexcept
1286         {
1287             return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond));
1288         }
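             // Illustrative use through the public xsimd API, a sketch only,
             // assuming the wasm architecture tag is enabled:
             //   xsimd::batch<float, xsimd::wasm> a{1.f, 2.f, 3.f, 4.f};
             //   auto r = xsimd::select(a > 2.f, a, -a); // {-1.f, -2.f, 3.f, 4.f}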
1289 
1290         // shuffle
1291         template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
1292         XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3>, requires_arch<wasm>) noexcept
1293         {
1294             return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3);
1295         }
1296 
1297         template <class A, class ITy, ITy I0, ITy I1>
1298         XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1>, requires_arch<wasm>) noexcept
1299         {
1300             return wasm_i64x2_shuffle(x, y, I0, I1);
1301         }
1302 
1303         // set
1304         template <class A, class... Values>
1305         XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<wasm>, Values... values) noexcept
1306         {
1307             static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
1308             return wasm_f32x4_make(values...);
1309         }
1310 
1311         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1312         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<wasm>, T v0, T v1) noexcept
1313         {
1314             return wasm_i64x2_make(v0, v1);
1315         }
1316 
1317         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1318         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<wasm>, T v0, T v1, T v2, T v3) noexcept
1319         {
1320             return wasm_i32x4_make(v0, v1, v2, v3);
1321         }
1322 
1323         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1324         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<wasm>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
1325         {
1326             return wasm_i16x8_make(v0, v1, v2, v3, v4, v5, v6, v7);
1327         }
1328 
1329         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1330         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<wasm>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
1331         {
1332             return wasm_i8x16_make(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
1333         }
1334 
1335         template <class A, class... Values>
1336         XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<wasm>, Values... values) noexcept
1337         {
1338             static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
1339             return wasm_f64x2_make(values...);
1340         }
1341 
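             // batch_bool values are stored as full-width lane masks: each boolean
             // expands to all-ones (-1) or all-zeros in an integer lane of the same
             // width, and the resulting bit pattern is reused as the bool register.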
1342         template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1343         XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<wasm>, Values... values) noexcept
1344         {
                 static_assert(sizeof...(Values) == batch_bool<T, A>::size, "consistent init");
1345             return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
1346         }
1347 
1348         template <class A, class... Values>
1349         XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<wasm>, Values... values) noexcept
1350         {
1351             static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
1352             return set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data;
1353         }
1354 
1355         template <class A, class... Values>
1356         XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<wasm>, Values... values) noexcept
1357         {
1358             static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
1359             return set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data;
1360         }
1361 
1362         // ssub
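             // Same pattern as sadd: saturating subtraction exists only for 8- and
             // 16-bit lanes; wider types use the generic fallback.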
1363         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1364         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
1365         {
1366             if (std::is_signed<T>::value)
1367             {
1368                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1369                 {
1370                     return wasm_i8x16_sub_sat(self, other);
1371                 }
1372                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1373                 {
1374                     return wasm_i16x8_sub_sat(self, other);
1375                 }
1376                 else
1377                 {
1378                     return ssub(self, other, generic {});
1379                 }
1380             }
1381             else
1382             {
1383                 XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1384                 {
1385                     return wasm_u8x16_sub_sat(self, other);
1386                 }
1387                 else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1388                 {
1389                     return wasm_u16x8_sub_sat(self, other);
1390                 }
1391                 else
1392                 {
1393                     return ssub(self, other, generic {});
1394                 }
1395             }
1396         }
1397 
1398         // store_aligned
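             // wasm's v128.store accepts unaligned addresses, so the aligned and
             // unaligned store paths both lower to the same wasm_v128_store call.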
1399         template <class A>
1400         XSIMD_INLINE void store_aligned(float* mem, batch<float, A> const& self, requires_arch<wasm>) noexcept
1401         {
1402             return wasm_v128_store(mem, self);
1403         }
1404         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1405         XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<wasm>) noexcept
1406         {
1407             return wasm_v128_store((v128_t*)mem, self);
1408         }
1409         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1410         XSIMD_INLINE void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
1411         {
1412             return wasm_v128_store((v128_t*)mem, self);
1413         }
1414         template <class A>
1415         XSIMD_INLINE void store_aligned(double* mem, batch<double, A> const& self, requires_arch<wasm>) noexcept
1416         {
1417             return wasm_v128_store(mem, self);
1418         }
1419 
1420         // store_complex
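             // Helpers for the generic complex store: complex_low interleaves the
             // low halves of real() and imag() ({r0, i0, r1, i1} for float), while
             // complex_high interleaves the high halves ({r2, i2, r3, i3}).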
1421         namespace detail
1422         {
1423             // complex_low
1424             template <class A>
1425             XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<wasm>) noexcept
1426             {
1427                 return wasm_i32x4_shuffle(self.real(), self.imag(), 0, 4, 1, 5);
1428             }
1429             // complex_high
1430             template <class A>
1431             XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<wasm>) noexcept
1432             {
1433                 return wasm_i32x4_shuffle(self.real(), self.imag(), 2, 6, 3, 7);
1434             }
1435             template <class A>
1436             XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<wasm>) noexcept
1437             {
1438                 return wasm_i64x2_shuffle(self.real(), self.imag(), 0, 2);
1439             }
1440             template <class A>
1441             XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<wasm>) noexcept
1442             {
1443                 return wasm_i64x2_shuffle(self.real(), self.imag(), 1, 3);
1444             }
1445         }
1446 
1447         // store_unaligned
1448         template <class A>
1449         XSIMD_INLINE void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<wasm>) noexcept
1450         {
1451             return wasm_v128_store(mem, self);
1452         }
1453         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1454         XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<wasm>) noexcept
1455         {
1456             return wasm_v128_store((v128_t*)mem, self);
1457         }
1458         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1459         XSIMD_INLINE void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
1460         {
1461             return wasm_v128_store((v128_t*)mem, self);
1462         }
1463         template <class A>
1464         XSIMD_INLINE void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<wasm>) noexcept
1465         {
1466             return wasm_v128_store(mem, self);
1467         }
1468 
1469         // sub
1470         template <class A>
1471         XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
1472         {
1473             return wasm_f32x4_sub(self, other);
1474         }
1475         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1476         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
1477         {
1478             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1479             {
1480                 return wasm_i8x16_sub(self, other);
1481             }
1482             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1483             {
1484                 return wasm_i16x8_sub(self, other);
1485             }
1486             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1487             {
1488                 return wasm_i32x4_sub(self, other);
1489             }
1490             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1491             {
1492                 return wasm_i64x2_sub(self, other);
1493             }
1494             else
1495             {
1496                 assert(false && "unsupported arch/op combination");
1497                 return {};
1498             }
1499         }
1500         template <class A>
1501         XSIMD_INLINE batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
1502         {
1503             return wasm_f64x2_sub(self, other);
1504         }
1505 
1506         // sqrt
1507         template <class A>
1508         XSIMD_INLINE batch<float, A> sqrt(batch<float, A> const& val, requires_arch<wasm>) noexcept
1509         {
1510             return wasm_f32x4_sqrt(val);
1511         }
1512         template <class A>
1513         XSIMD_INLINE batch<double, A> sqrt(batch<double, A> const& val, requires_arch<wasm>) noexcept
1514         {
1515             return wasm_f64x2_sqrt(val);
1516         }
1517 
1518         // swizzle
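             // The batch_constant mask makes every lane index a compile-time
             // constant, so each swizzle lowers to a single immediate shuffle; the
             // signed overloads reuse the unsigned ones through bitwise_cast.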
1519         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1520         XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
1521         {
1522             return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3);
1523         }
1524 
1525         template <class A, uint64_t V0, uint64_t V1>
1526         XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<wasm>) noexcept
1527         {
1528             return wasm_i64x2_shuffle(self, self, V0, V1);
1529         }
1530 
1531         template <class A, uint64_t V0, uint64_t V1>
1532         XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<wasm>) noexcept
1533         {
1534             return wasm_i64x2_shuffle(self, self, V0, V1);
1535         }
1536 
1537         template <class A, uint64_t V0, uint64_t V1>
1538         XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1> mask, requires_arch<wasm>) noexcept
1539         {
1540             return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, wasm {}));
1541         }
1542 
1543         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1544         XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
1545         {
1546             return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3);
1547         }
1548 
1549         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
1550         XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3> mask, requires_arch<wasm>) noexcept
1551         {
1552             return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, wasm {}));
1553         }
1554 
1555         template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1556         XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<wasm>) noexcept
1557         {
1558             return wasm_i16x8_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7);
1559         }
1560 
1561         template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
1562         XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<wasm>) noexcept
1563         {
1564             return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, wasm {}));
1565         }
1566 
1567         template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
1568                   uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
1569         XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>, requires_arch<wasm>) noexcept
1570         {
1571             return wasm_i8x16_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15);
1572         }
1573 
1574         template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
1575                   uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
1576         XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<wasm>) noexcept
1577         {
1578             return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, wasm {}));
1579         }
1580 
1581         // transpose
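             // 4x4 (32-bit) transpose in two interleave rounds: first zip pairs of
             // rows element-wise, then recombine the 64-bit halves. The 2x2
             // (64-bit) case is a pair of lane shuffles; other element sizes defer
             // to the generic implementation.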
1582         template <class A, class T>
1583         XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<wasm>) noexcept
1584         {
1585             assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
1586             (void)matrix_end;
1587             XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1588             {
1589                 auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
1590 
1591                 auto t0 = wasm_i32x4_shuffle(r0, r1, 0, 4, 1, 5); // r0[0] r1[0] r0[1] r1[1]
1592                 auto t1 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); // r0[2] r1[2] r0[3] r1[3]
1593 
1594                 auto t2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5); // r2[0] r3[0] r2[1] r3[1]
1595                 auto t3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7); // r2[2] r3[2] r2[3] r3[3]
1596 
1597                 matrix_begin[0] = wasm_i32x4_shuffle(t0, t2, 0, 1, 4, 5); // r0[0] r1[0] r2[0] r3[0]
1598                 matrix_begin[1] = wasm_i32x4_shuffle(t0, t2, 2, 3, 6, 7); // r0[1] r1[1] r2[1] r3[1]
1599                 matrix_begin[2] = wasm_i32x4_shuffle(t1, t3, 0, 1, 4, 5); // r0[2] r1[2] r2[2] r3[2]
1600                 matrix_begin[3] = wasm_i32x4_shuffle(t1, t3, 2, 3, 6, 7); // r0[3] r1[3] r2[3] r3[3]
1601             }
1602             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1603             {
1604                 auto r0 = matrix_begin[0], r1 = matrix_begin[1];
1605 
1606                 matrix_begin[0] = wasm_i64x2_shuffle(r0, r1, 0, 2);
1607                 matrix_begin[1] = wasm_i64x2_shuffle(r0, r1, 1, 3);
1608             }
1609             else
1610             {
1611                 transpose(matrix_begin, matrix_end, generic {});
1612             }
1613         }
1614 
1615         // trunc
1616         template <class A>
1617         XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<wasm>) noexcept
1618         {
1619             return wasm_f32x4_trunc(self);
1620         }
1621         template <class A>
1622         XSIMD_INLINE batch<double, A> trunc(batch<double, A> const& self, requires_arch<wasm>) noexcept
1623         {
1624             return wasm_f64x2_trunc(self);
1625         }
1626 
1627         // zip_hi
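             // zip_hi interleaves the upper halves of the two batches element by
             // element (zip_lo below does the same with the lower halves).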
1628         template <class A>
1629         XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
1630         {
1631             return wasm_i32x4_shuffle(self, other, 2, 6, 3, 7);
1632         }
1633         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1634         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
1635         {
1636             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1637             {
1638                 return wasm_i8x16_shuffle(self, other, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
1639             }
1640             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1641             {
1642                 return wasm_i16x8_shuffle(self, other, 4, 12, 5, 13, 6, 14, 7, 15);
1643             }
1644             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1645             {
1646                 return wasm_i32x4_shuffle(self, other, 2, 6, 3, 7);
1647             }
1648             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1649             {
1650                 return wasm_i64x2_shuffle(self, other, 1, 3);
1651             }
1652             else
1653             {
1654                 assert(false && "unsupported arch/op combination");
1655                 return {};
1656             }
1657         }
1658         template <class A>
1659         XSIMD_INLINE batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
1660         {
1661             return wasm_i64x2_shuffle(self, other, 1, 3);
1662         }
1663 
1664         // zip_lo
1665         template <class A>
1666         XSIMD_INLINE batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<wasm>) noexcept
1667         {
1668             return wasm_i32x4_shuffle(self, other, 0, 4, 1, 5);
1669         }
1670         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1671         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
1672         {
1673             XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1674             {
1675                 return wasm_i8x16_shuffle(self, other, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
1676             }
1677             else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1678             {
1679                 return wasm_i16x8_shuffle(self, other, 0, 8, 1, 9, 2, 10, 3, 11);
1680             }
1681             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1682             {
1683                 return wasm_i32x4_shuffle(self, other, 0, 4, 1, 5);
1684             }
1685             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
1686             {
1687                 return wasm_i64x2_shuffle(self, other, 0, 2);
1688             }
1689             else
1690             {
1691                 assert(false && "unsupported arch/op combination");
1692                 return {};
1693             }
1694         }
1695         template <class A>
1696         XSIMD_INLINE batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<wasm>) noexcept
1697         {
1698             return wasm_i64x2_shuffle(self, other, 0, 2);
1699         }
1700     }
1701 }
1702 
1703 #endif