/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_MEMORY_HPP
#define XSIMD_GENERIC_MEMORY_HPP

#include <algorithm>
#include <complex>
#include <stdexcept>

#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_generic_details.hpp"

namespace xsimd
{
    template <typename T, class A, T... Values>
    struct batch_constant;

    template <typename T, class A, bool... Values>
    struct batch_bool_constant;

    namespace kernel
    {

        using namespace types;

        // compress
        namespace detail
        {
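            // Builds the permutation used by compress: starting from the
            // identity permutation, the indices of the lanes selected by
            // `bitmask` are swapped to the front, preserving their relative
            // order; the leftover slots are don't-care (the caller zeroes
            // those lanes before swizzling).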
            template <class IT, class A, class I, size_t... Is>
            XSIMD_INLINE batch<IT, A> create_compress_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence<Is...>)
            {
                alignas(A::alignment()) IT mask_buffer[batch<IT, A>::size] = { Is... };
                size_t inserted = 0;
                for (size_t i = 0; i < sizeof...(Is); ++i)
                    if ((bitmask >> i) & 1u)
                        std::swap(mask_buffer[inserted++], mask_buffer[i]);
                return batch<IT, A>::load_aligned(&mask_buffer[0]);
            }
        }

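        // compress: pack the lanes of `x` selected by `mask` to the front of
        // the result, in order; the trailing lanes are set to zero. E.g. for
        // a batch of 4, compress({a, b, c, d}, {0, 1, 1, 0}) == {b, c, 0, 0}.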
        template <typename A, typename T>
        XSIMD_INLINE batch<T, A>
        compress(batch<T, A> const& x, batch_bool<T, A> const& mask,
                 kernel::requires_arch<generic>) noexcept
        {
            using IT = as_unsigned_integer_t<T>;
            constexpr std::size_t size = batch_bool<T, A>::size;
            auto bitmask = mask.mask();
            auto z = select(mask, x, batch<T, A>((T)0));
            auto compress_mask = detail::create_compress_swizzle_mask<IT, A>(bitmask, ::xsimd::detail::make_index_sequence<size>());
            return swizzle(z, compress_mask);
        }

        // expand
        namespace detail
        {
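            // Builds the permutation used by expand: lane `Is` of the mask is
            // the number of mask bits set below position `Is`, so each
            // selected lane reads the next unconsumed source element. The
            // initializer_list is a C++11 substitute for a fold expression
            // and guarantees left-to-right evaluation of the insertions.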
            template <class IT, class A, class I, size_t... Is>
            XSIMD_INLINE batch<IT, A> create_expand_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence<Is...>)
            {
                batch<IT, A> swizzle_mask(IT(0));
                IT j = 0;
                (void)std::initializer_list<bool> { ((swizzle_mask = insert(swizzle_mask, j, index<Is>())), (j += ((bitmask >> Is) & 1u)), true)... };
                return swizzle_mask;
            }
        }

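        // expand: the inverse of compress; the leading lanes of `x` are
        // distributed to the positions selected by `mask`, and the unselected
        // lanes are set to zero. E.g. for a batch of 4,
        // expand({a, b, c, d}, {0, 1, 1, 0}) == {0, a, b, 0}.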
        template <typename A, typename T>
        XSIMD_INLINE batch<T, A>
        expand(batch<T, A> const& x, batch_bool<T, A> const& mask,
               kernel::requires_arch<generic>) noexcept
        {
            constexpr std::size_t size = batch_bool<T, A>::size;
            auto bitmask = mask.mask();
            auto swizzle_mask = detail::create_expand_swizzle_mask<as_unsigned_integer_t<T>, A>(bitmask, ::xsimd::detail::make_index_sequence<size>());
            auto z = swizzle(x, swizzle_mask);
            return select(mask, z, batch<T, A>(T(0)));
        }

        // extract_pair
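        // Conceptually concatenates [ other, self ] and extracts `size`
        // consecutive lanes starting at offset `i`: the last (size - i) lanes
        // of `other` followed by the first `i` lanes of `self`. E.g. for a
        // batch of 4, extract_pair(self, other, 1) ==
        // {other[1], other[2], other[3], self[0]}.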
        template <class A, class T>
        XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(i < size && "index in bounds");

            alignas(A::alignment()) T self_buffer[size];
            self.store_aligned(self_buffer);

            alignas(A::alignment()) T other_buffer[size];
            other.store_aligned(other_buffer);

            alignas(A::alignment()) T concat_buffer[size];

            // The first (size - i) lanes come from other, the last i lanes
            // from the start of self.
            for (std::size_t j = 0; j < size; ++j)
            {
                concat_buffer[j] = (j < size - i)
                    ? other_buffer[i + j]
                    : self_buffer[j - (size - i)];
            }
            return batch<T, A>::load_aligned(concat_buffer);
        }

        // gather
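        // Implemented as a template recursion over the lane index: lane N is
        // read from src[index.get(N)] and inserted into the batch built for
        // lanes [0, N).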
        namespace detail
        {
            // Not using XSIMD_INLINE here as it makes msvc hang forever on avx512
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline batch<T, A> gather(U const* src, batch<V, A> const& index,
                                      ::xsimd::index<N> I) noexcept
            {
                return insert(batch<T, A> {}, static_cast<T>(src[index.get(I)]), I);
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline batch<T, A>
            gather(U const* src, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                const auto test = gather<N - 1, T, A>(src, index, {});
                return insert(test, static_cast<T>(src[index.get(I)]), I);
            }
        } // namespace detail

        template <typename T, typename A, typename V>
        XSIMD_INLINE batch<T, A>
        gather(batch<T, A> const&, T const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and mismatched strides.
        template <typename T, typename A, typename U, typename V>
        XSIMD_INLINE detail::sizes_mismatch_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and matching strides.
        template <typename T, typename A, typename U, typename V>
        XSIMD_INLINE detail::stride_match_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return batch_cast<T>(kernel::gather(batch<U, A> {}, src, index, A {}));
        }

        // insert
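        // Builds a compile-time boolean mask that is true everywhere except
        // lane I, then blends `self` with a broadcast of `val`, so that only
        // lane I receives the new value.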
        template <class A, class T, size_t I>
        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
        {
            struct index_mask
            {
                static constexpr bool get(size_t index, size_t /* size */)
                {
                    return index != I;
                }
            };
            batch<T, A> tmp(val);
            return select(make_batch_bool_constant<T, A, index_mask>(), self, tmp);
        }

        // get
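        // Extracts a single lane by spilling the batch to an aligned scratch
        // buffer and indexing it; generic, at the cost of a round-trip
        // through memory.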
        template <class A, size_t I, class T>
        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        XSIMD_INLINE T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        XSIMD_INLINE auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            using T2 = typename batch<std::complex<T>, A>::value_type;
            alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, class T>
        XSIMD_INLINE T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        XSIMD_INLINE T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        XSIMD_INLINE auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            using T2 = typename batch<std::complex<T>, A>::value_type;
            alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        // load_aligned
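        // Converting loads: dispatch on whether this architecture has a
        // vectorized conversion from T_in to T_out. The fast path loads and
        // converts in registers; the slow path converts element by element
        // through a scalar buffer.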
        namespace detail
        {
            template <class A, class T_in, class T_out>
            XSIMD_INLINE batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {});
            }
            template <class A, class T_in, class T_out>
            XSIMD_INLINE batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                using batch_type_out = batch<T_out, A>;
                alignas(A::alignment()) T_out buffer[batch_type_out::size];
                std::copy(mem, mem + batch_type_out::size, std::begin(buffer));
                return batch_type_out::load_aligned(buffer);
            }
        }
        template <class A, class T_in, class T_out>
        XSIMD_INLINE batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_aligned<A>(mem, cvt, A {}, detail::conversion_type<A, T_in, T_out> {});
        }

        // load_unaligned
        namespace detail
        {
            template <class A, class T_in, class T_out>
            XSIMD_INLINE batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {});
            }

            template <class A, class T_in, class T_out>
            XSIMD_INLINE batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                return load_aligned<A>(mem, cvt, generic {}, with_slow_conversion {});
            }
        }
        template <class A, class T_in, class T_out>
        XSIMD_INLINE batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_unaligned<A>(mem, cvt, generic {}, detail::conversion_type<A, T_in, T_out> {});
        }

        // rotate_right
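        // rotate_right<N>: lane i of the result reads lane (i - N) % size of
        // the input, e.g. rotate_right<1>({a, b, c, d}) == {d, a, b, c}. The
        // unsigned wrap-around in the generator is harmless because batch
        // sizes are powers of two.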
        template <size_t N, class A, class T>
        XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            struct rotate_generator
            {
                static constexpr size_t get(size_t index, size_t size)
                {
                    return (index - N) % size;
                }
            };

            return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {});
        }

        template <size_t N, class A, class T>
        XSIMD_INLINE batch<std::complex<T>, A> rotate_right(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return { rotate_right<N>(self.real()), rotate_right<N>(self.imag()) };
        }

        // rotate_left
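        // rotate_left<N>: the inverse rotation; lane i of the result reads
        // lane (i + N) % size of the input.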
        template <size_t N, class A, class T>
        XSIMD_INLINE batch<T, A> rotate_left(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            struct rotate_generator
            {
                static constexpr size_t get(size_t index, size_t size)
                {
                    return (index + N) % size;
                }
            };

            return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {});
        }

        template <size_t N, class A, class T>
        XSIMD_INLINE batch<std::complex<T>, A> rotate_left(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return { rotate_left<N>(self.real()), rotate_left<N>(self.imag()) };
        }

        // Scatter with runtime indexes.
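        // Like gather, scatter recurses over the lane index: lanes [0, N) are
        // stored first, then lane N is written to dst[index.get(N)].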
        namespace detail
        {
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            XSIMD_INLINE void scatter(batch<T, A> const& src, U* dst,
                                      batch<V, A> const& index,
                                      ::xsimd::index<N> I) noexcept
            {
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            XSIMD_INLINE void
            scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index,
                    ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                kernel::detail::scatter<N - 1, T, A, U, V>(
                    src, dst, index, {});
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }
        } // namespace detail

        template <typename A, typename T, typename V>
        XSIMD_INLINE void
        scatter(batch<T, A> const& src, T* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, T, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        XSIMD_INLINE detail::sizes_mismatch_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, U, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        XSIMD_INLINE detail::stride_match_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            const auto tmp = batch_cast<U>(src);
            kernel::scatter<A>(tmp, dst, index, A {});
        }

        // shuffle
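        // The helpers below recognize index patterns for which the two-input
        // shuffle degenerates into a cheaper primitive: a swizzle of a single
        // input, a zip_lo / zip_hi interleave, or a per-lane select.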
        namespace detail
        {
            constexpr bool is_swizzle_fst(size_t)
            {
                return true;
            }
            template <typename ITy, typename... ITys>
            constexpr bool is_swizzle_fst(size_t bsize, ITy index, ITys... indices)
            {
                return index < bsize && is_swizzle_fst(bsize, indices...);
            }
            constexpr bool is_swizzle_snd(size_t)
            {
                return true;
            }
            template <typename ITy, typename... ITys>
            constexpr bool is_swizzle_snd(size_t bsize, ITy index, ITys... indices)
            {
                return index >= bsize && is_swizzle_snd(bsize, indices...);
            }

            constexpr bool is_zip_lo(size_t)
            {
                return true;
            }

            template <typename ITy>
            constexpr bool is_zip_lo(size_t, ITy)
            {
                return false;
            }

            template <typename ITy0, typename ITy1, typename... ITys>
            constexpr bool is_zip_lo(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices)
            {
                // The pair at position p must be { x[p], y[p] }, i.e. { p, bsize + p }.
                return index0 == (bsize - (sizeof...(indices) + 2)) / 2 && index1 == bsize + (bsize - (sizeof...(indices) + 2)) / 2 && is_zip_lo(bsize, indices...);
            }

            constexpr bool is_zip_hi(size_t)
            {
                return true;
            }

            template <typename ITy>
            constexpr bool is_zip_hi(size_t, ITy)
            {
                return false;
            }

            template <typename ITy0, typename ITy1, typename... ITys>
            constexpr bool is_zip_hi(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices)
            {
                // The pair at position p must be { x[bsize / 2 + p], y[bsize / 2 + p] }.
                return index0 == bsize / 2 + (bsize - (sizeof...(indices) + 2)) / 2 && index1 == bsize + bsize / 2 + (bsize - (sizeof...(indices) + 2)) / 2 && is_zip_hi(bsize, indices...);
            }

            constexpr bool is_select(size_t)
            {
                return true;
            }

            template <typename ITy, typename... ITys>
            constexpr bool is_select(size_t bsize, ITy index, ITys... indices)
            {
                // Lane p must pick either x[p] or y[p], i.e. index p or bsize + p.
                return (index < bsize ? index : index - bsize) == (bsize - sizeof...(ITys) - 1) && is_select(bsize, indices...);
            }

        }

        template <class A, typename T, typename ITy, ITy... Indices>
        XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept
        {
            constexpr size_t bsize = sizeof...(Indices);
            static_assert(bsize == batch<T, A>::size, "valid shuffle");

            // Detect common patterns
            XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...))
            {
                return swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>());
            }

            XSIMD_IF_CONSTEXPR(detail::is_swizzle_snd(bsize, Indices...))
            {
                return swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>());
            }

            XSIMD_IF_CONSTEXPR(detail::is_zip_lo(bsize, Indices...))
            {
                return zip_lo(x, y);
            }

            XSIMD_IF_CONSTEXPR(detail::is_zip_hi(bsize, Indices...))
            {
                return zip_hi(x, y);
            }

            XSIMD_IF_CONSTEXPR(detail::is_select(bsize, Indices...))
            {
                return select(batch_bool_constant<T, A, (Indices < bsize)...>(), x, y);
            }

#if defined(__has_builtin) && !defined(XSIMD_WITH_EMULATED)
#if __has_builtin(__builtin_shufflevector)
#define builtin_shuffle __builtin_shufflevector
#endif
#endif

#if defined(builtin_shuffle)
            typedef T vty __attribute__((__vector_size__(sizeof(batch<T, A>))));
            return (typename batch<T, A>::register_type)builtin_shuffle((vty)x.data, (vty)y.data, Indices...);

// FIXME: my experiments show that GCC only correctly optimizes this builtin
// starting at GCC 13, where it already has __builtin_shufflevector
//
// #elif __has_builtin(__builtin_shuffle) || GCC >= 6
//            typedef ITy integer_vector_type __attribute__((vector_size(sizeof(batch<ITy, A>))));
//            return __builtin_shuffle(x.data, y.data, integer_vector_type{Indices...});
#else
            // Use a generic pattern. It is suboptimal, but clang optimizes it
            // pretty well.
            batch<T, A> x_lane = swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
            batch<T, A> y_lane = swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
            batch_bool_constant<T, A, (Indices < bsize)...> select_x_lane;
            return select(select_x_lane, x_lane, y_lane);
#endif
        }

        // store
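        // Stores a batch_bool to a bool array by materializing it as a
        // numeric batch and narrowing each lane to bool.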
        template <class T, class A>
        XSIMD_INLINE void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            constexpr auto size = batch_bool<T, A>::size;
            alignas(A::alignment()) T buffer[size];
            kernel::store_aligned<A>(&buffer[0], batch_type(self), A {});
            for (std::size_t i = 0; i < size; ++i)
                mem[i] = bool(buffer[i]);
        }

        // store_aligned
        template <class A, class T_in, class T_out>
        XSIMD_INLINE void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            alignas(A::alignment()) T_in buffer[batch<T_in, A>::size];
            store_aligned(&buffer[0], self);
            std::copy(std::begin(buffer), std::end(buffer), mem);
        }

        // store_unaligned
        template <class A, class T_in, class T_out>
        XSIMD_INLINE void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            return store_aligned<A>(mem, self, generic {});
        }

        // swizzle
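        // Swizzle with a runtime mask: permute the lanes one element at a
        // time through aligned scratch buffers. Complex batches swizzle their
        // real and imaginary parts independently with the same mask.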
        template <class A, class T, class ITy, ITy... Vs>
        XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<generic>) noexcept
        {
            return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
        }

        template <class A, class T, class ITy>
        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<ITy, A> mask, requires_arch<generic>) noexcept
        {
            constexpr size_t size = batch<T, A>::size;
            alignas(A::alignment()) T self_buffer[size];
            store_aligned(&self_buffer[0], self);

            alignas(A::alignment()) ITy mask_buffer[size];
            store_aligned(&mask_buffer[0], mask);

            alignas(A::alignment()) T out_buffer[size];
            for (size_t i = 0; i < size; ++i)
                out_buffer[i] = self_buffer[mask_buffer[i]];
            return batch<T, A>::load_aligned(out_buffer);
        }

        template <class A, class T, class ITy>
        XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch<ITy, A> mask, requires_arch<generic>) noexcept
        {
            return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
        }

        // load_complex_aligned
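        // A batch of std::complex is held as separate real and imaginary
        // batches, while memory holds interleaved (re, im) pairs. Loading
        // therefore reads two consecutive real batches and defers the
        // deinterleaving to the architecture-specific load_complex /
        // complex_high / complex_low helpers; the generic versions below only
        // exist to fail loudly when no architecture provides them.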
        namespace detail
        {
            template <class A, class T>
            XSIMD_INLINE batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "load_complex not implemented for the required architecture");
            }

            template <class A, class T>
            XSIMD_INLINE batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_high not implemented for the required architecture");
            }

            template <class A, class T>
            XSIMD_INLINE batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_low not implemented for the required architecture");
            }
        }

        template <class A, class T_out, class T_in>
        XSIMD_INLINE batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_aligned(buffer),
                       lo = real_batch::load_aligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // load_complex_unaligned
        template <class A, class T_out, class T_in>
        XSIMD_INLINE batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_unaligned(buffer),
                       lo = real_batch::load_unaligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // store_complex_aligned
        template <class A, class T_out, class T_in>
        XSIMD_INLINE void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_aligned(buffer);
            hi.store_aligned(buffer + real_batch::size);
        }

        // store_complex_unaligned
        template <class A, class T_out, class T_in>
        XSIMD_INLINE void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_unaligned(buffer);
            hi.store_unaligned(buffer + real_batch::size);
        }

        // transpose
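        // In-place transpose of a size x size matrix of batches: spill the
        // rows to an aligned scratch matrix, mirror it across the diagonal,
        // and reload.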
        template <class A, class T>
        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<generic>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            alignas(A::alignment()) T scratch_buffer[batch<T, A>::size * batch<T, A>::size];
            for (size_t i = 0; i < batch<T, A>::size; ++i)
            {
                matrix_begin[i].store_aligned(&scratch_buffer[i * batch<T, A>::size]);
            }
            // FIXME: this is super naive; we can probably do better.
            for (size_t i = 0; i < batch<T, A>::size; ++i)
            {
                for (size_t j = 0; j < i; ++j)
                {
                    std::swap(scratch_buffer[i * batch<T, A>::size + j],
                              scratch_buffer[j * batch<T, A>::size + i]);
                }
            }
            for (size_t i = 0; i < batch<T, A>::size; ++i)
            {
                matrix_begin[i] = batch<T, A>::load_aligned(&scratch_buffer[i * batch<T, A>::size]);
            }
        }

    }

}

#endif