arch/generic/xsimd_generic_details.hpp

0001 /***************************************************************************
0002  * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
0003  * Martin Renou                                                             *
0004  * Copyright (c) QuantStack                                                 *
0005  * Copyright (c) Serge Guelton                                              *
0006  *                                                                          *
0007  * Distributed under the terms of the BSD 3-Clause License.                 *
0008  *                                                                          *
0009  * The full license is in the file LICENSE, distributed with this software. *
0010  ****************************************************************************/
0011
0012 #ifndef XSIMD_GENERIC_DETAILS_HPP
0013 #define XSIMD_GENERIC_DETAILS_HPP
0014
0015 #include <complex>
0016
0017 #include "../../math/xsimd_rem_pio2.hpp"
0018 #include "../../types/xsimd_generic_arch.hpp"
0019 #include "../../types/xsimd_utils.hpp"
0020 #include "../xsimd_constants.hpp"
0021
0022 namespace xsimd
0023 {
0024     // Forward declarations. Should we put them in a separate file?
         // They only announce xsimd-level functions so that code further down in this
         // header can name them (e.g. fast_cast uses bitwise_cast, batch_cast and select;
         // horner uses fma). None of them is defined in this header.
0025     template <class T, class A>
0026     XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self) noexcept;
0027     template <class T, class A>
0028     XSIMD_INLINE batch<T, A> abs(batch<std::complex<T>, A> const& self) noexcept;
0029     template <class T, class A>
0030     XSIMD_INLINE bool any(batch_bool<T, A> const& self) noexcept;
0031     template <class T, class A>
0032     XSIMD_INLINE batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other) noexcept;
0033     template <class A, class T_out, class T_in>
0034     XSIMD_INLINE batch<T_out, A> batch_cast(batch<T_in, A> const&, batch<T_out, A> const& out) noexcept;
0035     template <class T, class A>
0036     XSIMD_INLINE batch<T, A> bitofsign(batch<T, A> const& self) noexcept;
0037     template <class T_out, class T_in, class A>
0038     XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& self) noexcept;
0039     template <class T, class A>
0040     XSIMD_INLINE batch<T, A> cos(batch<T, A> const& self) noexcept;
0041     template <class T, class A>
0042     XSIMD_INLINE batch<T, A> cosh(batch<T, A> const& self) noexcept;
0043     template <class T, class A>
0044     XSIMD_INLINE batch<T, A> exp(batch<T, A> const& self) noexcept;
0045     template <class T, class A>
0046     XSIMD_INLINE batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
0047     template <class T, class A>
0048     XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
0049     template <class T, class A>
0050     XSIMD_INLINE batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
0051     template <class T, class A, uint64_t... Coefs>
0052     XSIMD_INLINE batch<T, A> horner(const batch<T, A>& self) noexcept;
0053     template <class T, class A>
0054     XSIMD_INLINE batch<T, A> hypot(const batch<T, A>& self) noexcept;
0055     template <class T, class A>
0056     XSIMD_INLINE batch_bool<T, A> is_even(batch<T, A> const& self) noexcept;
0057     template <class T, class A>
0058     XSIMD_INLINE batch_bool<T, A> is_flint(batch<T, A> const& self) noexcept;
0059     template <class T, class A>
0060     XSIMD_INLINE batch_bool<T, A> is_odd(batch<T, A> const& self) noexcept;
0061     template <class T, class A>
0062     XSIMD_INLINE typename batch<T, A>::batch_bool_type isinf(batch<T, A> const& self) noexcept;
0063     template <class T, class A>
0064     XSIMD_INLINE typename batch<T, A>::batch_bool_type isfinite(batch<T, A> const& self) noexcept;
0065     template <class T, class A>
0066     XSIMD_INLINE typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& self) noexcept;
0067     template <class T, class A>
0068     XSIMD_INLINE batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
0069     template <class T, class A>
0070     XSIMD_INLINE batch<T, A> log(batch<T, A> const& self) noexcept;
0071     template <class T, class A>
0072     XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& self) noexcept;
0073     template <class T, class A>
0074     XSIMD_INLINE batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
0075     template <class T, class A>
0076     XSIMD_INLINE T reduce_add(batch<T, A> const&) noexcept;
0077     template <class T, class A>
0078     XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
0079     template <class T, class A>
0080     XSIMD_INLINE batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
0081     template <class T, class A>
0082     XSIMD_INLINE batch<T, A> sign(batch<T, A> const& self) noexcept;
0083     template <class T, class A>
0084     XSIMD_INLINE batch<T, A> signnz(batch<T, A> const& self) noexcept;
0085     template <class T, class A>
0086     XSIMD_INLINE batch<T, A> sin(batch<T, A> const& self) noexcept;
0087     template <class T, class A>
0088     XSIMD_INLINE batch<T, A> sinh(batch<T, A> const& self) noexcept;
0089     template <class T, class A>
0090     XSIMD_INLINE std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self) noexcept;
0091     template <class T, class A>
0092     XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& self) noexcept;
0093     template <class T, class A>
0094     XSIMD_INLINE batch<T, A> tan(batch<T, A> const& self) noexcept;
0095     template <class T, class A>
0096     XSIMD_INLINE batch<as_float_t<T>, A> to_float(batch<T, A> const& self) noexcept;
0097     template <class T, class A>
0098     XSIMD_INLINE batch<as_integer_t<T>, A> to_int(batch<T, A> const& self) noexcept;
0099     template <class T, class A>
0100     XSIMD_INLINE batch<T, A> trunc(batch<T, A> const& self) noexcept;
0101
0102     namespace kernel
0103     {
0104
0105         namespace detail
0106         {
0107             template <class F, class A, class T, class... Batches>
0108             XSIMD_INLINE batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
0109             {
                     // Element-wise scalar fallback for a binary operation: both operands
                     // are spilled to aligned stack arrays, func is called on each pair of
                     // elements, and the results are reloaded as a batch.
                     // The Batches pack is not used by this overload.
0110                 constexpr std::size_t lane_count = batch<T, A>::size;
0111                 alignas(A::alignment()) T lhs_lanes[lane_count];
0112                 alignas(A::alignment()) T rhs_lanes[lane_count];
0113                 self.store_aligned(&lhs_lanes[0]);
0114                 other.store_aligned(&rhs_lanes[0]);
                     // Each result overwrites the left-hand element it was computed from.
0115                 for (std::size_t lane = 0; lane != lane_count; ++lane)
0116                 {
0117                     lhs_lanes[lane] = func(lhs_lanes[lane], rhs_lanes[lane]);
0118                 }
0119                 return batch<T, A>::load_aligned(&lhs_lanes[0]);
0120             }
0121
0122             template <class U, class F, class A, class T>
0123             XSIMD_INLINE batch<U, A> apply_transform(F&& func, batch<T, A> const& self) noexcept
0124             {
                     // Element-wise scalar fallback for a unary operation that may change
                     // the element type from T to U. Both batch types must hold the same
                     // number of elements, which is checked at compile time.
0125                 static_assert(batch<T, A>::size == batch<U, A>::size,
0126                               "Source and destination sizes must match");
0127                 constexpr std::size_t in_lanes = batch<T, A>::size;
0128                 constexpr std::size_t out_lanes = batch<U, A>::size;
0129                 alignas(A::alignment()) T inputs[in_lanes];
0130                 alignas(A::alignment()) U outputs[out_lanes];
0131                 self.store_aligned(&inputs[0]);
                     // Convert one element at a time into the destination buffer.
0132                 for (std::size_t lane = 0; lane != in_lanes; ++lane)
0133                 {
0134                     outputs[lane] = func(inputs[lane]);
0135                 }
0136                 return batch<U, A>::load_aligned(&outputs[0]);
0137             }
0138         }
0139
0140         // Some generic fast_cast conversions
0141         namespace detail
0142         {
0143             template <class A>
0144             XSIMD_INLINE batch<uint8_t, A> fast_cast(batch<int8_t, A> const& self, batch<uint8_t, A> const&, requires_arch<generic>) noexcept
0145             {
                     // Same-width signed -> unsigned conversion. This overload and the
                     // seven that follow all forward to bitwise_cast on the input batch
                     // (presumably a plain reinterpretation of the bits). The second
                     // and third parameters are unnamed and unused in the bodies; they
                     // only take part in overload selection.
0146                 return bitwise_cast<uint8_t>(self);
0147             }
0148             template <class A>
0149             XSIMD_INLINE batch<uint16_t, A> fast_cast(batch<int16_t, A> const& self, batch<uint16_t, A> const&, requires_arch<generic>) noexcept
0150             {
0151                 return bitwise_cast<uint16_t>(self);
0152             }
0153             template <class A>
0154             XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<int32_t, A> const& self, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
0155             {
0156                 return bitwise_cast<uint32_t>(self);
0157             }
0158             template <class A>
0159             XSIMD_INLINE batch<uint64_t, A> fast_cast(batch<int64_t, A> const& self, batch<uint64_t, A> const&, requires_arch<generic>) noexcept
0160             {
0161                 return bitwise_cast<uint64_t>(self);
0162             }
                 // Unsigned -> signed counterparts of the four overloads above.
0163             template <class A>
0164             XSIMD_INLINE batch<int8_t, A> fast_cast(batch<uint8_t, A> const& self, batch<int8_t, A> const&, requires_arch<generic>) noexcept
0165             {
0166                 return bitwise_cast<int8_t>(self);
0167             }
0168             template <class A>
0169             XSIMD_INLINE batch<int16_t, A> fast_cast(batch<uint16_t, A> const& self, batch<int16_t, A> const&, requires_arch<generic>) noexcept
0170             {
0171                 return bitwise_cast<int16_t>(self);
0172             }
0173             template <class A>
0174             XSIMD_INLINE batch<int32_t, A> fast_cast(batch<uint32_t, A> const& self, batch<int32_t, A> const&, requires_arch<generic>) noexcept
0175             {
0176                 return bitwise_cast<int32_t>(self);
0177             }
0178             template <class A>
0179             XSIMD_INLINE batch<int64_t, A> fast_cast(batch<uint64_t, A> const& self, batch<int64_t, A> const&, requires_arch<generic>) noexcept
0180             {
0181                 return bitwise_cast<int64_t>(self);
0182             }
0183
0184             // Provide a generic uint32_t -> float cast only if we have a
0185             // non-generic int32_t -> float fast_cast.
                 // The defaulted template parameter `_` is otherwise unused: when the
                 // int32_t -> float fast_cast call inside its decltype is ill-formed for A,
                 // substitution fails and this overload is not considered.
0186             template <class A, class _ = decltype(fast_cast(std::declval<batch<int32_t, A> const&>(), std::declval<batch<float, A> const&>(), A {}))>
0187             XSIMD_INLINE batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<generic>) noexcept
0188             {
0189                 // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
                     // Split v into two 16-bit halves, send each half through the signed
                     // int32_t -> float conversion, then recombine as hi * 65536 + lo.
0190                 batch<uint32_t, A> msk_lo(0xFFFF);
0191                 batch<float, A> cnst65536f(65536.0f);
0192
0193                 auto v_lo = batch_cast<int32_t>(v & msk_lo); /* extract the 16 least significant bits of v */
0194                 auto v_hi = batch_cast<int32_t>(v >> 16); /* 16 most significant bits of v */
0195                 auto v_lo_flt = batch_cast<float>(v_lo); /* No rounding */
0196                 auto v_hi_flt = batch_cast<float>(v_hi); /* No rounding */
0197                 v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */
0198                 return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
0199             }
0200
0201             // Provide a generic float -> uint32_t cast only if we have a
0202             // non-generic float -> int32_t fast_cast.
                 // The defaulted template parameter `_` is otherwise unused: when the
                 // float -> int32_t fast_cast call inside its decltype is ill-formed for A,
                 // substitution fails and this overload is not considered.
0203             template <class A, class _ = decltype(fast_cast(std::declval<batch<float, A> const&>(), std::declval<batch<int32_t, A> const&>(), A {}))>
0204             XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& v, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
0205             {
                     // Lanes holding 2^31 or more take the large_v path below.
0206                 auto is_large = v >= batch<float, A>(1u << 31);
                     // Direct signed conversion. The integer bits are carried in a float
                     // batch because select() is called with is_large, presumably a
                     // float-typed mask, so both value operands must be float batches.
0207                 auto small_v = bitwise_cast<float>(batch_cast<int32_t>(v));
                     // Subtract 2^31 before the signed conversion, then XOR bit 31 back in.
                     // NOTE(review): batch<int32_t, A>(1u << 31) feeds an unsigned value
                     // above INT32_MAX into an int32_t batch; presumably this relies on
                     // the conversion wrapping to the sign-bit pattern -- confirm.
0208                 auto large_v = bitwise_cast<float>(
0209                     batch_cast<int32_t>(v - batch<float, A>(1u << 31))
0210                     ^ batch<int32_t, A>(1u << 31));
                     // Pick per lane, then reinterpret the selected bits as uint32_t.
0211                 return bitwise_cast<uint32_t>(select(is_large, large_v, small_v));
0212             }
0213         }
0214
0215         namespace detail
0216         {
0217             // Generic conversion handling machinery. Each architecture must define
0218             // a conversion function (fast_cast) when such a conversion exists in the
0219             // form of an intrinsic. Then we use that information to automatically decide
0220             // whether to use scalar or vector conversion when doing load / store / batch_cast.
                 //
                 // Empty marker types naming the two possible outcomes.
0221             struct with_fast_conversion
0222             {
0223             };
0224             struct with_slow_conversion
0225             {
0226             };
0227
                 // Primary template: without further information a From -> To conversion
                 // on architecture A is classified as slow.
0228             template <class A, class From, class To, class = void>
0229             struct conversion_type_impl
0230             {
0231                 using type = with_slow_conversion;
0232             };
0233
0234             using xsimd::detail::void_t;
0235
                 // Partial specialization used when the expression
                 // fast_cast(const batch<From, A>&, const batch<To, A>&, const A&) is
                 // well-formed; otherwise substitution fails and the primary template
                 // above is used instead.
0236             template <class A, class From, class To>
0237             struct conversion_type_impl<A, From, To,
0238                                         void_t<decltype(fast_cast(std::declval<const batch<From, A>&>(),
0239                                                                   std::declval<const batch<To, A>&>(),
0240                                                                   std::declval<const A&>()))>>
0241             {
0242                 using type = with_fast_conversion;
0243             };
0244
                 // Convenience alias: with_fast_conversion or with_slow_conversion.
0245             template <class A, class From, class To>
0246             using conversion_type = typename conversion_type_impl<A, From, To>::type;
0247         }
0248
0249         namespace detail
0250         {
0251             /* origin: boost/simdfunction/horn.hpp*/
0252             /*
0253              * ====================================================
0254              * copyright 2016 NumScale SAS
0255              *
0256              * Distributed under the Boost Software License, Version 1.0.
0257              * (See copy at http://boost.org/LICENSE_1_0.txt)
0258              * ====================================================
0259              */
0260             template <class B, uint64_t c>
0261             XSIMD_INLINE B coef() noexcept
0262             {
                     // The coefficient is supplied as a raw bit pattern: c is narrowed to
                     // the unsigned integer type associated with B's value type, those
                     // bits are reinterpreted as that value type, and B is constructed
                     // from the resulting scalar.
0263                 using scalar_t = typename B::value_type;
0264                 return B(bit_cast<scalar_t>(static_cast<as_unsigned_integer_t<scalar_t>>(c)));
0265             }
0266             template <class B>
0267             XSIMD_INLINE B horner(const B&) noexcept
0268             {
                     // Horner-scheme polynomial evaluation. The coefficients are template
                     // arguments given as bit patterns (decoded by coef), lowest degree
                     // first. With no coefficient at all the polynomial is zero.
0269                 return B(static_cast<typename B::value_type>(0.));
0270             }
0271
                 // A single coefficient: the polynomial is the constant c0.
0272             template <class B, uint64_t c0>
0273             XSIMD_INLINE B horner(const B&) noexcept
0274             {
0275                 return detail::coef<B, c0>();
0276             }
0277
                 // Two or more coefficients: evaluate the remaining ones recursively and
                 // combine them as fma(self, rest, c0).
0278             template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
0279             XSIMD_INLINE B horner(const B& self) noexcept
0280             {
                     const B higher_terms = horner<B, c1, args...>(self);
0281                 return fma(self, higher_terms, coef<B, c0>());
0282             }
0283
0284             /* origin: boost/simdfunction/horn1.hpp*/
0285             /*
0286              * ====================================================
0287              * copyright 2016 NumScale SAS
0288              *
0289              * Distributed under the Boost Software License, Version 1.0.
0290              * (See copy at http://boost.org/LICENSE_1_0.txt)
0291              * ====================================================
0292              */
0293             template <class B>
0294             XSIMD_INLINE B horner1(const B&) noexcept
0295             {
                     // Variant of horner with an implicit leading coefficient of one.
                     // With no explicit coefficient the result is the constant 1.
0296                 return B(1.);
0297             }
0298
                 // A single coefficient: x + c0.
0299             template <class B, uint64_t c0>
0300             XSIMD_INLINE B horner1(const B& x) noexcept
0301             {
                     const B constant_term = coef<B, c0>();
0302                 return x + constant_term;
0303             }
0304
                 // Two or more coefficients: evaluate the remaining ones recursively and
                 // combine them as fma(x, rest, c0).
0305             template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
0306             XSIMD_INLINE B horner1(const B& x) noexcept
0307             {
                     const B higher_terms = horner1<B, c1, args...>(x);
0308                 return fma(x, higher_terms, coef<B, c0>());
0309             }
0310         }
0311
0312     }
0313
0314 }
0315
0316 #endif