#ifndef XSIMD_AVX512_DQ_HPP
#define XSIMD_AVX512_DQ_HPP

#include "../types/xsimd_avx512dq_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

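        // bitwise_and: bitwise AND on float/double batches, using the AVX512DQ
        // intrinsics that operate directly on ps/pd registers.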
        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_and_ps(self, other);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_and_pd(self, other);
        }

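        // bitwise_andnot: computes self & ~other; the intrinsic arguments are
        // swapped because _mm512_andnot_ps(a, b) evaluates ~a & b.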
        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_andnot_ps(other, self);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_andnot_pd(other, self);
        }

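        // bitwise_not: flips every bit by XOR-ing with an all-ones mask.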
        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512dq>) noexcept
        {
            return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
        }
        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512dq>) noexcept
        {
            return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
        }

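        // bitwise_or: bitwise OR on float/double batches; the batch_bool
        // overload ORs the underlying mask registers.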
        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_or_ps(self, other);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_or_pd(self, other);
        }

        template <class A, class T>
        XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512dq>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return register_type(self.data | other.data);
        }

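        // bitwise_xor: bitwise XOR on float/double batches.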
        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_xor_ps(self, other);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_xor_pd(self, other);
        }

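        // haddp: horizontal add over 16 float batches; lane i of the result is
        // the sum of all lanes of row[i]. The reduction is done in two
        // shuffle-and-add folding steps defined by the macros below.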
        template <class A>
        XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512dq>) noexcept
        {
            // Step 1: for each pair of rows (a, b), pack their low 256-bit
            // halves into one register and their high halves into another,
            // then add, folding each row in half.
#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
    batch<float, avx512f> res##I; \
    { \
        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
        res##I = _mm512_add_ps(tmp1, tmp2); \
    }

            XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
            XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
            XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
            XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
            XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
            XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
            XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
            XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);

#undef XSIMD_AVX512_HADDP_STEP1

            // Step 2: fold four partial results at a time, shuffling 128-bit
            // lanes and then 32-bit elements so that a final _mm256_hadd_ps
            // produces one horizontal sum per original row.
#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \
    batch<float, avx2> halfx##I; \
    { \
        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \
        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \
        \
        auto resx1 = _mm512_add_ps(tmp1, tmp2); \
        \
        auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
        auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
        \
        auto resx2 = _mm512_add_ps(tmp3, tmp4); \
        \
        auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
        auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
        \
        auto resx3 = _mm512_add_ps(tmp5, tmp6); \
        \
        halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \
                                  _mm512_extractf32x8_ps(resx3, 1)); \
    }

            XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
            XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);

#undef XSIMD_AVX512_HADDP_STEP2

            auto concat = _mm512_castps256_ps512(halfx0);
            concat = _mm512_insertf32x8(concat, halfx1, 1);
            return concat;
        }

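        // ldexp: scales self by 2^other, converting the integer exponents to
        // double and applying _mm512_scalef_pd.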
        template <class A>
        XSIMD_INLINE batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other));
        }

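        // mul: element-wise 64-bit multiplication; _mm512_mullo_epi64 is an
        // AVX512DQ instruction.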
        template <class A>
        XSIMD_INLINE batch<uint64_t, A> mul(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_mullo_epi64(self, other);
        }

        template <class A>
        XSIMD_INLINE batch<int64_t, A> mul(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_mullo_epi64(self, other);
        }

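        // nearbyint_as_int: converts doubles to 64-bit integers using the
        // current rounding mode.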
        template <class A>
        XSIMD_INLINE batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
                                                        requires_arch<avx512dq>) noexcept
        {
            return _mm512_cvtpd_epi64(self);
        }

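        // reduce_add: sums all lanes of a float batch by adding its two 256-bit
        // halves and delegating to the AVX2 reduction.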
        template <class A>
        XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
        {
            __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
            __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
            __m256 res1 = _mm256_add_ps(tmp1, tmp2);
            return reduce_add(batch<float, avx2>(res1), avx2 {});
        }

        namespace detail
        {
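            // fast_cast: direct conversions between int64 and double batches,
            // available as single instructions with AVX512DQ.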
            template <class A>
            XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx512dq>) noexcept
            {
                return _mm512_cvtepi64_pd(x);
            }

            template <class A>
            XSIMD_INLINE batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<avx512dq>) noexcept
            {
                return _mm512_cvttpd_epi64(self);
            }
        }

    }

}

#endif