#ifndef XSIMD_AVX512_DQ_HPP
#define XSIMD_AVX512_DQ_HPP

#include "../types/xsimd_avx512dq_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

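        // bitwise_and: bitwise AND on float/double batches, using the AVX512DQ
        // intrinsics that operate directly on ps/pd registers.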
        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_and_ps(self, other);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_and_pd(self, other);
        }

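        // bitwise_andnot: computes self & ~other; the intrinsic arguments are
        // swapped because _mm512_andnot_ps(a, b) evaluates ~a & b.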
        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_andnot_ps(other, self);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_andnot_pd(other, self);
        }

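        // bitwise_not: flips every bit by XOR-ing with an all-ones mask.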
        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512dq>) noexcept
        {
            return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
        }
        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512dq>) noexcept
        {
            return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
        }

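        // bitwise_or: bitwise OR on float/double batches; the batch_bool
        // overload ORs the underlying mask registers.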
        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_or_ps(self, other);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_or_pd(self, other);
        }

        template <class A, class T>
        XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512dq>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return register_type(self.data | other.data);
        }

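        // bitwise_xor: bitwise XOR on float/double batches.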
        template <class A>
        XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_xor_ps(self, other);
        }
        template <class A>
        XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_xor_pd(self, other);
        }

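        // haddp: horizontal add over 16 float batches; lane i of the result is
        // the sum of all lanes of row[i]. The reduction is done in two
        // shuffle-and-add folding steps defined by the macros below.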
        template <class A>
        XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512dq>) noexcept
        {
            // Step 1: for each pair of rows (a, b), pack their low 256-bit
            // halves into one register and their high halves into another,
            // then add, folding each row in half.
#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
    batch<float, avx512f> res##I; \
    { \
        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
        res##I = _mm512_add_ps(tmp1, tmp2); \
    }

            XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
            XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
            XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
            XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
            XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
            XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
            XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
            XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);

#undef XSIMD_AVX512_HADDP_STEP1

            // Step 2: fold four partial results at a time, shuffling 128-bit
            // lanes and then 32-bit elements so that a final _mm256_hadd_ps
            // produces one horizontal sum per original row.
#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \
    batch<float, avx2> halfx##I; \
    { \
        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \
        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \
        \
        auto resx1 = _mm512_add_ps(tmp1, tmp2); \
        \
        auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
        auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
        \
        auto resx2 = _mm512_add_ps(tmp3, tmp4); \
        \
        auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
        auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
        \
        auto resx3 = _mm512_add_ps(tmp5, tmp6); \
        \
        halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \
                                  _mm512_extractf32x8_ps(resx3, 1)); \
    }

            XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
            XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);

#undef XSIMD_AVX512_HADDP_STEP2

            auto concat = _mm512_castps256_ps512(halfx0);
            concat = _mm512_insertf32x8(concat, halfx1, 1);
            return concat;
        }

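        // ldexp: scales self by 2^other, converting the integer exponents to
        // double and applying _mm512_scalef_pd.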
        template <class A>
        XSIMD_INLINE batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other));
        }

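        // mul: element-wise 64-bit multiplication; _mm512_mullo_epi64 is an
        // AVX512DQ instruction.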
        template <class A>
        XSIMD_INLINE batch<uint64_t, A> mul(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_mullo_epi64(self, other);
        }

        template <class A>
        XSIMD_INLINE batch<int64_t, A> mul(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_mullo_epi64(self, other);
        }

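        // nearbyint_as_int: converts doubles to 64-bit integers using the
        // current rounding mode.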
        template <class A>
        XSIMD_INLINE batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
                                                        requires_arch<avx512dq>) noexcept
        {
            return _mm512_cvtpd_epi64(self);
        }

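        // reduce_add: sums all lanes of a float batch by adding its two 256-bit
        // halves and delegating to the AVX2 reduction.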
        template <class A>
        XSIMD_INLINE float reduce_add(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
        {
            __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
            __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
            __m256 res1 = _mm256_add_ps(tmp1, tmp2);
            return reduce_add(batch<float, avx2>(res1), avx2 {});
        }

        namespace detail
        {
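            // fast_cast: direct conversions between int64 and double batches,
            // available as single instructions with AVX512DQ.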
            template <class A>
            XSIMD_INLINE batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx512dq>) noexcept
            {
                return _mm512_cvtepi64_pd(x);
            }

            template <class A>
            XSIMD_INLINE batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<avx512dq>) noexcept
            {
                return _mm512_cvttpd_epi64(self);
            }
        }

    }

}

#endif