// Copyright 2022 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_
#define ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_

#include <cstdint>

#include "absl/base/config.h"

// -------------------------------------------------------------------------
// Many x86 and ARM machines have CRC acceleration hardware.
// We can do a faster version of Extend() on such machines.
// We define a translation layer for both x86 and ARM for ease of use and to
// capture most of the performance gains.

// This implementation requires 64-bit CRC instructions (part of SSE 4.2) and
// PCLMULQDQ instructions. 32-bit builds with SSE 4.2 do exist, so the
// __x86_64__ condition is necessary.
#if defined(__x86_64__) && defined(__SSE4_2__) && defined(__PCLMUL__)

#include <x86intrin.h>
#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD

#elif defined(_MSC_VER) && !defined(__clang__) && defined(__AVX__) && \
    defined(_M_AMD64)

// MSVC AVX (/arch:AVX) implies SSE 4.2 and PCLMULQDQ.
#include <intrin.h>
#define ABSL_CRC_INTERNAL_HAVE_X86_SIMD

#elif defined(__aarch64__) && defined(__LITTLE_ENDIAN__) &&                 \
    defined(__ARM_FEATURE_CRC32) && defined(ABSL_INTERNAL_HAVE_ARM_NEON) && \
    defined(__ARM_FEATURE_CRYPTO)

#include <arm_acle.h>
#include <arm_neon.h>
#define ABSL_CRC_INTERNAL_HAVE_ARM_SIMD

#endif

namespace absl {
ABSL_NAMESPACE_BEGIN
namespace crc_internal {

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD) || \
    defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)

#if defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)
using V128 = uint64x2_t;
#else
// Note: Do not use __m128i_u; it is not portable.
// Use V128_LoadU() to perform an unaligned load from __m128i*.
using V128 = __m128i;
#endif

// Starting with the initial value in |crc|, accumulates a CRC32 value for
// unsigned integers of different sizes.
uint32_t CRC32_u8(uint32_t crc, uint8_t v);

uint32_t CRC32_u16(uint32_t crc, uint16_t v);

uint32_t CRC32_u32(uint32_t crc, uint32_t v);

uint32_t CRC32_u64(uint32_t crc, uint64_t v);

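// Illustrative sketch, not part of the original header: shows how the
// size-specific CRC32_u* wrappers above are meant to be chained, threading
// the running CRC through each call. The function name and its parameters
// are hypothetical and exist only for demonstration.
inline uint32_t ExampleCrc32OfMixedWidths(uint32_t crc, uint64_t a, uint32_t b,
                                          uint8_t c) {
  crc = CRC32_u64(crc, a);   // fold in a 64-bit value
  crc = CRC32_u32(crc, b);   // then a 32-bit value
  return CRC32_u8(crc, c);   // and finally a single byte
}
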
// Loads 128 bits of integer data. |src| must be 16-byte aligned.
V128 V128_Load(const V128* src);

// Loads 128 bits of integer data. |src| does not need to be aligned.
V128 V128_LoadU(const V128* src);

// Stores 128 bits of integer data. |dst| must be 16-byte aligned.
void V128_Store(V128* dst, V128 data);

// Polynomially multiplies the high 64 bits of |l| and |r|.
V128 V128_PMulHi(const V128 l, const V128 r);

// Polynomially multiplies the low 64 bits of |l| and |r|.
V128 V128_PMulLow(const V128 l, const V128 r);

// Polynomially multiplies the low 64 bits of |r| and high 64 bits of |l|.
V128 V128_PMul01(const V128 l, const V128 r);

// Polynomially multiplies the low 64 bits of |l| and high 64 bits of |r|.
V128 V128_PMul10(const V128 l, const V128 r);

// Produces the bitwise XOR of |l| and |r|.
V128 V128_Xor(const V128 l, const V128 r);

// Produces the bitwise AND of |l| and |r|.
V128 V128_And(const V128 l, const V128 r);

// Sets the lower half of a 128-bit register to the given 64-bit value and
// zeroes the upper half.
// dst[63:0] := |r|
// dst[127:64] := |0|
V128 V128_From64WithZeroFill(const uint64_t r);

// Shifts |l| right by |imm| bytes while shifting in zeros.
template <int imm>
V128 V128_ShiftRight(const V128 l);

// Extracts a 32-bit integer from |l|, selected with |imm|.
template <int imm>
int V128_Extract32(const V128 l);

// Extracts a 64-bit integer from |l|, selected with |imm|.
template <int imm>
uint64_t V128_Extract64(const V128 l);

// Extracts the low 64 bits from V128.
int64_t V128_Low64(const V128 l);

// Adds packed 64-bit integers in |l| and |r|.
V128 V128_Add64(const V128 l, const V128 r);

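// Illustrative sketch, not part of the original header: one carry-less
// multiplication "fold" step of the kind CRC implementations build from the
// wrappers above. |x| is folded against a constant pair |k| (independent
// 64-bit multiplicands in its low and high halves) and combined with the
// next unaligned 128-bit block at |next|. The function name and parameters
// are hypothetical and exist only for demonstration.
inline V128 ExampleFoldStep(const V128 x, const V128 k, const V128* next) {
  V128 lo = V128_PMulLow(x, k);  // low halves of |x| and |k|
  V128 hi = V128_PMulHi(x, k);   // high halves of |x| and |k|
  return V128_Xor(V128_Xor(lo, hi), V128_LoadU(next));
}
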
#endif

#if defined(ABSL_CRC_INTERNAL_HAVE_X86_SIMD)

inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) {
  return _mm_crc32_u8(crc, v);
}

inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
  return _mm_crc32_u16(crc, v);
}

inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
  return _mm_crc32_u32(crc, v);
}

inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
  return static_cast<uint32_t>(_mm_crc32_u64(crc, v));
}

inline V128 V128_Load(const V128* src) { return _mm_load_si128(src); }

inline V128 V128_LoadU(const V128* src) { return _mm_loadu_si128(src); }

inline void V128_Store(V128* dst, V128 data) { _mm_store_si128(dst, data); }

inline V128 V128_PMulHi(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x11);
}

inline V128 V128_PMulLow(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x00);
}

inline V128 V128_PMul01(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x01);
}

inline V128 V128_PMul10(const V128 l, const V128 r) {
  return _mm_clmulepi64_si128(l, r, 0x10);
}

inline V128 V128_Xor(const V128 l, const V128 r) { return _mm_xor_si128(l, r); }

inline V128 V128_And(const V128 l, const V128 r) { return _mm_and_si128(l, r); }

inline V128 V128_From64WithZeroFill(const uint64_t r) {
  return _mm_set_epi64x(static_cast<int64_t>(0), static_cast<int64_t>(r));
}

template <int imm>
inline V128 V128_ShiftRight(const V128 l) {
  return _mm_srli_si128(l, imm);
}

template <int imm>
inline int V128_Extract32(const V128 l) {
  return _mm_extract_epi32(l, imm);
}

template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
  return static_cast<uint64_t>(_mm_extract_epi64(l, imm));
}

inline int64_t V128_Low64(const V128 l) { return _mm_cvtsi128_si64(l); }

inline V128 V128_Add64(const V128 l, const V128 r) {
  return _mm_add_epi64(l, r);
}

#elif defined(ABSL_CRC_INTERNAL_HAVE_ARM_SIMD)

inline uint32_t CRC32_u8(uint32_t crc, uint8_t v) { return __crc32cb(crc, v); }

inline uint32_t CRC32_u16(uint32_t crc, uint16_t v) {
  return __crc32ch(crc, v);
}

inline uint32_t CRC32_u32(uint32_t crc, uint32_t v) {
  return __crc32cw(crc, v);
}

inline uint32_t CRC32_u64(uint32_t crc, uint64_t v) {
  return __crc32cd(crc, v);
}

inline V128 V128_Load(const V128* src) {
  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}

inline V128 V128_LoadU(const V128* src) {
  return vld1q_u64(reinterpret_cast<const uint64_t*>(src));
}

inline void V128_Store(V128* dst, V128 data) {
  vst1q_u64(reinterpret_cast<uint64_t*>(dst), data);
}

// Using inline assembly as clang does not generate the pmull2 instruction and
// performance drops by 15-20%.
// TODO(b/193678732): Investigate why there is a slight performance hit when
// using intrinsics instead of inline assembly.
inline V128 V128_PMulHi(const V128 l, const V128 r) {
  uint64x2_t res;
  __asm__ __volatile__("pmull2 %0.1q, %1.2d, %2.2d \n\t"
                       : "=w"(res)
                       : "w"(l), "w"(r));
  return res;
}
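
// Note added for reference, not in the original header: the intrinsics-based
// equivalent that the comment above alludes to would use vmull_high_p64 from
// <arm_neon.h>, along the lines of the hypothetical, commented-out sketch
// below. The inline assembly above remains the version this file uses.
//
//   inline V128 V128_PMulHiIntrinsic(const V128 l, const V128 r) {
//     return vreinterpretq_u64_p128(vmull_high_p64(
//         vreinterpretq_p64_u64(l), vreinterpretq_p64_u64(r)));
//   }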

// TODO(b/193678732): Investigate why the compiler decides to move the constant
// loop multiplicands from GPR to Neon registers every loop iteration.
inline V128 V128_PMulLow(const V128 l, const V128 r) {
  uint64x2_t res;
  __asm__ __volatile__("pmull %0.1q, %1.1d, %2.1d \n\t"
                       : "=w"(res)
                       : "w"(l), "w"(r));
  return res;
}

inline V128 V128_PMul01(const V128 l, const V128 r) {
  return reinterpret_cast<V128>(vmull_p64(
      reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(l))),
      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(r)))));
}

inline V128 V128_PMul10(const V128 l, const V128 r) {
  return reinterpret_cast<V128>(vmull_p64(
      reinterpret_cast<poly64_t>(vget_low_p64(vreinterpretq_p64_u64(l))),
      reinterpret_cast<poly64_t>(vget_high_p64(vreinterpretq_p64_u64(r)))));
}

inline V128 V128_Xor(const V128 l, const V128 r) { return veorq_u64(l, r); }

inline V128 V128_And(const V128 l, const V128 r) { return vandq_u64(l, r); }

inline V128 V128_From64WithZeroFill(const uint64_t r) {
  constexpr uint64x2_t kZero = {0, 0};
  return vsetq_lane_u64(r, kZero, 0);
}

template <int imm>
inline V128 V128_ShiftRight(const V128 l) {
  return vreinterpretq_u64_s8(
      vextq_s8(vreinterpretq_s8_u64(l), vdupq_n_s8(0), imm));
}

template <int imm>
inline int V128_Extract32(const V128 l) {
  return vgetq_lane_s32(vreinterpretq_s32_u64(l), imm);
}

template <int imm>
inline uint64_t V128_Extract64(const V128 l) {
  return vgetq_lane_u64(l, imm);
}

inline int64_t V128_Low64(const V128 l) {
  return vgetq_lane_s64(vreinterpretq_s64_u64(l), 0);
}

inline V128 V128_Add64(const V128 l, const V128 r) { return vaddq_u64(l, r); }

#endif

}  // namespace crc_internal
ABSL_NAMESPACE_END
}  // namespace absl

#endif  // ABSL_CRC_INTERNAL_CRC32_X86_ARM_COMBINED_SIMD_H_