File indexing completed on 2025-01-18 09:27:16
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015 #ifndef ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
0016 #define ABSL_CRC_INTERNAL_NON_TEMPORAL_MEMCPY_H_
0017
0018 #ifdef _MSC_VER
0019 #include <intrin.h>
0020 #endif
0021
0022 #if defined(__SSE__) || defined(__AVX__)
0023
0024 #include <immintrin.h>
0025 #endif
0026
0027 #ifdef __aarch64__
0028 #include "absl/crc/internal/non_temporal_arm_intrinsics.h"
0029 #endif
0030
0031 #include <algorithm>
0032 #include <cassert>
0033 #include <cstdint>
0034 #include <cstring>
0035
0036 #include "absl/base/attributes.h"
0037 #include "absl/base/config.h"
0038 #include "absl/base/optimization.h"
0039
0040 namespace absl {
0041 ABSL_NAMESPACE_BEGIN
0042 namespace crc_internal {
0043
0044
0045
0046
0047
// Cache line size in bytes, taken from ABSL_CACHELINE_SIZE. The copy routines
// in this header use it both as the destination alignment they establish
// before switching to streaming stores and as the number of bytes moved per
// iteration of their streaming loops.
0048 constexpr size_t kCacheLineSize = ABSL_CACHELINE_SIZE;
0049
0050
0051
0052
// Copies `len` bytes from `src` to `dst` and returns `dst`.
//
// When the required intrinsics are available at compile time, the bulk of the
// data is written with non-temporal (streaming) 128-bit stores; otherwise the
// call is forwarded to memcpy(). Both pointers are `__restrict`, so the source
// and destination ranges must not overlap.
//
// Fast-path layout:
//   1. head: regular memcpy() up to the next kCacheLineSize boundary of `dst`
//      (or all of `len`, if the copy ends before that boundary);
//   2. body: whole cache lines, unaligned loads + streaming stores;
//   3. tail: regular memcpy() of the remaining (< kCacheLineSize) bytes.
inline void *non_temporal_store_memcpy(void *__restrict dst,
                                       const void *__restrict src, size_t len) {
#if defined(__SSE3__) || defined(__aarch64__) || \
    (defined(_MSC_VER) && defined(__AVX__))
  // The loads below use _mm_lddqu_si128, hence the SSE3 gate. For MSVC the
  // AVX macro is used as the gate instead (presumably because MSVC has no
  // SSE3-specific macro). On aarch64 the same intrinsic names are presumably
  // supplied by absl/crc/internal/non_temporal_arm_intrinsics.h.

  // The streaming loop is hand-unrolled to four 128-bit vectors per iteration
  // (64 bytes) while the pointers and `len` advance by kCacheLineSize. Fail
  // the build instead of silently skipping bytes if those ever disagree.
  static_assert(kCacheLineSize == 4 * sizeof(__m128i),
                "non_temporal_store_memcpy moves exactly 4 x 128 bits per "
                "cache line");

  uint8_t *d = reinterpret_cast<uint8_t *>(dst);
  const uint8_t *s = reinterpret_cast<const uint8_t *>(src);

  // Head: memcpy() the bytes in front of the next cache-line boundary. After
  // this block either `d` is cache-line aligned or `len` is 0. The arithmetic
  // is done in size_t so std::min sees two arguments of the same type even on
  // targets where uintptr_t and size_t differ.
  const size_t misalignment = static_cast<size_t>(
      reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
  if (misalignment != 0) {
    const size_t bytes_before_alignment_boundary =
        kCacheLineSize - misalignment;
    assert(bytes_before_alignment_boundary < kCacheLineSize);
    const size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
    std::memcpy(d, s, header_len);
    d += header_len;
    s += header_len;
    len -= header_len;
  }

  // Body: stream whole cache lines. `d` is cache-line aligned here, which
  // satisfies the 16-byte alignment _mm_stream_si128 needs; `s` may be
  // unaligned, so it is read with the unaligned-load intrinsic.
  if (len >= kCacheLineSize) {
    // Store fences bracket the streaming stores; non-temporal stores are
    // weakly ordered with respect to other stores.
    _mm_sfence();
    __m128i *dst_cacheline = reinterpret_cast<__m128i *>(d);
    const __m128i *src_cacheline = reinterpret_cast<const __m128i *>(s);
    constexpr size_t kOpsPerCacheLine = kCacheLineSize / sizeof(__m128i);
    // Captured before the loop consumes `len`, so `d`/`s` can be advanced
    // past the streamed region afterwards.
    const size_t loops = len / kCacheLineSize;

    while (len >= kCacheLineSize) {
      // Issue all loads for the line first, then all stores.
      const __m128i temp1 = _mm_lddqu_si128(src_cacheline + 0);
      const __m128i temp2 = _mm_lddqu_si128(src_cacheline + 1);
      const __m128i temp3 = _mm_lddqu_si128(src_cacheline + 2);
      const __m128i temp4 = _mm_lddqu_si128(src_cacheline + 3);
      _mm_stream_si128(dst_cacheline + 0, temp1);
      _mm_stream_si128(dst_cacheline + 1, temp2);
      _mm_stream_si128(dst_cacheline + 2, temp3);
      _mm_stream_si128(dst_cacheline + 3, temp4);
      src_cacheline += kOpsPerCacheLine;
      dst_cacheline += kOpsPerCacheLine;
      len -= kCacheLineSize;
    }
    d += loops * kCacheLineSize;
    s += loops * kCacheLineSize;
    _mm_sfence();
  }

  // Tail: whatever is left is shorter than one cache line.
  if (len) {
    std::memcpy(d, s, len);
  }
  return dst;
#else
  // Required intrinsics are not available at compile time.
  return std::memcpy(dst, src, len);
#endif
}
0113
0114
0115
0116
0117 #if ABSL_HAVE_CPP_ATTRIBUTE(gnu::target) && \
0118 (defined(__x86_64__) || defined(__i386__))
0119 [[gnu::target("avx")]]
0120 #endif
0121 inline void *non_temporal_store_memcpy_avx(void *__restrict dst,
0122 const void *__restrict src,
0123 size_t len) {
0124
0125
0126
0127 #if defined(__SSE3__) || (defined(_MSC_VER) && defined(__AVX__))
0128 uint8_t *d = reinterpret_cast<uint8_t *>(dst);
0129 const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
0130
0131
0132
0133 if (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1)) {
0134 uintptr_t bytes_before_alignment_boundary =
0135 kCacheLineSize -
0136 (reinterpret_cast<uintptr_t>(d) & (kCacheLineSize - 1));
0137 size_t header_len = (std::min)(bytes_before_alignment_boundary, len);
0138 assert(bytes_before_alignment_boundary < kCacheLineSize);
0139 memcpy(d, s, header_len);
0140 d += header_len;
0141 s += header_len;
0142 len -= header_len;
0143 }
0144
0145 if (len >= kCacheLineSize) {
0146 _mm_sfence();
0147 __m256i *dst_cacheline = reinterpret_cast<__m256i *>(d);
0148 const __m256i *src_cacheline = reinterpret_cast<const __m256i *>(s);
0149 constexpr int kOpsPerCacheLine = kCacheLineSize / sizeof(__m256i);
0150 size_t loops = len / kCacheLineSize;
0151
0152 while (len >= kCacheLineSize) {
0153 __m256i temp1, temp2;
0154 temp1 = _mm256_lddqu_si256(src_cacheline + 0);
0155 temp2 = _mm256_lddqu_si256(src_cacheline + 1);
0156 _mm256_stream_si256(dst_cacheline + 0, temp1);
0157 _mm256_stream_si256(dst_cacheline + 1, temp2);
0158 src_cacheline += kOpsPerCacheLine;
0159 dst_cacheline += kOpsPerCacheLine;
0160 len -= kCacheLineSize;
0161 }
0162 d += loops * kCacheLineSize;
0163 s += loops * kCacheLineSize;
0164 _mm_sfence();
0165 }
0166
0167
0168 if (len) {
0169 memcpy(d, s, len);
0170 }
0171 return dst;
0172 #else
0173 return memcpy(dst, src, len);
0174 #endif
0175 }
0176
0177 }
0178 ABSL_NAMESPACE_END
0179 }
0180
0181 #endif