File indexing completed on 2025-01-18 10:00:11
0001
0002
0003
0004
0005
0006
0007
0008
0009 #pragma once
0010
0011 #include "gloo/types.h"
0012
0013 namespace gloo {
0014
// Element-wise addition: stores a[i] + b[i] into c[i] for every i in [0, n).
//
// The buffers are passed type-erased and are reinterpreted as arrays of T.
// Each output element depends only on the inputs at the same index.
//
//   c_  destination buffer, reinterpreted as T*
//   a_  first operand buffer, reinterpreted as const T*
//   b_  second operand buffer, reinterpreted as const T*
//   n   number of elements of type T to process
template <typename T>
void sum(void* c_, const void* a_, const void* b_, size_t n) {
  T* c = static_cast<T*>(c_);
  const T* a = static_cast<const T*>(a_);
  const T* b = static_cast<const T*>(b_);
  // The index has the same type as n. An `auto i = 0` index would deduce int,
  // mixing signed and unsigned in the comparison and overflowing for counts
  // above INT_MAX.
  for (size_t i = 0; i < n; i++) {
    c[i] = a[i] + b[i];
  }
}
0024
// In-place overload: forwards to the four-argument sum<T> with `a` acting as
// both the destination and the first operand, so the first n elements of `a`
// are combined with the corresponding elements of `b`.
template <typename T>
void sum(T* a, const T* b, size_t n) {
  void* const dst = a;
  const void* const lhs = a;
  const void* const rhs = b;
  sum<T>(dst, lhs, rhs, n);
}
0029
// Element-wise multiplication: stores a[i] * b[i] into c[i] for every i in
// [0, n).
//
// The buffers are passed type-erased and are reinterpreted as arrays of T.
// Each output element depends only on the inputs at the same index.
//
//   c_  destination buffer, reinterpreted as T*
//   a_  first operand buffer, reinterpreted as const T*
//   b_  second operand buffer, reinterpreted as const T*
//   n   number of elements of type T to process
template <typename T>
void product(void* c_, const void* a_, const void* b_, size_t n) {
  T* c = static_cast<T*>(c_);
  const T* a = static_cast<const T*>(a_);
  const T* b = static_cast<const T*>(b_);
  // The index has the same type as n. An `auto i = 0` index would deduce int,
  // mixing signed and unsigned in the comparison and overflowing for counts
  // above INT_MAX.
  for (size_t i = 0; i < n; i++) {
    c[i] = a[i] * b[i];
  }
}
0039
// In-place overload: forwards to the four-argument product<T> with `a` acting
// as both the destination and the first operand, so the first n elements of
// `a` are combined with the corresponding elements of `b`.
template <typename T>
void product(T* a, const T* b, size_t n) {
  void* const dst = a;
  const void* const lhs = a;
  const void* const rhs = b;
  product<T>(dst, lhs, rhs, n);
}
0044
// Element-wise maximum: stores std::max(a[i], b[i]) into c[i] for every i in
// [0, n).
//
// The buffers are passed type-erased and are reinterpreted as arrays of T.
// Each output element depends only on the inputs at the same index.
//
//   c_  destination buffer, reinterpreted as T*
//   a_  first operand buffer, reinterpreted as const T*
//   b_  second operand buffer, reinterpreted as const T*
//   n   number of elements of type T to process
template <typename T>
void max(void* c_, const void* a_, const void* b_, size_t n) {
  T* c = static_cast<T*>(c_);
  const T* a = static_cast<const T*>(a_);
  const T* b = static_cast<const T*>(b_);
  // The index has the same type as n. An `auto i = 0` index would deduce int,
  // mixing signed and unsigned in the comparison and overflowing for counts
  // above INT_MAX.
  for (size_t i = 0; i < n; i++) {
    c[i] = std::max(a[i], b[i]);
  }
}
0054
// In-place overload: forwards to the four-argument max<T> with `a` acting as
// both the destination and the first operand, so the first n elements of `a`
// are combined with the corresponding elements of `b`.
template <typename T>
void max(T* a, const T* b, size_t n) {
  void* const dst = a;
  const void* const lhs = a;
  const void* const rhs = b;
  max<T>(dst, lhs, rhs, n);
}
0059
// Element-wise minimum: stores std::min(a[i], b[i]) into c[i] for every i in
// [0, n).
//
// The buffers are passed type-erased and are reinterpreted as arrays of T.
// Each output element depends only on the inputs at the same index.
//
//   c_  destination buffer, reinterpreted as T*
//   a_  first operand buffer, reinterpreted as const T*
//   b_  second operand buffer, reinterpreted as const T*
//   n   number of elements of type T to process
template <typename T>
void min(void* c_, const void* a_, const void* b_, size_t n) {
  T* c = static_cast<T*>(c_);
  const T* a = static_cast<const T*>(a_);
  const T* b = static_cast<const T*>(b_);
  // The index has the same type as n. An `auto i = 0` index would deduce int,
  // mixing signed and unsigned in the comparison and overflowing for counts
  // above INT_MAX.
  for (size_t i = 0; i < n; i++) {
    c[i] = std::min(a[i], b[i]);
  }
}
0069
// In-place overload: forwards to the four-argument min<T> with `a` acting as
// both the destination and the first operand, so the first n elements of `a`
// are combined with the corresponding elements of `b`.
template <typename T>
void min(T* a, const T* b, size_t n) {
  void* const dst = a;
  const void* const lhs = a;
  const void* const rhs = b;
  min<T>(dst, lhs, rhs, n);
}
0074
// Rounds `value` up to the nearest multiple of `multiple`, i.e. returns the
// smallest multiple of `multiple` that is >= `value`. A value that is already
// a multiple is returned unchanged.
//
// T must be an integer type (the implementation uses operator%), and
// `multiple` is expected to be positive. A `multiple` of zero has no
// meaningful rounding target, so `value` is returned as-is instead of
// dividing by zero.
template <typename T>
T roundUp(T value, T multiple) {
  if (multiple == 0) {
    return value;
  }
  const T remainder = value % multiple;
  if (remainder > 0) {
    // Compute the padding first so that `value + multiple` is never formed;
    // that intermediate could overflow even when the result fits in T.
    return value + (multiple - remainder);
  }
  // remainder == 0: already aligned (subtracting zero is a no-op).
  // remainder < 0: only possible for negative signed values, where rounding
  // up means moving toward zero by dropping the remainder.
  return value - remainder;
}
0083
// Returns the ceiling of log2(value): the smallest `dim` for which
// 2^dim >= value. Inputs of 0 and 1 both yield 0, and inputs above 2^31
// yield 32.
inline uint32_t log2ceil(uint32_t value) {
  // 0 and 1 need no bits. Handling them up front also guarantees that
  // `value - 1` is non-zero below, which matters because __builtin_clz(0)
  // is undefined.
  if (value <= 1) {
    return 0;
  }
#if defined(__GNUC__)
  // The result is the bit width of (value - 1): 32 minus its leading zeros.
  return 32 - __builtin_clz(value - 1);
#else
  // Portable fallback: count the significant bits of (value - 1) by shifting
  // it right until nothing is left. A left-shifting probe (1, 2, 4, ...)
  // must not be used here: for value > 2^31 it wraps to 0 before reaching
  // `value`, and the loop never terminates.
  uint32_t dim = 0;
  for (uint32_t rest = value - 1; rest != 0; rest >>= 1) {
    ++dim;
  }
  return dim;
#endif
}
0095
#if GLOO_USE_AVX

// float16 variants of the element-wise kernels, only declared when
// GLOO_USE_AVX is enabled. Each kernel is declared as an explicit
// specialization (so the generic template body is not used for float16) and
// is followed by an explicit instantiation declaration. No definitions appear
// in this header; presumably they live in an AVX-specific source file --
// confirm against the build.

template <>
void sum<float16>(void* c, const void* a, const void* b, size_t n);
extern template void sum<float16>(
    void*, const void*, const void*, size_t);

template <>
void product<float16>(void* c, const void* a, const void* b, size_t n);
extern template void product<float16>(
    void*, const void*, const void*, size_t);

template <>
void max<float16>(void* c, const void* a, const void* b, size_t n);
extern template void max<float16>(
    void*, const void*, const void*, size_t);

template <>
void min<float16>(void* c, const void* a, const void* b, size_t n);
extern template void min<float16>(
    void*, const void*, const void*, size_t);

#endif
0119
0120 }