File indexing completed on 2026-04-09 07:49:32
0001 #pragma once
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011 #include <cstddef>
0012 #include <cstdint>
0013 #include <vector_types.h>
0014 #include <ostream>
0015 #include <iomanip>
0016
0017 #include <cuda.h>
0018 #include <cuda_runtime.h>
0019
0020 #include "CUDA_CHECK.h"
0021
0022
0023 template <typename T>
0024 struct SCU_Buf
0025 {
0026 T* ptr ;
0027 size_t num_item ;
0028 std::string name ;
0029
0030 CUdeviceptr pointer() const ;
0031 void free() ;
0032
0033 std::string desc() const ;
0034 };
0035
0036
0037 template <typename T>
0038 inline CUdeviceptr SCU_Buf<T>::pointer() const
0039 {
0040 return (CUdeviceptr)(uintptr_t) ptr ;
0041 }
0042
0043 template <typename T>
0044 inline void SCU_Buf<T>::free()
0045 {
0046 CUDA_CHECK( cudaFree( reinterpret_cast<void*>( ptr) ) );
0047 ptr = nullptr ;
0048 num_item = 0 ;
0049 }
0050
0051
0052 template <typename T>
0053 inline std::string SCU_Buf<T>::desc() const
0054 {
0055 std::stringstream ss ;
0056 ss << "SCU_Buf"
0057 << " (uintptr_t)ptr 0x"
0058 << std::setw(9) << std::hex << (uintptr_t)ptr << std::dec
0059 << " sizeof(T) " << std::setw(5) << sizeof(T)
0060 << " num_item " << std::setw(7) << num_item
0061 << " name " << name
0062 ;
0063 std::string str = ss.str();
0064 return str ;
0065 }
0066
0067
0068
0069
0070
0071 struct SCU
0072 {
0073 template<typename T>
0074 static CUdeviceptr DevicePointerCast( const T* d_ptr );
0075
0076 void _cudaMalloc( void** p2p, size_t size, const char* label );
0077
0078 template<typename T>
0079 T* device_alloc( unsigned num_items, const char* label );
0080
0081
0082
0083 template <typename T>
0084 static T* UploadArray(const T* array, size_t num_item );
0085
0086 template <typename T>
0087 static SCU_Buf<T> UploadBuf(const T* array, size_t num_item, const char* name );
0088
0089
0090
0091 template <typename T>
0092 static T* DownloadArray(const T* d_array, size_t num_item );
0093
0094 template <typename T>
0095 static T* DownloadBuf(const SCU_Buf<T>& buf );
0096
0097 template <typename T>
0098 static void DownloadVec(std::vector<T>& vec, const T* d_array, unsigned num_items);
0099
0100
0101
0102 template <typename T>
0103 static void FreeArray(T* d_array );
0104
0105 template <typename T>
0106 static void FreeBuf(SCU_Buf<T>& buf );
0107
0108
0109
0110 static void ConfigureLaunch2D( dim3& numBlocks, dim3& threadsPerBlock, int32_t width, int32_t height );
0111 };
0112
0113
0114 template<typename T>
0115 CUdeviceptr SCU::DevicePointerCast( const T* d_ptr )
0116 {
0117 return (CUdeviceptr) (uintptr_t) d_ptr ;
0118 }
0119
0120
0121 inline void SCU::_cudaMalloc( void** p2p, size_t size, const char* label )
0122 {
0123 cudaError_t err = cudaMalloc(p2p, size ) ;
0124 if( err != cudaSuccess )
0125 {
0126 std::stringstream ss;
0127 ss << "CUDA call (" << label << " ) failed with error: '"
0128 << cudaGetErrorString( err )
0129 << "' (" __FILE__ << ":" << __LINE__ << ")\n"
0130 ;
0131
0132 const char* msg = ss.str().c_str() ;
0133 throw CUDA_Exception(msg);
0134 }
0135 }
0136
0137
0138 template<typename T>
0139 inline T* SCU::device_alloc( unsigned num_items, const char* label )
0140 {
0141 size_t size = num_items*sizeof(T) ;
0142
0143 T* d ;
0144 _cudaMalloc( reinterpret_cast<void**>( &d ), size, label );
0145
0146 return d ;
0147 }
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158
0159 template <typename T>
0160 inline T* SCU::UploadArray(const T* array, size_t num_item )
0161 {
0162 T* d_array = nullptr ;
0163 CUDA_CHECK( cudaMalloc(reinterpret_cast<void**>( &d_array ), num_item*sizeof(T) ));
0164 CUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( d_array ), array, sizeof(T)*num_item, cudaMemcpyHostToDevice ));
0165 return d_array ;
0166 }
0167
0168 template <typename T>
0169 inline SCU_Buf<T> SCU::UploadBuf(const T* array, size_t num_item, const char* name )
0170 {
0171 T* d_array = UploadArray<T>(array, num_item ) ;
0172 return { d_array, num_item, name } ;
0173 }
0174
0175
0176
0177
0178
0179
0180
0181
0182
0183
0184 template <typename T>
0185 inline T* SCU::DownloadArray(const T* d_array, size_t num_items )
0186 {
0187 T* array = new T[num_items] ;
0188 CUDA_CHECK( cudaMemcpy( array, d_array, sizeof(T)*num_items, cudaMemcpyDeviceToHost ));
0189 return array ;
0190 }
0191
0192 template <typename T>
0193 inline T* SCU::DownloadBuf(const SCU_Buf<T>& buf )
0194 {
0195 return DownloadArray<T>( buf.ptr, buf.num_item );
0196 }
0197
0198
0199
0200
0201
0202
0203
0204
0205
0206
0207 template <typename T>
0208 inline void SCU::DownloadVec(std::vector<T>& vec, const T* d_array, unsigned num_items)
0209 {
0210 unsigned num_bytes = num_items*sizeof(T) ;
0211 vec.clear();
0212 vec.resize(num_items);
0213 CUDA_CHECK( cudaMemcpy( vec.data(), d_array, num_bytes, cudaMemcpyDeviceToHost ));
0214 }
0215
0216 template void SCU::DownloadVec<float>( std::vector<float>& vec, const float* d_array, unsigned num_items) ;
0217 template void SCU::DownloadVec<unsigned>(std::vector<unsigned>& vec, const unsigned* d_array, unsigned num_items) ;
0218
0219
0220
0221
0222
0223
0224 template <typename T>
0225 inline void SCU::FreeArray(T* d_array )
0226 {
0227 CUDA_CHECK( cudaFree( reinterpret_cast<void*>( d_array ) ) );
0228 }
0229
0230 template <typename T>
0231 inline void SCU::FreeBuf(SCU_Buf<T>& buf )
0232 {
0233 buf.free() ;
0234 }
0235
0236
0237
0238
0239
0240
0241
0242
0243
0244
0245
0246 inline void SCU::ConfigureLaunch2D( dim3& numBlocks, dim3& threadsPerBlock, int32_t width, int32_t height )
0247 {
0248 threadsPerBlock.x = 16 ;
0249 threadsPerBlock.y = 16 ;
0250 threadsPerBlock.z = 1 ;
0251
0252 numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x ;
0253 numBlocks.y = (height + threadsPerBlock.y - 1) / threadsPerBlock.y ;
0254 numBlocks.z = 1 ;
0255 }
0256
0257 inline std::ostream& operator<<(std::ostream& os, const dim3& v)
0258 {
0259 int w = 6 ;
0260 os
0261 << "("
0262 << std::setw(w) << v.x
0263 << ","
0264 << std::setw(w) << v.y
0265 << ","
0266 << std::setw(w) << v.z
0267 << ") "
0268 ;
0269 return os;
0270 }
0271