eic-opticks/sysrap/SCU.h

0001 #pragma once
0002 /**
0003 SCU.h
0004 =======
0005
0006 TODO: merge in SCU_.h
0007
0008
0009 **/
0010
0011 #include <cstddef>
0012 #include <cstdint>
0013 #include <vector_types.h>
0014 #include <ostream>
0015 #include <iomanip>
0016
0017 #include <cuda.h>
0018 #include <cuda_runtime.h>
0019
0020 #include "CUDA_CHECK.h"
0021
0022
0023 template <typename T>
0024 struct SCU_Buf
0025 {
0026    T*          ptr ;
0027    size_t      num_item ;
0028    std::string name ;
0029
0030    CUdeviceptr pointer() const ;
0031    void free() ;
0032
0033    std::string desc() const ;
0034 };
0035
0036
0037 template <typename T>
0038 inline CUdeviceptr SCU_Buf<T>::pointer() const
0039 {
0040    return (CUdeviceptr)(uintptr_t) ptr ;
0041 }
0042
0043 template <typename T>
0044 inline void SCU_Buf<T>::free()
0045 {
0046     CUDA_CHECK( cudaFree( reinterpret_cast<void*>( ptr) ) );
0047     ptr = nullptr ;
0048     num_item = 0 ;
0049 }
0050
0051
0052 template <typename T>
0053 inline std::string SCU_Buf<T>::desc() const
0054 {
0055     std::stringstream ss ;
0056     ss << "SCU_Buf"
0057        << " (uintptr_t)ptr  0x"
0058        << std::setw(9) << std::hex << (uintptr_t)ptr << std::dec
0059        << " sizeof(T) " << std::setw(5) << sizeof(T)
0060        << " num_item "  << std::setw(7) << num_item
0061        << " name " << name
0062        ;
0063     std::string str = ss.str();
0064     return str ;
0065 }
0066
0067
0068
0069
0070
0071 struct SCU
0072 {
0073     template<typename T>
0074     static CUdeviceptr DevicePointerCast( const T* d_ptr );
0075
0076     void _cudaMalloc( void** p2p, size_t size, const char* label );
0077
0078     template<typename T>
0079     T* device_alloc( unsigned num_items, const char* label );
0080
0081
0082
0083     template <typename T>
0084     static T* UploadArray(const T* array, size_t num_item );
0085
0086     template <typename T>
0087     static SCU_Buf<T> UploadBuf(const T* array, size_t num_item, const char* name );
0088
0089
0090
0091     template <typename T>
0092     static T* DownloadArray(const T* d_array, size_t num_item );
0093
0094     template <typename T>
0095     static T* DownloadBuf(const SCU_Buf<T>& buf );
0096
0097     template <typename T>
0098     static void DownloadVec(std::vector<T>& vec, const T* d_array, unsigned num_items);
0099
0100
0101
0102     template <typename T>
0103     static void FreeArray(T* d_array );
0104
0105     template <typename T>
0106     static void FreeBuf(SCU_Buf<T>& buf );
0107
0108
0109
0110     static void ConfigureLaunch2D( dim3& numBlocks, dim3& threadsPerBlock, int32_t width, int32_t height );
0111 };
0112
0113
0114 template<typename T>
0115 CUdeviceptr SCU::DevicePointerCast( const T* d_ptr ) // static
0116 {
0117     return (CUdeviceptr) (uintptr_t) d_ptr ;
0118 }
0119
0120
0121 inline void SCU::_cudaMalloc( void** p2p, size_t size, const char* label )
0122 {
0123     cudaError_t err = cudaMalloc(p2p, size ) ;
0124     if( err != cudaSuccess )
0125     {
0126         std::stringstream ss;
0127         ss << "CUDA call (" << label << " ) failed with error: '"
0128            << cudaGetErrorString( err )
0129            << "' (" __FILE__ << ":" << __LINE__ << ")\n"
0130            ;
0131
0132         const char* msg = ss.str().c_str() ;
0133         throw CUDA_Exception(msg);
0134     }
0135 }
0136
0137
0138 template<typename T>
0139 inline T* SCU::device_alloc( unsigned num_items, const char* label )
0140 {
0141     size_t size = num_items*sizeof(T) ;
0142
0143     T* d ;
0144     _cudaMalloc( reinterpret_cast<void**>( &d ), size, label );
0145
0146     return d ;
0147 }
0148
0149
0150
0151
0152 /**
0153 SCU::UploadArray
0154 -----------------
0155
0156 Allocate on device and copy from host to device
0157
0158 **/
0159 template <typename T>
0160 inline T* SCU::UploadArray(const T* array, size_t num_item ) // static
0161 {
0162     T* d_array = nullptr ;
0163     CUDA_CHECK( cudaMalloc(reinterpret_cast<void**>( &d_array ), num_item*sizeof(T) ));
0164     CUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( d_array ), array, sizeof(T)*num_item, cudaMemcpyHostToDevice ));
0165     return d_array ;
0166 }
0167
0168 template <typename T>
0169 inline SCU_Buf<T> SCU::UploadBuf(const T* array, size_t num_item, const char* name )
0170 {
0171     T* d_array = UploadArray<T>(array, num_item ) ;
0172     return { d_array, num_item, name } ;
0173 }
0174
0175
0176 /**
0177 SCU::DownloadArray
0178 -------------------
0179
0180 Allocate on host and copy from device to host
0181
0182 **/
0183
0184 template <typename T>
0185 inline T* SCU::DownloadArray(const T* d_array, size_t num_items ) // static
0186 {
0187     T* array = new T[num_items] ;
0188     CUDA_CHECK( cudaMemcpy( array, d_array, sizeof(T)*num_items, cudaMemcpyDeviceToHost ));
0189     return array ;
0190 }
0191
0192 template <typename T>
0193 inline T* SCU::DownloadBuf(const SCU_Buf<T>& buf )
0194 {
0195     return DownloadArray<T>( buf.ptr, buf.num_item );
0196 }
0197
0198
0199 /**
0200 SCU::DownloadVec
0201 -----------------
0202
0203 After CU::DownloadVec
0204
0205 **/
0206
0207 template <typename T>
0208 inline void SCU::DownloadVec(std::vector<T>& vec, const T* d_array, unsigned num_items)  // static
0209 {
0210     unsigned num_bytes = num_items*sizeof(T) ;
0211     vec.clear();
0212     vec.resize(num_items);
0213     CUDA_CHECK( cudaMemcpy( vec.data(), d_array, num_bytes, cudaMemcpyDeviceToHost ));
0214 }
0215
0216 template void SCU::DownloadVec<float>(   std::vector<float>&    vec,  const float* d_array,    unsigned num_items) ;
0217 template void SCU::DownloadVec<unsigned>(std::vector<unsigned>& vec,  const unsigned* d_array, unsigned num_items) ;
0218
0219
0220
0221
0222
0223
0224 template <typename T>
0225 inline void SCU::FreeArray(T* d_array ) // static
0226 {
0227     CUDA_CHECK( cudaFree( reinterpret_cast<void*>( d_array ) ) );
0228 }
0229
0230 template <typename T>
0231 inline void SCU::FreeBuf(SCU_Buf<T>& buf ) // static
0232 {
0233     buf.free() ;
0234 }
0235
0236
0237
0238
0239
0240
0241
0242
0243
0244
0245
0246 inline void SCU::ConfigureLaunch2D( dim3& numBlocks, dim3& threadsPerBlock, int32_t width, int32_t height ) // static
0247 {
0248     threadsPerBlock.x = 16 ;
0249     threadsPerBlock.y = 16 ;
0250     threadsPerBlock.z = 1 ;
0251
0252     numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x ;
0253     numBlocks.y = (height + threadsPerBlock.y - 1) / threadsPerBlock.y ;
0254     numBlocks.z = 1 ;
0255 }
0256
0257 inline std::ostream& operator<<(std::ostream& os, const dim3& v)
0258 {
0259     int w = 6 ;
0260     os
0261        << "("
0262        << std::setw(w) << v.x
0263        << ","
0264        << std::setw(w) << v.y
0265        << ","
0266        << std::setw(w) << v.z
0267        << ") "
0268        ;
0269     return os;
0270 }
0271