eic-opticks/qudarap/QU.cc

0001 #include <cassert>
0002
0003 #include "NP.hh"
0004 #include "SLOG.hh"
0005
0006 #include "spath.h"
0007 #include "sdirectory.h"
0008 #include "scuda.h"
0009 #include "squad.h"
0010 #include "ssys.h"
0011
0012 #ifndef PRODUCTION
0013 #include "srec.h"
0014 #include "sseq.h"
0015 #endif
0016
0017 #include "sphoton.h"
0018 #include "sphotonlite.h"
0019
0020 #include "sevent.h"
0021 #include "salloc.h"
0022 #include "SEventConfig.hh"
0023
0024 #include "QUDA_CHECK.h"
0025 #include "QU.hh"
0026
0027 #include "curand_kernel.h"
0028 #include "qrng.h"
0029 #include "qsim.h"
0030
0031 #include "qbase.h"
0032 #include "qprop.h"
0033 #include "qpmt.h"
0034 #include "qdebug.h"
0035 #include "qscint.h"
0036 #include "qcerenkov.h"
0037 #include "qcurandwrap.h"
0038 #include "scurandref.h"
0039 #include "qmultifilm.h"
0040
0041
0042 const plog::Severity QU::LEVEL = SLOG::EnvLevel("QU", "DEBUG") ;
0043 bool QU::MEMCHECK = ssys::getenvbool(_MEMCHECK);
0044
0045 salloc* QU::alloc = nullptr ;
0046
0047
0048 void QU::alloc_add(const char* label, uint64_t num_items, uint64_t sizeof_item ) // static
0049 {
0050    if(!alloc) alloc = SEventConfig::ALLOC ;
0051    if(alloc ) alloc->add(label, num_items, sizeof_item );
0052 }
0053
0054
0055 template <typename T>
0056 char QU::typecode()
0057 {
0058     char c = '?' ;
0059     switch(sizeof(T))
0060     {
0061         case 4: c = 'f' ; break ;
0062         case 8: c = 'd' ; break ;
0063     }
0064     return c ;
0065 }
0066
0067 template char QU::typecode<float>() ;
0068 template char QU::typecode<double>() ;
0069
0070
0071 template <typename T>
0072 std::string QU::rng_sequence_name(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ioffset ) // static
0073 {
0074     std::stringstream ss ;
0075     ss << prefix
0076        << "_" << QU::typecode<T>()
0077        << "_ni" << ni
0078        << "_nj" << nj
0079        << "_nk" << nk
0080        << "_ioffset" << std::setw(6) << std::setfill('0') << ioffset
0081        << ".npy"
0082        ;
0083
0084     std::string name = ss.str();
0085     return name ;
0086 }
0087
0088 template std::string QU::rng_sequence_name<float>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ioffset ) ;
0089 template std::string QU::rng_sequence_name<double>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ioffset ) ;
0090
0091
0092
0093 template <typename T>
0094 std::string QU::rng_sequence_reldir(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size ) // static
0095 {
0096     std::stringstream ss ;
0097     ss << prefix
0098        << "_" << QU::typecode<T>()
0099        << "_ni" << ni
0100        << "_nj" << nj
0101        << "_nk" << nk
0102        << "_tranche" << ni_tranche_size
0103        ;
0104
0105     std::string reldir = ss.str();
0106     return reldir ;
0107 }
0108
0109 template std::string QU::rng_sequence_reldir<float>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size ) ;
0110 template std::string QU::rng_sequence_reldir<double>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size ) ;
0111
0112
0113
0114
0115 /**
0116 QU::UploadArray
0117 ----------------
0118
0119 Allocate on device and copy from host to device
0120
0121 **/
0122
0123 template <typename T>
0124 T* QU::UploadArray(const T* array, unsigned num_items, const char* label ) // static
0125 {
0126     size_t size = num_items*sizeof(T) ;
0127
0128     LOG(LEVEL)
0129        << " num_items " << num_items
0130        << " size " << size
0131        << " label " << ( label ? label : "-" )
0132        ;
0133
0134     LOG_IF(info, MEMCHECK)
0135        << " num_items " << num_items
0136        << " size " << size
0137        << " label " << ( label ? label : "-" )
0138        ;
0139
0140
0141     alloc_add( label, num_items, sizeof(T) ) ;
0142
0143     T* d_array = nullptr ;
0144     QUDA_CHECK( cudaMalloc(reinterpret_cast<void**>( &d_array ), size ));
0145     QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( d_array ), array, size, cudaMemcpyHostToDevice ));
0146     return d_array ;
0147 }
0148
0149
0150 // IF NEED THESE FROM REMOVE PKG WILL NEED TO QUDARAP_API
0151 template float*         QU::UploadArray<float>(const float* array, unsigned num_items, const char* label ) ;
0152 template double*        QU::UploadArray<double>(const double* array, unsigned num_items, const char* label) ;
0153 template unsigned*      QU::UploadArray<unsigned>(const unsigned* array, unsigned num_items, const char* label) ;
0154 template int*           QU::UploadArray<int>(const int* array, unsigned num_items, const char* label) ;
0155 template quad4*         QU::UploadArray<quad4>(const quad4* array, unsigned num_items, const char* label) ;
0156 template sphoton*       QU::UploadArray<sphoton>(const sphoton* array, unsigned num_items, const char* label) ;
0157 template sphotonlite*   QU::UploadArray<sphotonlite>(const sphotonlite* array, unsigned num_items, const char* label) ;
0158 template quad2*         QU::UploadArray<quad2>(const quad2* array, unsigned num_items, const char* label) ;
0159 template XORWOW*        QU::UploadArray<XORWOW>(const XORWOW* array, unsigned num_items, const char* label) ;
0160 template Philox*        QU::UploadArray<Philox>(const Philox* array, unsigned num_items, const char* label) ;
0161 template qcurandwrap<XORWOW>*   QU::UploadArray<qcurandwrap<XORWOW>>(const qcurandwrap<XORWOW>* array, unsigned num_items, const char* label) ;
0162 template scurandref<XORWOW>*    QU::UploadArray<scurandref<XORWOW>>(const scurandref<XORWOW>* array, unsigned num_items, const char* label) ;
0163 template qsim*          QU::UploadArray<qsim>(const qsim* array, unsigned num_items, const char* label) ;
0164 template qprop<float>*  QU::UploadArray<qprop<float>>(const qprop<float>* array, unsigned num_items, const char* label) ;
0165 template qprop<double>* QU::UploadArray<qprop<double>>(const qprop<double>* array, unsigned num_items, const char* label) ;
0166 template qpmt<float>*   QU::UploadArray<qpmt<float>>(const qpmt<float>* array, unsigned num_items, const char* label) ;
0167 template qpmt<double>*  QU::UploadArray<qpmt<double>>(const qpmt<double>* array, unsigned num_items, const char* label) ;
0168 template qmultifilm*    QU::UploadArray<qmultifilm>(const qmultifilm* array, unsigned num_items, const char* label) ;
0169 template qrng<RNG>*     QU::UploadArray<qrng<RNG>>(const qrng<RNG>* array, unsigned num_items, const char* label) ;
0170 template qbnd*          QU::UploadArray<qbnd>(const qbnd* array, unsigned num_items, const char* label) ;
0171 template sevent*        QU::UploadArray<sevent>(const sevent* array, unsigned num_items, const char* label) ;
0172 template qdebug*        QU::UploadArray<qdebug>(const qdebug* array, unsigned num_items, const char* label) ;
0173 template qscint*        QU::UploadArray<qscint>(const qscint* array, unsigned num_items, const char* label) ;
0174 template qcerenkov*     QU::UploadArray<qcerenkov>(const qcerenkov* array, unsigned num_items, const char* label) ;
0175 template qbase*         QU::UploadArray<qbase>(const qbase* array, unsigned num_items, const char* label) ;
0176
0177
0178
0179 /**
0180 QU::DownloadArray
0181 -------------------
0182
0183 Allocate on host and copy from device to host
0184
0185 **/
0186
0187 template <typename T>
0188 T* QU::DownloadArray(const T* d_array, unsigned num_items ) // static
0189 {
0190     T* array = new T[num_items] ;
0191     QUDA_CHECK( cudaMemcpy( array, d_array, sizeof(T)*num_items, cudaMemcpyDeviceToHost ));
0192     return array ;
0193 }
0194
0195
0196 template  float*         QU::DownloadArray<float>(const float* d_array, unsigned num_items) ;
0197 template  unsigned*      QU::DownloadArray<unsigned>(const unsigned* d_array, unsigned num_items) ;
0198 template  int*           QU::DownloadArray<int>(const int* d_array, unsigned num_items) ;
0199 template  quad4*         QU::DownloadArray<quad4>(const quad4* d_array, unsigned num_items) ;
0200 template  quad2*         QU::DownloadArray<quad2>(const quad2* d_array, unsigned num_items) ;
0201 template  XORWOW*        QU::DownloadArray<XORWOW>(const XORWOW* d_array, unsigned num_items) ;
0202 template  Philox*        QU::DownloadArray<Philox>(const Philox* d_array, unsigned num_items) ;
0203 template  qprop<float>*  QU::DownloadArray<qprop<float>>(const qprop<float>* d_array, unsigned num_items) ;
0204 template  qprop<double>* QU::DownloadArray<qprop<double>>(const qprop<double>* d_array, unsigned num_items) ;
0205
0206
0207 template <typename T>
0208 void QU::Download(std::vector<T>& vec, const T* d_array, unsigned num_items)  // static
0209 {
0210     vec.resize( num_items);
0211     QUDA_CHECK( cudaMemcpy( static_cast<void*>( vec.data() ), d_array, num_items*sizeof(T), cudaMemcpyDeviceToHost));
0212 }
0213
0214
0215 template QUDARAP_API void QU::Download<float>(   std::vector<float>& vec,    const float* d_array,    unsigned num_items);
0216 template QUDARAP_API void QU::Download<unsigned>(std::vector<unsigned>& vec, const unsigned* d_array, unsigned num_items);
0217 template QUDARAP_API void QU::Download<int>(     std::vector<int>& vec,      const int* d_array,      unsigned num_items);
0218 template QUDARAP_API void QU::Download<uchar4>(  std::vector<uchar4>& vec,   const uchar4* d_array,   unsigned num_items);
0219 template QUDARAP_API void QU::Download<float4>(  std::vector<float4>& vec,   const float4* d_array,   unsigned num_items);
0220 template QUDARAP_API void QU::Download<quad4>(   std::vector<quad4>& vec,    const quad4* d_array,    unsigned num_items);
0221
0222
0223
0224 template<typename T>
0225 void QU::device_free_and_alloc(T** dd, unsigned num_items ) // dd: pointer-to-device-pointer
0226 {
0227     size_t size = num_items*sizeof(T) ;
0228     LOG_IF(info, MEMCHECK) << " size " << size << " num_items " << num_items ;
0229
0230     QUDA_CHECK( cudaFree( reinterpret_cast<void*>( *dd ) ) );
0231     QUDA_CHECK( cudaMalloc(reinterpret_cast<void**>( dd ), size ));
0232     assert( *dd );
0233 }
0234
0235
0236 template QUDARAP_API void  QU::device_free_and_alloc<float>(float** dd, unsigned num_items) ;
0237 template QUDARAP_API void  QU::device_free_and_alloc<double>(double** dd, unsigned num_items) ;
0238 template QUDARAP_API void  QU::device_free_and_alloc<unsigned>(unsigned** dd, unsigned num_items) ;
0239 template QUDARAP_API void  QU::device_free_and_alloc<int>(int** dd, unsigned num_items) ;
0240 template QUDARAP_API void  QU::device_free_and_alloc<quad>(quad** dd, unsigned num_items) ;
0241 template QUDARAP_API void  QU::device_free_and_alloc<uchar4>(uchar4** dd, unsigned num_items) ;
0242 template QUDARAP_API void  QU::device_free_and_alloc<float4>(float4** dd, unsigned num_items) ;
0243 template QUDARAP_API void  QU::device_free_and_alloc<quad4>(quad4** dd, unsigned num_items) ;
0244
0245 const char* QU::_cudaMalloc_OOM_NOTES = R"( ;
0246 QU::_cudaMalloc_OOM_NOTES
0247 ==========================
0248
0249 When running with debug arrays, such as the record array, enabled
0250 it is necessary to set max_slot to something reasonable, otherwise with the
0251 default max_slot of zero, it gets set to a high value (eg M197 with 24GB)
0252 appropriate for production running with the available VRAM.
0253
0254 One million is typically reasonable for debugging::
0255
0256    export OPTICKS_MAX_SLOT=M1
0257
0258 )" ;
0259
0260
0261
0262
0263 void QU::_cudaMalloc( void** p2p, size_t size, const char* label )
0264 {
0265     cudaError_t err = cudaMalloc(p2p, size ) ;
0266     if( err != cudaSuccess )
0267     {
0268         const char* out = spath::Resolve("$DefaultOutputDir") ;
0269         salloc* estimate = SEventConfig::AllocEstimate();
0270
0271         std::stringstream ss;
0272         ss << "CUDA call (" << label << " ) failed with error: '"
0273            << cudaGetErrorString( err )
0274            << "' (" __FILE__ << ":" << __LINE__ << ")"
0275            << "\n\n"
0276            << "[SEventConfig::DescEventMode (use of DebugHeavy/DebugLite EventMode with high stats is typical cause of OOM errors)\n"
0277            << SEventConfig::DescEventMode()
0278            << "]SEventConfig::DescEventMode (use of DebugHeavy/DebugLite EventMode with high stats is typical cause of OOM errors)\n"
0279            << "\n\n"
0280            << "[alloc.desc\n"
0281            << ( alloc ? alloc->desc() : "no-alloc" )
0282            << "]alloc.desc\n"
0283            << "\n"
0284            << "[NOTES\n"
0285            << _cudaMalloc_OOM_NOTES
0286            << "]NOTES\n"
0287            << "\n\n"
0288            << "[SEventConfig::AllocEstimate\n"
0289            << ( estimate ? estimate->desc() : "no-estimate" )
0290            << "]SEventConfig::AllocEstimate\n"
0291            << "save salloc record to [" << out << "]\n" ;
0292            ;
0293
0294         std::string msg = ss.str();
0295         LOG(error) << msg ;
0296
0297         sdirectory::MakeDirs(out,0);
0298         alloc->save(out) ;
0299
0300         throw QUDA_Exception( msg.c_str() );
0301     }
0302 }
0303
0304
0305 template<typename T>
0306 T* QU::device_alloc( unsigned num_items, const char* label )
0307 {
0308     size_t size = num_items*sizeof(T) ;
0309
0310     LOG(LEVEL)
0311         << " num_items " << std::setw(10) << num_items
0312         << " size " << std::setw(10) << size
0313         << " label " << std::setw(15) << label
0314         ;
0315
0316     LOG_IF(info, MEMCHECK)
0317         << " num_items " << std::setw(10) << num_items
0318         << " size " << std::setw(10) << size
0319         << " label " << std::setw(15) << label
0320         ;
0321
0322
0323     alloc_add( label, num_items, sizeof(T) ) ;
0324
0325     T* d ;
0326     _cudaMalloc( reinterpret_cast<void**>( &d ), size, label );
0327
0328     return d ;
0329 }
0330
0331 template QUDARAP_API float*     QU::device_alloc<float>(unsigned num_items, const char* label) ;
0332 template QUDARAP_API double*    QU::device_alloc<double>(unsigned num_items, const char* label) ;
0333 template QUDARAP_API unsigned*  QU::device_alloc<unsigned>(unsigned num_items, const char* label) ;
0334 template QUDARAP_API int*       QU::device_alloc<int>(unsigned num_items, const char* label) ;
0335 template QUDARAP_API uchar4*    QU::device_alloc<uchar4>(unsigned num_items, const char* label) ;
0336 template QUDARAP_API float4*    QU::device_alloc<float4>(unsigned num_items, const char* label) ;
0337 template QUDARAP_API quad*      QU::device_alloc<quad>(unsigned num_items, const char* label) ;
0338 template QUDARAP_API quad2*     QU::device_alloc<quad2>(unsigned num_items, const char* label) ;
0339 template QUDARAP_API quad4*     QU::device_alloc<quad4>(unsigned num_items, const char* label) ;
0340 template QUDARAP_API quad6*     QU::device_alloc<quad6>(unsigned num_items, const char* label) ;
0341 template QUDARAP_API sevent*    QU::device_alloc<sevent>(unsigned num_items, const char* label) ;
0342 template QUDARAP_API qdebug*    QU::device_alloc<qdebug>(unsigned num_items, const char* label) ;
0343 template QUDARAP_API sstate*    QU::device_alloc<sstate>(unsigned num_items, const char* label) ;
0344 template QUDARAP_API XORWOW*    QU::device_alloc<XORWOW>(unsigned num_items, const char* label) ;
0345 template QUDARAP_API Philox*    QU::device_alloc<Philox>(unsigned num_items, const char* label) ;
0346
0347 #ifndef PRODUCTION
0348 template QUDARAP_API srec*      QU::device_alloc<srec>(unsigned num_items, const char* label) ;
0349 template QUDARAP_API sseq*      QU::device_alloc<sseq>(unsigned num_items, const char* label) ;
0350 #endif
0351
0352 template QUDARAP_API sphoton*   QU::device_alloc<sphoton>(unsigned num_items, const char* label) ;
0353 template QUDARAP_API sphotonlite*   QU::device_alloc<sphotonlite>(unsigned num_items, const char* label) ;
0354
0355
0356 template<typename T>
0357 T* QU::device_alloc_zero(unsigned num_items, const char* label)
0358 {
0359     size_t size = num_items*sizeof(T) ;
0360
0361     LOG(LEVEL)
0362         << " num_items " << std::setw(10) << num_items
0363         << " sizeof(T) " << std::setw(10) << sizeof(T)
0364         << " size " << std::setw(10) << size
0365         << " label " << std::setw(15) << label
0366         ;
0367
0368     LOG_IF(info, MEMCHECK)
0369         << " num_items " << std::setw(10) << num_items
0370         << " sizeof(T) " << std::setw(10) << sizeof(T)
0371         << " size " << std::setw(10) << size
0372         << " label " << std::setw(15) << label
0373         ;
0374
0375
0376     alloc_add( label, num_items, sizeof(T) ) ;
0377
0378     T* d ;
0379     _cudaMalloc( reinterpret_cast<void**>( &d ), size, label );
0380
0381     int value = 0 ;
0382     QUDA_CHECK( cudaMemset(d, value, size ));
0383
0384     return d ;
0385 }
0386
0387 template QUDARAP_API sphoton*   QU::device_alloc_zero<sphoton>(unsigned num_items, const char* label) ;
0388 template QUDARAP_API sphotonlite*   QU::device_alloc_zero<sphotonlite>(unsigned num_items, const char* label) ;
0389 template QUDARAP_API quad2*     QU::device_alloc_zero<quad2>(  unsigned num_items, const char* label) ;
0390 template QUDARAP_API XORWOW*    QU::device_alloc_zero<XORWOW>(  unsigned num_items, const char* label) ;
0391 template QUDARAP_API Philox*    QU::device_alloc_zero<Philox>(  unsigned num_items, const char* label) ;
0392
0393 #ifndef PRODUCTION
0394 template QUDARAP_API srec*      QU::device_alloc_zero<srec>(   unsigned num_items, const char* label) ;
0395 template QUDARAP_API sseq*      QU::device_alloc_zero<sseq>(   unsigned num_items, const char* label) ;
0396 template QUDARAP_API stag*      QU::device_alloc_zero<stag>(   unsigned num_items, const char* label) ;
0397 template QUDARAP_API sflat*     QU::device_alloc_zero<sflat>(  unsigned num_items, const char* label) ;
0398 #endif
0399
0400
0401
0402
0403 template<typename T>
0404 void QU::device_memset( T* d, int value, unsigned num_items )
0405 {
0406     size_t size = num_items*sizeof(T) ;
0407
0408     LOG_IF(info, MEMCHECK)
0409         << " num_items " << std::setw(10) << num_items
0410         << " sizeof(T) " << std::setw(10) << sizeof(T)
0411         << " size " << std::setw(10) << size
0412         ;
0413
0414     QUDA_CHECK( cudaMemset(d, value, size ));
0415 }
0416
0417 template QUDARAP_API void     QU::device_memset<int>(int*, int, unsigned ) ;
0418 template QUDARAP_API void     QU::device_memset<quad4>(quad4*, int, unsigned ) ;
0419 template QUDARAP_API void     QU::device_memset<quad6>(quad6*, int, unsigned ) ;
0420 template QUDARAP_API void     QU::device_memset<sphoton>(sphoton*, int, unsigned ) ;
0421 template QUDARAP_API void     QU::device_memset<sphotonlite>(sphotonlite*, int, unsigned ) ;
0422
0423
0424
0425
0426
0427
0428
0429
0430
0431
0432 template<typename T>
0433 void QU::device_free( T* d)
0434 {
0435     LOG_IF(info, MEMCHECK) ;
0436     // HMM: could use salloc to find the label ?
0437
0438     QUDA_CHECK( cudaFree(d) );
0439 }
0440
0441 template QUDARAP_API void   QU::device_free<float>(float*) ;
0442 template QUDARAP_API void   QU::device_free<double>(double*) ;
0443 template QUDARAP_API void   QU::device_free<unsigned>(unsigned*) ;
0444 template QUDARAP_API void   QU::device_free<quad2>(quad2*) ;
0445 template QUDARAP_API void   QU::device_free<quad4>(quad4*) ;
0446 template QUDARAP_API void   QU::device_free<sphoton>(sphoton*) ;
0447 template QUDARAP_API void   QU::device_free<sphotonlite>(sphotonlite*) ;
0448 template QUDARAP_API void   QU::device_free<uchar4>(uchar4*) ;
0449 template QUDARAP_API void   QU::device_free<XORWOW>(XORWOW*) ;
0450 template QUDARAP_API void   QU::device_free<Philox>(Philox*) ;
0451
0452
0453 template<typename T>
0454 int QU::copy_device_to_host( T* h, T* d,  unsigned num_items)
0455 {
0456     if( d == nullptr ) std::cerr
0457         << "QU::copy_device_to_host"
0458         << " ERROR : device pointer is null "
0459         << std::endl
0460         ;
0461
0462     if( d == nullptr ) return 1 ;
0463
0464     size_t size = num_items*sizeof(T) ;
0465     QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( h ), d , size, cudaMemcpyDeviceToHost ));
0466
0467     return 0 ;
0468 }
0469
0470
0471 template int QU::copy_device_to_host<int>(  int* h, int* d,  unsigned num_items);
0472 template int QU::copy_device_to_host<float>(  float* h, float* d,  unsigned num_items);
0473 template int QU::copy_device_to_host<double>( double* h, double* d,  unsigned num_items);
0474 template int QU::copy_device_to_host<quad>( quad* h, quad* d,  unsigned num_items);
0475 template int QU::copy_device_to_host<quad2>( quad2* h, quad2* d,  unsigned num_items);
0476 template int QU::copy_device_to_host<quad4>( quad4* h, quad4* d,  unsigned num_items);
0477 template int QU::copy_device_to_host<sphoton>( sphoton* h, sphoton* d,  unsigned num_items);
0478 template int QU::copy_device_to_host<sphotonlite>( sphotonlite* h, sphotonlite* d,  unsigned num_items);
0479 template int QU::copy_device_to_host<quad6>( quad6* h, quad6* d,  unsigned num_items);
0480 template int QU::copy_device_to_host<sstate>( sstate* h, sstate* d,  unsigned num_items);
0481 template int QU::copy_device_to_host<XORWOW>( XORWOW* h, XORWOW* d,  unsigned num_items);
0482 template int QU::copy_device_to_host<Philox>( Philox* h, Philox* d,  unsigned num_items);
0483 #ifndef PRODUCTION
0484 template int QU::copy_device_to_host<srec>( srec* h, srec* d,  unsigned num_items);
0485 template int QU::copy_device_to_host<sseq>( sseq* h, sseq* d,  unsigned num_items);
0486 template int QU::copy_device_to_host<stag>( stag* h, stag* d,  unsigned num_items);
0487 template int QU::copy_device_to_host<sflat>( sflat* h, sflat* d,  unsigned num_items);
0488 #endif
0489
0490
0491 /**
0492 QU::copy_device_to_host_and_free
0493 ----------------------------------
0494
0495 * Summary: when you get cudaMemcpy copyback errors look for infinite loops in kernels
0496 * Find the problem by doing things like adding loop limiters
0497
0498
0499 Normally the problem is not related to the copying but rather some issue
0500 with the kernel being called. So start by doing "binary" search
0501 simplifying the kernel to find where the issue is.
0502
0503 When a kernel misbehaves, such as going into an infinite loop for example, the
0504 connection to the GPU will typically timeout. Subsequent attempts to copyback arrays that
0505 should have been written by the kernel would then fail during the cudaMemcpy
0506 presumably because the CUDA context is lost as a result of the timeout making
0507 all the device pointers invalid. The copyback is the usual thing to fail because
0508 it is the normally the first thing to use the stale pointers after the kernel launch.
0509
0510
0511 Debug tip 0
0512 ~~~~~~~~~~~~~
0513
0514 Simply add "return 0" to call with issue, and
0515 progressivley move that forwards to find where
0516 the issue is.
0517
0518
0519 Debug tip 1 : check kernel inputs
0520 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0521
0522 Instead of doing whatever computation in the kernel,
0523 populate the output array with the inputs.
0524 This checks both having expected inputs at the kernel
0525 and the copy out machinery.
0526
0527 Debug tip 2 : check intermediate kernel results
0528 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0529
0530 Intead of doing the full kernel calculation, check the
0531 first half of the calculation by copying intermediate
0532 results into the output array.
0533
0534
0535 **/
0536
0537 template<typename T>
0538 void QU::copy_device_to_host_and_free( T* h, T* d,  unsigned num_items, const char* label)
0539 {
0540     size_t size = num_items*sizeof(T) ;
0541     LOG(LEVEL)
0542         << "copy " << num_items
0543         << " sizeof(T) " << sizeof(T)
0544         << " label " << ( label ? label : "-" )
0545         ;
0546
0547     QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( h ), d , size, cudaMemcpyDeviceToHost ));
0548     QUDA_CHECK( cudaFree(d) );
0549 }
0550
0551
0552 template void QU::copy_device_to_host_and_free<float>(  float* h, float* d,  unsigned num_items, const char* label );
0553 template void QU::copy_device_to_host_and_free<double>( double* h, double* d,  unsigned num_items, const char* label);
0554 template void QU::copy_device_to_host_and_free<quad>( quad* h, quad* d,  unsigned num_items, const char* label);
0555 template void QU::copy_device_to_host_and_free<quad2>( quad2* h, quad2* d,  unsigned num_items, const char* label);
0556 template void QU::copy_device_to_host_and_free<quad4>( quad4* h, quad4* d,  unsigned num_items, const char* label);
0557 template void QU::copy_device_to_host_and_free<sphoton>( sphoton* h, sphoton* d,  unsigned num_items, const char* label);
0558 template void QU::copy_device_to_host_and_free<sphotonlite>( sphotonlite* h, sphotonlite* d,  unsigned num_items, const char* label);
0559 template void QU::copy_device_to_host_and_free<quad6>( quad6* h, quad6* d,  unsigned num_items, const char* label);
0560 template void QU::copy_device_to_host_and_free<sstate>( sstate* h, sstate* d,  unsigned num_items, const char* label);
0561
0562
0563
0564
0565
0566
0567
0568
0569
0570
0571
0572
0573 template<typename T>
0574 void QU::copy_host_to_device( T* d, const T* h, unsigned num_items)
0575 {
0576     size_t size = num_items*sizeof(T) ;
0577     QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( d ), h , size, cudaMemcpyHostToDevice ));
0578 }
0579
0580 template void QU::copy_host_to_device<float>(    float* d,   const float* h, unsigned num_items);
0581 template void QU::copy_host_to_device<double>(   double* d,  const double* h, unsigned num_items);
0582 template void QU::copy_host_to_device<unsigned>( unsigned* d, const unsigned* h, unsigned num_items);
0583 template void QU::copy_host_to_device<sevent>(   sevent* d,   const sevent* h, unsigned num_items);
0584 template void QU::copy_host_to_device<quad4>(    quad4* d,    const quad4* h, unsigned num_items);
0585 template void QU::copy_host_to_device<sphoton>(  sphoton* d,  const sphoton* h, unsigned num_items);
0586 template void QU::copy_host_to_device<sphotonlite>(  sphotonlite* d,  const sphotonlite* h, unsigned num_items);
0587 template void QU::copy_host_to_device<quad6>(    quad6* d,    const quad6* h, unsigned num_items);
0588 template void QU::copy_host_to_device<quad2>(    quad2* d,    const quad2* h, unsigned num_items);
0589 template void QU::copy_host_to_device<XORWOW>(   XORWOW* d,   const XORWOW* h,   unsigned num_items);
0590 template void QU::copy_host_to_device<Philox>(   Philox* d,   const Philox* h,   unsigned num_items);
0591
0592 /**
0593 QU::NumItems
0594 ---------------
0595
0596 Apply heuristics to determine the number of intended GPU buffer items
0597 using the size of the template type and the shape of the NP array.
0598
0599 **/
0600
0601 template <typename T>
0602 unsigned QU::NumItems( const NP* a )
0603 {
0604     unsigned num_items = 0 ;
0605
0606     if( sizeof(T) == sizeof(float)*6*4 )   // looks like quad6
0607     {
0608         if(a->shape.size() == 3 )
0609         {
0610             assert( a->has_shape( -1, 6, 4) );
0611             num_items = a->shape[0] ;
0612         }
0613     }
0614     else if( sizeof(T) == sizeof(float)*4*4 )   // looks like quad4
0615     {
0616         if(a->shape.size() == 3 )
0617         {
0618             assert( a->has_shape( -1, 4, 4) );
0619             num_items = a->shape[0] ;
0620         }
0621         else if(a->shape.size() == 4 )
0622         {
0623             assert( a->shape[2] == 2 && a->shape[3] == 4 );
0624             num_items = a->shape[0]*a->shape[1] ;
0625         }
0626     }
0627     else if( sizeof(T) == sizeof(float)*4*2 ) // looks like quad2
0628     {
0629         if(a->shape.size() == 3 )
0630         {
0631             assert( a->has_shape( -1, 2, 4) );
0632             num_items = a->shape[0] ;
0633         }
0634         else if(a->shape.size() == 4 )
0635         {
0636             assert( a->shape[2] == 2 && a->shape[3] == 4 );
0637             num_items = a->shape[0]*a->shape[1] ;
0638         }
0639     }
0640     return num_items ;
0641 }
0642
0643 template unsigned QU::NumItems<quad2>(const NP* );
0644 template unsigned QU::NumItems<quad4>(const NP* );
0645 template unsigned QU::NumItems<quad6>(const NP* );
0646
0647
0648 /**
0649 QU::copy_host_to_device
0650 ------------------------
0651
0652 HMM: encapsulating determination of num_items is less useful than
0653 would initially expect because will always need to know
0654 and record the num_items in a shared GPU/CPU location like sevent.
0655 And also will often need to allocate the buffer first too.
0656
0657 Suggesting should generally use this via QEvt.
0658
0659 **/
0660
0661 template <typename T>
0662 unsigned QU::copy_host_to_device( T* d, const NP* a)
0663 {
0664     unsigned num_items = NumItems<T>(a);
0665     if( num_items == 0 )
0666     {
0667         LOG(fatal) << " failed to devine num_items for array " << a->sstr() << " with template type where sizeof(T) " << sizeof(T) ;
0668     }
0669
0670     if( num_items > 0 )
0671     {
0672         copy_host_to_device( d, (T*)a->bytes(), num_items );
0673     }
0674     return num_items ;
0675 }
0676
0677 template unsigned QU::copy_host_to_device<quad2>( quad2* , const NP* );
0678 template unsigned QU::copy_host_to_device<quad4>( quad4* , const NP* );
0679 template unsigned QU::copy_host_to_device<quad6>( quad6* , const NP* );
0680
0681
0682
0683
0684
0685 /**
0686 QU::ConfigureLaunch
0687 ---------------------
0688
0689
0690
0691
0692 **/
0693
0694 void QU::ConfigureLaunch( dim3& numBlocks, dim3& threadsPerBlock, unsigned width, unsigned height ) // static
0695 {
0696     threadsPerBlock.x = 512 ;
0697     threadsPerBlock.y = 1 ;
0698     threadsPerBlock.z = 1 ;
0699
0700     numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x ;
0701     numBlocks.y = (height + threadsPerBlock.y - 1) / threadsPerBlock.y ;
0702     numBlocks.z = 1 ;
0703
0704     // hmm this looks to not handle height other than 1
0705 }
0706
0707 void QU::ConfigureLaunch1D( dim3& numBlocks, dim3& threadsPerBlock, unsigned num, unsigned threads_per_block ) // static
0708 {
0709     threadsPerBlock.x = threads_per_block ;
0710     threadsPerBlock.y = 1 ;
0711     threadsPerBlock.z = 1 ;
0712
0713     numBlocks.x = (num + threadsPerBlock.x - 1) / threadsPerBlock.x ;
0714     numBlocks.y = 1 ;
0715     numBlocks.z = 1 ;
0716 }
0717
0718
0719
0720 void QU::ConfigureLaunch2D( dim3& numBlocks, dim3& threadsPerBlock, unsigned width, unsigned height ) // static
0721 {
0722     threadsPerBlock.x = 16 ;
0723     threadsPerBlock.y = 16 ;
0724     threadsPerBlock.z = 1 ;
0725
0726     numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x ;
0727     numBlocks.y = (height + threadsPerBlock.y - 1) / threadsPerBlock.y ;
0728     numBlocks.z = 1 ;
0729 }
0730
0731
0732 void QU::ConfigureLaunch16( dim3& numBlocks, dim3& threadsPerBlock ) // static
0733 {
0734     threadsPerBlock.x = 16 ;
0735     threadsPerBlock.y = 1 ;
0736     threadsPerBlock.z = 1 ;
0737
0738     numBlocks.x = 1 ;
0739     numBlocks.y = 1 ;
0740     numBlocks.z = 1 ;
0741 }
0742
0743
0744 std::string QU::Desc(const dim3& d, int w) // static
0745 {
0746     std::stringstream ss ;
0747     ss << "( "
0748         << std::setw(w) << d.x
0749         << " "
0750         << std::setw(w) << d.y
0751         << " "
0752         << std::setw(w) << d.z
0753         << ")"
0754         ;
0755     std::string s = ss.str();
0756     return s ;
0757 }
0758
0759 std::string QU::DescLaunch( const dim3& numBlocks, const dim3& threadsPerBlock ) // static
0760 {
0761     std::stringstream ss ;
0762     ss
0763         << " numBlocks " << Desc(numBlocks,4)
0764         << " threadsPerBlock " << Desc(threadsPerBlock, 4)
0765         ;
0766     std::string s = ss.str();
0767     return s ;
0768 }
0769