Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-09 07:49:11

0001 #include <cassert>
0002 
0003 #include "NP.hh"
0004 #include "SLOG.hh"
0005 
0006 #include "spath.h"
0007 #include "sdirectory.h"
0008 #include "scuda.h"
0009 #include "squad.h"
0010 #include "ssys.h"
0011 
0012 #ifndef PRODUCTION
0013 #include "srec.h"
0014 #include "sseq.h"
0015 #endif
0016 
0017 #include "sphoton.h"
0018 #include "sphotonlite.h"
0019 
0020 #include "sevent.h"
0021 #include "salloc.h"
0022 #include "SEventConfig.hh"
0023 
0024 #include "QUDA_CHECK.h"
0025 #include "QU.hh"
0026 
0027 #include "curand_kernel.h"
0028 #include "qrng.h"
0029 #include "qsim.h"
0030 
0031 #include "qbase.h"
0032 #include "qprop.h"
0033 #include "qpmt.h"
0034 #include "qdebug.h"
0035 #include "qscint.h"
0036 #include "qcerenkov.h"
0037 #include "qcurandwrap.h"
0038 #include "scurandref.h"
0039 #include "qmultifilm.h"
0040 
0041 
0042 const plog::Severity QU::LEVEL = SLOG::EnvLevel("QU", "DEBUG") ;
0043 bool QU::MEMCHECK = ssys::getenvbool(_MEMCHECK);
0044 
0045 salloc* QU::alloc = nullptr ;
0046 
0047 
0048 void QU::alloc_add(const char* label, uint64_t num_items, uint64_t sizeof_item ) // static
0049 {
0050    if(!alloc) alloc = SEventConfig::ALLOC ;
0051    if(alloc ) alloc->add(label, num_items, sizeof_item );
0052 }
0053 
0054 
0055 template <typename T>
0056 char QU::typecode()
0057 {
0058     char c = '?' ;
0059     switch(sizeof(T))
0060     {
0061         case 4: c = 'f' ; break ;
0062         case 8: c = 'd' ; break ;
0063     }
0064     return c ;
0065 }
0066 
0067 template char QU::typecode<float>() ;
0068 template char QU::typecode<double>() ;
0069 
0070 
0071 template <typename T>
0072 std::string QU::rng_sequence_name(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ioffset ) // static
0073 {
0074     std::stringstream ss ;
0075     ss << prefix
0076        << "_" << QU::typecode<T>()
0077        << "_ni" << ni
0078        << "_nj" << nj
0079        << "_nk" << nk
0080        << "_ioffset" << std::setw(6) << std::setfill('0') << ioffset
0081        << ".npy"
0082        ;
0083 
0084     std::string name = ss.str();
0085     return name ;
0086 }
0087 
0088 template std::string QU::rng_sequence_name<float>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ioffset ) ;
0089 template std::string QU::rng_sequence_name<double>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ioffset ) ;
0090 
0091 
0092 
0093 template <typename T>
0094 std::string QU::rng_sequence_reldir(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size ) // static
0095 {
0096     std::stringstream ss ;
0097     ss << prefix
0098        << "_" << QU::typecode<T>()
0099        << "_ni" << ni
0100        << "_nj" << nj
0101        << "_nk" << nk
0102        << "_tranche" << ni_tranche_size
0103        ;
0104 
0105     std::string reldir = ss.str();
0106     return reldir ;
0107 }
0108 
0109 template std::string QU::rng_sequence_reldir<float>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size ) ;
0110 template std::string QU::rng_sequence_reldir<double>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size ) ;
0111 
0112 
0113 
0114 
0115 /**
0116 QU::UploadArray
0117 ----------------
0118 
0119 Allocate on device and copy from host to device
0120 
0121 **/
0122 
0123 template <typename T>
0124 T* QU::UploadArray(const T* array, unsigned num_items, const char* label ) // static
0125 {
0126     size_t size = num_items*sizeof(T) ;
0127 
0128     LOG(LEVEL)
0129        << " num_items " << num_items
0130        << " size " << size
0131        << " label " << ( label ? label : "-" )
0132        ;
0133 
0134     LOG_IF(info, MEMCHECK)
0135        << " num_items " << num_items
0136        << " size " << size
0137        << " label " << ( label ? label : "-" )
0138        ;
0139 
0140 
0141     alloc_add( label, num_items, sizeof(T) ) ;
0142 
0143     T* d_array = nullptr ;
0144     QUDA_CHECK( cudaMalloc(reinterpret_cast<void**>( &d_array ), size ));
0145     QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( d_array ), array, size, cudaMemcpyHostToDevice ));
0146     return d_array ;
0147 }
0148 
0149 
0150 // IF NEED THESE FROM REMOVE PKG WILL NEED TO QUDARAP_API
0151 template float*         QU::UploadArray<float>(const float* array, unsigned num_items, const char* label ) ;
0152 template double*        QU::UploadArray<double>(const double* array, unsigned num_items, const char* label) ;
0153 template unsigned*      QU::UploadArray<unsigned>(const unsigned* array, unsigned num_items, const char* label) ;
0154 template int*           QU::UploadArray<int>(const int* array, unsigned num_items, const char* label) ;
0155 template quad4*         QU::UploadArray<quad4>(const quad4* array, unsigned num_items, const char* label) ;
0156 template sphoton*       QU::UploadArray<sphoton>(const sphoton* array, unsigned num_items, const char* label) ;
0157 template sphotonlite*   QU::UploadArray<sphotonlite>(const sphotonlite* array, unsigned num_items, const char* label) ;
0158 template quad2*         QU::UploadArray<quad2>(const quad2* array, unsigned num_items, const char* label) ;
0159 template XORWOW*        QU::UploadArray<XORWOW>(const XORWOW* array, unsigned num_items, const char* label) ;
0160 template Philox*        QU::UploadArray<Philox>(const Philox* array, unsigned num_items, const char* label) ;
0161 template qcurandwrap<XORWOW>*   QU::UploadArray<qcurandwrap<XORWOW>>(const qcurandwrap<XORWOW>* array, unsigned num_items, const char* label) ;
0162 template scurandref<XORWOW>*    QU::UploadArray<scurandref<XORWOW>>(const scurandref<XORWOW>* array, unsigned num_items, const char* label) ;
0163 template qsim*          QU::UploadArray<qsim>(const qsim* array, unsigned num_items, const char* label) ;
0164 template qprop<float>*  QU::UploadArray<qprop<float>>(const qprop<float>* array, unsigned num_items, const char* label) ;
0165 template qprop<double>* QU::UploadArray<qprop<double>>(const qprop<double>* array, unsigned num_items, const char* label) ;
0166 template qpmt<float>*   QU::UploadArray<qpmt<float>>(const qpmt<float>* array, unsigned num_items, const char* label) ;
0167 template qpmt<double>*  QU::UploadArray<qpmt<double>>(const qpmt<double>* array, unsigned num_items, const char* label) ;
0168 template qmultifilm*    QU::UploadArray<qmultifilm>(const qmultifilm* array, unsigned num_items, const char* label) ;
0169 template qrng<RNG>*     QU::UploadArray<qrng<RNG>>(const qrng<RNG>* array, unsigned num_items, const char* label) ;
0170 template qbnd*          QU::UploadArray<qbnd>(const qbnd* array, unsigned num_items, const char* label) ;
0171 template sevent*        QU::UploadArray<sevent>(const sevent* array, unsigned num_items, const char* label) ;
0172 template qdebug*        QU::UploadArray<qdebug>(const qdebug* array, unsigned num_items, const char* label) ;
0173 template qscint*        QU::UploadArray<qscint>(const qscint* array, unsigned num_items, const char* label) ;
0174 template qcerenkov*     QU::UploadArray<qcerenkov>(const qcerenkov* array, unsigned num_items, const char* label) ;
0175 template qbase*         QU::UploadArray<qbase>(const qbase* array, unsigned num_items, const char* label) ;
0176 
0177 
0178 
0179 /**
0180 QU::DownloadArray
0181 -------------------
0182 
0183 Allocate on host and copy from device to host
0184 
0185 **/
0186 
0187 template <typename T>
0188 T* QU::DownloadArray(const T* d_array, unsigned num_items ) // static
0189 {
0190     T* array = new T[num_items] ;
0191     QUDA_CHECK( cudaMemcpy( array, d_array, sizeof(T)*num_items, cudaMemcpyDeviceToHost ));
0192     return array ;
0193 }
0194 
0195 
0196 template  float*         QU::DownloadArray<float>(const float* d_array, unsigned num_items) ;
0197 template  unsigned*      QU::DownloadArray<unsigned>(const unsigned* d_array, unsigned num_items) ;
0198 template  int*           QU::DownloadArray<int>(const int* d_array, unsigned num_items) ;
0199 template  quad4*         QU::DownloadArray<quad4>(const quad4* d_array, unsigned num_items) ;
0200 template  quad2*         QU::DownloadArray<quad2>(const quad2* d_array, unsigned num_items) ;
0201 template  XORWOW*        QU::DownloadArray<XORWOW>(const XORWOW* d_array, unsigned num_items) ;
0202 template  Philox*        QU::DownloadArray<Philox>(const Philox* d_array, unsigned num_items) ;
0203 template  qprop<float>*  QU::DownloadArray<qprop<float>>(const qprop<float>* d_array, unsigned num_items) ;
0204 template  qprop<double>* QU::DownloadArray<qprop<double>>(const qprop<double>* d_array, unsigned num_items) ;
0205 
0206 
0207 template <typename T>
0208 void QU::Download(std::vector<T>& vec, const T* d_array, unsigned num_items)  // static
0209 {
0210     vec.resize( num_items);
0211     QUDA_CHECK( cudaMemcpy( static_cast<void*>( vec.data() ), d_array, num_items*sizeof(T), cudaMemcpyDeviceToHost));
0212 }
0213 
0214 
0215 template QUDARAP_API void QU::Download<float>(   std::vector<float>& vec,    const float* d_array,    unsigned num_items);
0216 template QUDARAP_API void QU::Download<unsigned>(std::vector<unsigned>& vec, const unsigned* d_array, unsigned num_items);
0217 template QUDARAP_API void QU::Download<int>(     std::vector<int>& vec,      const int* d_array,      unsigned num_items);
0218 template QUDARAP_API void QU::Download<uchar4>(  std::vector<uchar4>& vec,   const uchar4* d_array,   unsigned num_items);
0219 template QUDARAP_API void QU::Download<float4>(  std::vector<float4>& vec,   const float4* d_array,   unsigned num_items);
0220 template QUDARAP_API void QU::Download<quad4>(   std::vector<quad4>& vec,    const quad4* d_array,    unsigned num_items);
0221 
0222 
0223 
0224 template<typename T>
0225 void QU::device_free_and_alloc(T** dd, unsigned num_items ) // dd: pointer-to-device-pointer
0226 {
0227     size_t size = num_items*sizeof(T) ;
0228     LOG_IF(info, MEMCHECK) << " size " << size << " num_items " << num_items ;
0229 
0230     QUDA_CHECK( cudaFree( reinterpret_cast<void*>( *dd ) ) );
0231     QUDA_CHECK( cudaMalloc(reinterpret_cast<void**>( dd ), size ));
0232     assert( *dd );
0233 }
0234 
0235 
0236 template QUDARAP_API void  QU::device_free_and_alloc<float>(float** dd, unsigned num_items) ;
0237 template QUDARAP_API void  QU::device_free_and_alloc<double>(double** dd, unsigned num_items) ;
0238 template QUDARAP_API void  QU::device_free_and_alloc<unsigned>(unsigned** dd, unsigned num_items) ;
0239 template QUDARAP_API void  QU::device_free_and_alloc<int>(int** dd, unsigned num_items) ;
0240 template QUDARAP_API void  QU::device_free_and_alloc<quad>(quad** dd, unsigned num_items) ;
0241 template QUDARAP_API void  QU::device_free_and_alloc<uchar4>(uchar4** dd, unsigned num_items) ;
0242 template QUDARAP_API void  QU::device_free_and_alloc<float4>(float4** dd, unsigned num_items) ;
0243 template QUDARAP_API void  QU::device_free_and_alloc<quad4>(quad4** dd, unsigned num_items) ;
0244 
0245 const char* QU::_cudaMalloc_OOM_NOTES = R"( ;
0246 QU::_cudaMalloc_OOM_NOTES
0247 ==========================
0248 
0249 When running with debug arrays, such as the record array, enabled
0250 it is necessary to set max_slot to something reasonable, otherwise with the
0251 default max_slot of zero, it gets set to a high value (eg M197 with 24GB)
0252 appropriate for production running with the available VRAM.
0253 
0254 One million is typically reasonable for debugging::
0255 
0256    export OPTICKS_MAX_SLOT=M1
0257 
0258 )" ;
0259 
0260 
0261 
0262 
0263 void QU::_cudaMalloc( void** p2p, size_t size, const char* label )
0264 {
0265     cudaError_t err = cudaMalloc(p2p, size ) ;
0266     if( err != cudaSuccess )
0267     {
0268         const char* out = spath::Resolve("$DefaultOutputDir") ;
0269         salloc* estimate = SEventConfig::AllocEstimate();
0270 
0271         std::stringstream ss;
0272         ss << "CUDA call (" << label << " ) failed with error: '"
0273            << cudaGetErrorString( err )
0274            << "' (" __FILE__ << ":" << __LINE__ << ")"
0275            << "\n\n"
0276            << "[SEventConfig::DescEventMode (use of DebugHeavy/DebugLite EventMode with high stats is typical cause of OOM errors)\n"
0277            << SEventConfig::DescEventMode()
0278            << "]SEventConfig::DescEventMode (use of DebugHeavy/DebugLite EventMode with high stats is typical cause of OOM errors)\n"
0279            << "\n\n"
0280            << "[alloc.desc\n"
0281            << ( alloc ? alloc->desc() : "no-alloc" )
0282            << "]alloc.desc\n"
0283            << "\n"
0284            << "[NOTES\n"
0285            << _cudaMalloc_OOM_NOTES
0286            << "]NOTES\n"
0287            << "\n\n"
0288            << "[SEventConfig::AllocEstimate\n"
0289            << ( estimate ? estimate->desc() : "no-estimate" )
0290            << "]SEventConfig::AllocEstimate\n"
0291            << "save salloc record to [" << out << "]\n" ;
0292            ;
0293 
0294         std::string msg = ss.str();
0295         LOG(error) << msg ;
0296 
0297         sdirectory::MakeDirs(out,0);
0298         alloc->save(out) ;
0299 
0300         throw QUDA_Exception( msg.c_str() );
0301     }
0302 }
0303 
0304 
0305 template<typename T>
0306 T* QU::device_alloc( unsigned num_items, const char* label )
0307 {
0308     size_t size = num_items*sizeof(T) ;
0309 
0310     LOG(LEVEL)
0311         << " num_items " << std::setw(10) << num_items
0312         << " size " << std::setw(10) << size
0313         << " label " << std::setw(15) << label
0314         ;
0315 
0316     LOG_IF(info, MEMCHECK)
0317         << " num_items " << std::setw(10) << num_items
0318         << " size " << std::setw(10) << size
0319         << " label " << std::setw(15) << label
0320         ;
0321 
0322 
0323     alloc_add( label, num_items, sizeof(T) ) ;
0324 
0325     T* d ;
0326     _cudaMalloc( reinterpret_cast<void**>( &d ), size, label );
0327 
0328     return d ;
0329 }
0330 
0331 template QUDARAP_API float*     QU::device_alloc<float>(unsigned num_items, const char* label) ;
0332 template QUDARAP_API double*    QU::device_alloc<double>(unsigned num_items, const char* label) ;
0333 template QUDARAP_API unsigned*  QU::device_alloc<unsigned>(unsigned num_items, const char* label) ;
0334 template QUDARAP_API int*       QU::device_alloc<int>(unsigned num_items, const char* label) ;
0335 template QUDARAP_API uchar4*    QU::device_alloc<uchar4>(unsigned num_items, const char* label) ;
0336 template QUDARAP_API float4*    QU::device_alloc<float4>(unsigned num_items, const char* label) ;
0337 template QUDARAP_API quad*      QU::device_alloc<quad>(unsigned num_items, const char* label) ;
0338 template QUDARAP_API quad2*     QU::device_alloc<quad2>(unsigned num_items, const char* label) ;
0339 template QUDARAP_API quad4*     QU::device_alloc<quad4>(unsigned num_items, const char* label) ;
0340 template QUDARAP_API quad6*     QU::device_alloc<quad6>(unsigned num_items, const char* label) ;
0341 template QUDARAP_API sevent*    QU::device_alloc<sevent>(unsigned num_items, const char* label) ;
0342 template QUDARAP_API qdebug*    QU::device_alloc<qdebug>(unsigned num_items, const char* label) ;
0343 template QUDARAP_API sstate*    QU::device_alloc<sstate>(unsigned num_items, const char* label) ;
0344 template QUDARAP_API XORWOW*    QU::device_alloc<XORWOW>(unsigned num_items, const char* label) ;
0345 template QUDARAP_API Philox*    QU::device_alloc<Philox>(unsigned num_items, const char* label) ;
0346 
0347 #ifndef PRODUCTION
0348 template QUDARAP_API srec*      QU::device_alloc<srec>(unsigned num_items, const char* label) ;
0349 template QUDARAP_API sseq*      QU::device_alloc<sseq>(unsigned num_items, const char* label) ;
0350 #endif
0351 
0352 template QUDARAP_API sphoton*   QU::device_alloc<sphoton>(unsigned num_items, const char* label) ;
0353 template QUDARAP_API sphotonlite*   QU::device_alloc<sphotonlite>(unsigned num_items, const char* label) ;
0354 
0355 
0356 template<typename T>
0357 T* QU::device_alloc_zero(unsigned num_items, const char* label)
0358 {
0359     size_t size = num_items*sizeof(T) ;
0360 
0361     LOG(LEVEL)
0362         << " num_items " << std::setw(10) << num_items
0363         << " sizeof(T) " << std::setw(10) << sizeof(T)
0364         << " size " << std::setw(10) << size
0365         << " label " << std::setw(15) << label
0366         ;
0367 
0368     LOG_IF(info, MEMCHECK)
0369         << " num_items " << std::setw(10) << num_items
0370         << " sizeof(T) " << std::setw(10) << sizeof(T)
0371         << " size " << std::setw(10) << size
0372         << " label " << std::setw(15) << label
0373         ;
0374 
0375 
0376     alloc_add( label, num_items, sizeof(T) ) ;
0377 
0378     T* d ;
0379     _cudaMalloc( reinterpret_cast<void**>( &d ), size, label );
0380 
0381     int value = 0 ;
0382     QUDA_CHECK( cudaMemset(d, value, size ));
0383 
0384     return d ;
0385 }
0386 
0387 template QUDARAP_API sphoton*   QU::device_alloc_zero<sphoton>(unsigned num_items, const char* label) ;
0388 template QUDARAP_API sphotonlite*   QU::device_alloc_zero<sphotonlite>(unsigned num_items, const char* label) ;
0389 template QUDARAP_API quad2*     QU::device_alloc_zero<quad2>(  unsigned num_items, const char* label) ;
0390 template QUDARAP_API XORWOW*    QU::device_alloc_zero<XORWOW>(  unsigned num_items, const char* label) ;
0391 template QUDARAP_API Philox*    QU::device_alloc_zero<Philox>(  unsigned num_items, const char* label) ;
0392 
0393 #ifndef PRODUCTION
0394 template QUDARAP_API srec*      QU::device_alloc_zero<srec>(   unsigned num_items, const char* label) ;
0395 template QUDARAP_API sseq*      QU::device_alloc_zero<sseq>(   unsigned num_items, const char* label) ;
0396 template QUDARAP_API stag*      QU::device_alloc_zero<stag>(   unsigned num_items, const char* label) ;
0397 template QUDARAP_API sflat*     QU::device_alloc_zero<sflat>(  unsigned num_items, const char* label) ;
0398 #endif
0399 
0400 
0401 
0402 
0403 template<typename T>
0404 void QU::device_memset( T* d, int value, unsigned num_items )
0405 {
0406     size_t size = num_items*sizeof(T) ;
0407 
0408     LOG_IF(info, MEMCHECK)
0409         << " num_items " << std::setw(10) << num_items
0410         << " sizeof(T) " << std::setw(10) << sizeof(T)
0411         << " size " << std::setw(10) << size
0412         ;
0413 
0414     QUDA_CHECK( cudaMemset(d, value, size ));
0415 }
0416 
0417 template QUDARAP_API void     QU::device_memset<int>(int*, int, unsigned ) ;
0418 template QUDARAP_API void     QU::device_memset<quad4>(quad4*, int, unsigned ) ;
0419 template QUDARAP_API void     QU::device_memset<quad6>(quad6*, int, unsigned ) ;
0420 template QUDARAP_API void     QU::device_memset<sphoton>(sphoton*, int, unsigned ) ;
0421 template QUDARAP_API void     QU::device_memset<sphotonlite>(sphotonlite*, int, unsigned ) ;
0422 
0423 
0424 
0425 
0426 
0427 
0428 
0429 
0430 
0431 
0432 template<typename T>
0433 void QU::device_free( T* d)
0434 {
0435     LOG_IF(info, MEMCHECK) ;
0436     // HMM: could use salloc to find the label ?
0437 
0438     QUDA_CHECK( cudaFree(d) );
0439 }
0440 
0441 template QUDARAP_API void   QU::device_free<float>(float*) ;
0442 template QUDARAP_API void   QU::device_free<double>(double*) ;
0443 template QUDARAP_API void   QU::device_free<unsigned>(unsigned*) ;
0444 template QUDARAP_API void   QU::device_free<quad2>(quad2*) ;
0445 template QUDARAP_API void   QU::device_free<quad4>(quad4*) ;
0446 template QUDARAP_API void   QU::device_free<sphoton>(sphoton*) ;
0447 template QUDARAP_API void   QU::device_free<sphotonlite>(sphotonlite*) ;
0448 template QUDARAP_API void   QU::device_free<uchar4>(uchar4*) ;
0449 template QUDARAP_API void   QU::device_free<XORWOW>(XORWOW*) ;
0450 template QUDARAP_API void   QU::device_free<Philox>(Philox*) ;
0451 
0452 
0453 template<typename T>
0454 int QU::copy_device_to_host( T* h, T* d,  unsigned num_items)
0455 {
0456     if( d == nullptr ) std::cerr
0457         << "QU::copy_device_to_host"
0458         << " ERROR : device pointer is null "
0459         << std::endl
0460         ;
0461 
0462     if( d == nullptr ) return 1 ;
0463 
0464     size_t size = num_items*sizeof(T) ;
0465     QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( h ), d , size, cudaMemcpyDeviceToHost ));
0466 
0467     return 0 ;
0468 }
0469 
0470 
0471 template int QU::copy_device_to_host<int>(  int* h, int* d,  unsigned num_items);
0472 template int QU::copy_device_to_host<float>(  float* h, float* d,  unsigned num_items);
0473 template int QU::copy_device_to_host<double>( double* h, double* d,  unsigned num_items);
0474 template int QU::copy_device_to_host<quad>( quad* h, quad* d,  unsigned num_items);
0475 template int QU::copy_device_to_host<quad2>( quad2* h, quad2* d,  unsigned num_items);
0476 template int QU::copy_device_to_host<quad4>( quad4* h, quad4* d,  unsigned num_items);
0477 template int QU::copy_device_to_host<sphoton>( sphoton* h, sphoton* d,  unsigned num_items);
0478 template int QU::copy_device_to_host<sphotonlite>( sphotonlite* h, sphotonlite* d,  unsigned num_items);
0479 template int QU::copy_device_to_host<quad6>( quad6* h, quad6* d,  unsigned num_items);
0480 template int QU::copy_device_to_host<sstate>( sstate* h, sstate* d,  unsigned num_items);
0481 template int QU::copy_device_to_host<XORWOW>( XORWOW* h, XORWOW* d,  unsigned num_items);
0482 template int QU::copy_device_to_host<Philox>( Philox* h, Philox* d,  unsigned num_items);
0483 #ifndef PRODUCTION
0484 template int QU::copy_device_to_host<srec>( srec* h, srec* d,  unsigned num_items);
0485 template int QU::copy_device_to_host<sseq>( sseq* h, sseq* d,  unsigned num_items);
0486 template int QU::copy_device_to_host<stag>( stag* h, stag* d,  unsigned num_items);
0487 template int QU::copy_device_to_host<sflat>( sflat* h, sflat* d,  unsigned num_items);
0488 #endif
0489 
0490 
0491 /**
0492 QU::copy_device_to_host_and_free
0493 ----------------------------------
0494 
0495 * Summary: when you get cudaMemcpy copyback errors look for infinite loops in kernels
0496 * Find the problem by doing things like adding loop limiters
0497 
0498 
0499 Normally the problem is not related to the copying but rather some issue
0500 with the kernel being called. So start by doing "binary" search
0501 simplifying the kernel to find where the issue is.
0502 
0503 When a kernel misbehaves, such as going into an infinite loop for example, the
0504 connection to the GPU will typically timeout. Subsequent attempts to copyback arrays that
0505 should have been written by the kernel would then fail during the cudaMemcpy
0506 presumably because the CUDA context is lost as a result of the timeout making
0507 all the device pointers invalid. The copyback is the usual thing to fail because
0508 it is the normally the first thing to use the stale pointers after the kernel launch.
0509 
0510 
0511 Debug tip 0
0512 ~~~~~~~~~~~~~
0513 
0514 Simply add "return 0" to call with issue, and
0515 progressivley move that forwards to find where
0516 the issue is.
0517 
0518 
0519 Debug tip 1 : check kernel inputs
0520 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0521 
0522 Instead of doing whatever computation in the kernel,
0523 populate the output array with the inputs.
0524 This checks both having expected inputs at the kernel
0525 and the copy out machinery.
0526 
0527 Debug tip 2 : check intermediate kernel results
0528 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
0529 
0530 Intead of doing the full kernel calculation, check the
0531 first half of the calculation by copying intermediate
0532 results into the output array.
0533 
0534 
0535 **/
0536 
0537 template<typename T>
0538 void QU::copy_device_to_host_and_free( T* h, T* d,  unsigned num_items, const char* label)
0539 {
0540     size_t size = num_items*sizeof(T) ;
0541     LOG(LEVEL)
0542         << "copy " << num_items
0543         << " sizeof(T) " << sizeof(T)
0544         << " label " << ( label ? label : "-" )
0545         ;
0546 
0547     QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( h ), d , size, cudaMemcpyDeviceToHost ));
0548     QUDA_CHECK( cudaFree(d) );
0549 }
0550 
0551 
0552 template void QU::copy_device_to_host_and_free<float>(  float* h, float* d,  unsigned num_items, const char* label );
0553 template void QU::copy_device_to_host_and_free<double>( double* h, double* d,  unsigned num_items, const char* label);
0554 template void QU::copy_device_to_host_and_free<quad>( quad* h, quad* d,  unsigned num_items, const char* label);
0555 template void QU::copy_device_to_host_and_free<quad2>( quad2* h, quad2* d,  unsigned num_items, const char* label);
0556 template void QU::copy_device_to_host_and_free<quad4>( quad4* h, quad4* d,  unsigned num_items, const char* label);
0557 template void QU::copy_device_to_host_and_free<sphoton>( sphoton* h, sphoton* d,  unsigned num_items, const char* label);
0558 template void QU::copy_device_to_host_and_free<sphotonlite>( sphotonlite* h, sphotonlite* d,  unsigned num_items, const char* label);
0559 template void QU::copy_device_to_host_and_free<quad6>( quad6* h, quad6* d,  unsigned num_items, const char* label);
0560 template void QU::copy_device_to_host_and_free<sstate>( sstate* h, sstate* d,  unsigned num_items, const char* label);
0561 
0562 
0563 
0564 
0565 
0566 
0567 
0568 
0569 
0570 
0571 
0572 
0573 template<typename T>
0574 void QU::copy_host_to_device( T* d, const T* h, unsigned num_items)
0575 {
0576     size_t size = num_items*sizeof(T) ;
0577     QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( d ), h , size, cudaMemcpyHostToDevice ));
0578 }
0579 
0580 template void QU::copy_host_to_device<float>(    float* d,   const float* h, unsigned num_items);
0581 template void QU::copy_host_to_device<double>(   double* d,  const double* h, unsigned num_items);
0582 template void QU::copy_host_to_device<unsigned>( unsigned* d, const unsigned* h, unsigned num_items);
0583 template void QU::copy_host_to_device<sevent>(   sevent* d,   const sevent* h, unsigned num_items);
0584 template void QU::copy_host_to_device<quad4>(    quad4* d,    const quad4* h, unsigned num_items);
0585 template void QU::copy_host_to_device<sphoton>(  sphoton* d,  const sphoton* h, unsigned num_items);
0586 template void QU::copy_host_to_device<sphotonlite>(  sphotonlite* d,  const sphotonlite* h, unsigned num_items);
0587 template void QU::copy_host_to_device<quad6>(    quad6* d,    const quad6* h, unsigned num_items);
0588 template void QU::copy_host_to_device<quad2>(    quad2* d,    const quad2* h, unsigned num_items);
0589 template void QU::copy_host_to_device<XORWOW>(   XORWOW* d,   const XORWOW* h,   unsigned num_items);
0590 template void QU::copy_host_to_device<Philox>(   Philox* d,   const Philox* h,   unsigned num_items);
0591 
0592 /**
0593 QU::NumItems
0594 ---------------
0595 
0596 Apply heuristics to determine the number of intended GPU buffer items
0597 using the size of the template type and the shape of the NP array.
0598 
0599 **/
0600 
0601 template <typename T>
0602 unsigned QU::NumItems( const NP* a )
0603 {
0604     unsigned num_items = 0 ;
0605 
0606     if( sizeof(T) == sizeof(float)*6*4 )   // looks like quad6
0607     {
0608         if(a->shape.size() == 3 )
0609         {
0610             assert( a->has_shape( -1, 6, 4) );
0611             num_items = a->shape[0] ;
0612         }
0613     }
0614     else if( sizeof(T) == sizeof(float)*4*4 )   // looks like quad4
0615     {
0616         if(a->shape.size() == 3 )
0617         {
0618             assert( a->has_shape( -1, 4, 4) );
0619             num_items = a->shape[0] ;
0620         }
0621         else if(a->shape.size() == 4 )
0622         {
0623             assert( a->shape[2] == 2 && a->shape[3] == 4 );
0624             num_items = a->shape[0]*a->shape[1] ;
0625         }
0626     }
0627     else if( sizeof(T) == sizeof(float)*4*2 ) // looks like quad2
0628     {
0629         if(a->shape.size() == 3 )
0630         {
0631             assert( a->has_shape( -1, 2, 4) );
0632             num_items = a->shape[0] ;
0633         }
0634         else if(a->shape.size() == 4 )
0635         {
0636             assert( a->shape[2] == 2 && a->shape[3] == 4 );
0637             num_items = a->shape[0]*a->shape[1] ;
0638         }
0639     }
0640     return num_items ;
0641 }
0642 
0643 template unsigned QU::NumItems<quad2>(const NP* );
0644 template unsigned QU::NumItems<quad4>(const NP* );
0645 template unsigned QU::NumItems<quad6>(const NP* );
0646 
0647 
0648 /**
0649 QU::copy_host_to_device
0650 ------------------------
0651 
0652 HMM: encapsulating determination of num_items is less useful than
0653 would initially expect because will always need to know
0654 and record the num_items in a shared GPU/CPU location like sevent.
0655 And also will often need to allocate the buffer first too.
0656 
0657 Suggesting should generally use this via QEvt.
0658 
0659 **/
0660 
0661 template <typename T>
0662 unsigned QU::copy_host_to_device( T* d, const NP* a)
0663 {
0664     unsigned num_items = NumItems<T>(a);
0665     if( num_items == 0 )
0666     {
0667         LOG(fatal) << " failed to devine num_items for array " << a->sstr() << " with template type where sizeof(T) " << sizeof(T) ;
0668     }
0669 
0670     if( num_items > 0 )
0671     {
0672         copy_host_to_device( d, (T*)a->bytes(), num_items );
0673     }
0674     return num_items ;
0675 }
0676 
0677 template unsigned QU::copy_host_to_device<quad2>( quad2* , const NP* );
0678 template unsigned QU::copy_host_to_device<quad4>( quad4* , const NP* );
0679 template unsigned QU::copy_host_to_device<quad6>( quad6* , const NP* );
0680 
0681 
0682 
0683 
0684 
0685 /**
0686 QU::ConfigureLaunch
0687 ---------------------
0688 
0689 
0690 
0691 
0692 **/
0693 
0694 void QU::ConfigureLaunch( dim3& numBlocks, dim3& threadsPerBlock, unsigned width, unsigned height ) // static
0695 {
0696     threadsPerBlock.x = 512 ;
0697     threadsPerBlock.y = 1 ;
0698     threadsPerBlock.z = 1 ;
0699 
0700     numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x ;
0701     numBlocks.y = (height + threadsPerBlock.y - 1) / threadsPerBlock.y ;
0702     numBlocks.z = 1 ;
0703 
0704     // hmm this looks to not handle height other than 1
0705 }
0706 
0707 void QU::ConfigureLaunch1D( dim3& numBlocks, dim3& threadsPerBlock, unsigned num, unsigned threads_per_block ) // static
0708 {
0709     threadsPerBlock.x = threads_per_block ;
0710     threadsPerBlock.y = 1 ;
0711     threadsPerBlock.z = 1 ;
0712 
0713     numBlocks.x = (num + threadsPerBlock.x - 1) / threadsPerBlock.x ;
0714     numBlocks.y = 1 ;
0715     numBlocks.z = 1 ;
0716 }
0717 
0718 
0719 
0720 void QU::ConfigureLaunch2D( dim3& numBlocks, dim3& threadsPerBlock, unsigned width, unsigned height ) // static
0721 {
0722     threadsPerBlock.x = 16 ;
0723     threadsPerBlock.y = 16 ;
0724     threadsPerBlock.z = 1 ;
0725 
0726     numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x ;
0727     numBlocks.y = (height + threadsPerBlock.y - 1) / threadsPerBlock.y ;
0728     numBlocks.z = 1 ;
0729 }
0730 
0731 
0732 void QU::ConfigureLaunch16( dim3& numBlocks, dim3& threadsPerBlock ) // static
0733 {
0734     threadsPerBlock.x = 16 ;
0735     threadsPerBlock.y = 1 ;
0736     threadsPerBlock.z = 1 ;
0737 
0738     numBlocks.x = 1 ;
0739     numBlocks.y = 1 ;
0740     numBlocks.z = 1 ;
0741 }
0742 
0743 
0744 std::string QU::Desc(const dim3& d, int w) // static
0745 {
0746     std::stringstream ss ;
0747     ss << "( "
0748         << std::setw(w) << d.x
0749         << " "
0750         << std::setw(w) << d.y
0751         << " "
0752         << std::setw(w) << d.z
0753         << ")"
0754         ;
0755     std::string s = ss.str();
0756     return s ;
0757 }
0758 
0759 std::string QU::DescLaunch( const dim3& numBlocks, const dim3& threadsPerBlock ) // static
0760 {
0761     std::stringstream ss ;
0762     ss
0763         << " numBlocks " << Desc(numBlocks,4)
0764         << " threadsPerBlock " << Desc(threadsPerBlock, 4)
0765         ;
0766     std::string s = ss.str();
0767     return s ;
0768 }
0769