File indexing completed on 2026-04-09 07:49:11
0001 #include <cassert>
0002
0003 #include "NP.hh"
0004 #include "SLOG.hh"
0005
0006 #include "spath.h"
0007 #include "sdirectory.h"
0008 #include "scuda.h"
0009 #include "squad.h"
0010 #include "ssys.h"
0011
0012 #ifndef PRODUCTION
0013 #include "srec.h"
0014 #include "sseq.h"
0015 #endif
0016
0017 #include "sphoton.h"
0018 #include "sphotonlite.h"
0019
0020 #include "sevent.h"
0021 #include "salloc.h"
0022 #include "SEventConfig.hh"
0023
0024 #include "QUDA_CHECK.h"
0025 #include "QU.hh"
0026
0027 #include "curand_kernel.h"
0028 #include "qrng.h"
0029 #include "qsim.h"
0030
0031 #include "qbase.h"
0032 #include "qprop.h"
0033 #include "qpmt.h"
0034 #include "qdebug.h"
0035 #include "qscint.h"
0036 #include "qcerenkov.h"
0037 #include "qcurandwrap.h"
0038 #include "scurandref.h"
0039 #include "qmultifilm.h"
0040
0041
0042 const plog::Severity QU::LEVEL = SLOG::EnvLevel("QU", "DEBUG") ;
0043 bool QU::MEMCHECK = ssys::getenvbool(_MEMCHECK);
0044
0045 salloc* QU::alloc = nullptr ;
0046
0047
0048 void QU::alloc_add(const char* label, uint64_t num_items, uint64_t sizeof_item )
0049 {
0050 if(!alloc) alloc = SEventConfig::ALLOC ;
0051 if(alloc ) alloc->add(label, num_items, sizeof_item );
0052 }
0053
0054
0055 template <typename T>
0056 char QU::typecode()
0057 {
0058 char c = '?' ;
0059 switch(sizeof(T))
0060 {
0061 case 4: c = 'f' ; break ;
0062 case 8: c = 'd' ; break ;
0063 }
0064 return c ;
0065 }
0066
0067 template char QU::typecode<float>() ;
0068 template char QU::typecode<double>() ;
0069
0070
0071 template <typename T>
0072 std::string QU::rng_sequence_name(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ioffset )
0073 {
0074 std::stringstream ss ;
0075 ss << prefix
0076 << "_" << QU::typecode<T>()
0077 << "_ni" << ni
0078 << "_nj" << nj
0079 << "_nk" << nk
0080 << "_ioffset" << std::setw(6) << std::setfill('0') << ioffset
0081 << ".npy"
0082 ;
0083
0084 std::string name = ss.str();
0085 return name ;
0086 }
0087
0088 template std::string QU::rng_sequence_name<float>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ioffset ) ;
0089 template std::string QU::rng_sequence_name<double>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ioffset ) ;
0090
0091
0092
0093 template <typename T>
0094 std::string QU::rng_sequence_reldir(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size )
0095 {
0096 std::stringstream ss ;
0097 ss << prefix
0098 << "_" << QU::typecode<T>()
0099 << "_ni" << ni
0100 << "_nj" << nj
0101 << "_nk" << nk
0102 << "_tranche" << ni_tranche_size
0103 ;
0104
0105 std::string reldir = ss.str();
0106 return reldir ;
0107 }
0108
0109 template std::string QU::rng_sequence_reldir<float>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size ) ;
0110 template std::string QU::rng_sequence_reldir<double>(const char* prefix, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size ) ;
0111
0112
0113
0114
0115
0116
0117
0118
0119
0120
0121
0122
0123 template <typename T>
0124 T* QU::UploadArray(const T* array, unsigned num_items, const char* label )
0125 {
0126 size_t size = num_items*sizeof(T) ;
0127
0128 LOG(LEVEL)
0129 << " num_items " << num_items
0130 << " size " << size
0131 << " label " << ( label ? label : "-" )
0132 ;
0133
0134 LOG_IF(info, MEMCHECK)
0135 << " num_items " << num_items
0136 << " size " << size
0137 << " label " << ( label ? label : "-" )
0138 ;
0139
0140
0141 alloc_add( label, num_items, sizeof(T) ) ;
0142
0143 T* d_array = nullptr ;
0144 QUDA_CHECK( cudaMalloc(reinterpret_cast<void**>( &d_array ), size ));
0145 QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( d_array ), array, size, cudaMemcpyHostToDevice ));
0146 return d_array ;
0147 }
0148
0149
0150
0151 template float* QU::UploadArray<float>(const float* array, unsigned num_items, const char* label ) ;
0152 template double* QU::UploadArray<double>(const double* array, unsigned num_items, const char* label) ;
0153 template unsigned* QU::UploadArray<unsigned>(const unsigned* array, unsigned num_items, const char* label) ;
0154 template int* QU::UploadArray<int>(const int* array, unsigned num_items, const char* label) ;
0155 template quad4* QU::UploadArray<quad4>(const quad4* array, unsigned num_items, const char* label) ;
0156 template sphoton* QU::UploadArray<sphoton>(const sphoton* array, unsigned num_items, const char* label) ;
0157 template sphotonlite* QU::UploadArray<sphotonlite>(const sphotonlite* array, unsigned num_items, const char* label) ;
0158 template quad2* QU::UploadArray<quad2>(const quad2* array, unsigned num_items, const char* label) ;
0159 template XORWOW* QU::UploadArray<XORWOW>(const XORWOW* array, unsigned num_items, const char* label) ;
0160 template Philox* QU::UploadArray<Philox>(const Philox* array, unsigned num_items, const char* label) ;
0161 template qcurandwrap<XORWOW>* QU::UploadArray<qcurandwrap<XORWOW>>(const qcurandwrap<XORWOW>* array, unsigned num_items, const char* label) ;
0162 template scurandref<XORWOW>* QU::UploadArray<scurandref<XORWOW>>(const scurandref<XORWOW>* array, unsigned num_items, const char* label) ;
0163 template qsim* QU::UploadArray<qsim>(const qsim* array, unsigned num_items, const char* label) ;
0164 template qprop<float>* QU::UploadArray<qprop<float>>(const qprop<float>* array, unsigned num_items, const char* label) ;
0165 template qprop<double>* QU::UploadArray<qprop<double>>(const qprop<double>* array, unsigned num_items, const char* label) ;
0166 template qpmt<float>* QU::UploadArray<qpmt<float>>(const qpmt<float>* array, unsigned num_items, const char* label) ;
0167 template qpmt<double>* QU::UploadArray<qpmt<double>>(const qpmt<double>* array, unsigned num_items, const char* label) ;
0168 template qmultifilm* QU::UploadArray<qmultifilm>(const qmultifilm* array, unsigned num_items, const char* label) ;
0169 template qrng<RNG>* QU::UploadArray<qrng<RNG>>(const qrng<RNG>* array, unsigned num_items, const char* label) ;
0170 template qbnd* QU::UploadArray<qbnd>(const qbnd* array, unsigned num_items, const char* label) ;
0171 template sevent* QU::UploadArray<sevent>(const sevent* array, unsigned num_items, const char* label) ;
0172 template qdebug* QU::UploadArray<qdebug>(const qdebug* array, unsigned num_items, const char* label) ;
0173 template qscint* QU::UploadArray<qscint>(const qscint* array, unsigned num_items, const char* label) ;
0174 template qcerenkov* QU::UploadArray<qcerenkov>(const qcerenkov* array, unsigned num_items, const char* label) ;
0175 template qbase* QU::UploadArray<qbase>(const qbase* array, unsigned num_items, const char* label) ;
0176
0177
0178
0179
0180
0181
0182
0183
0184
0185
0186
0187 template <typename T>
0188 T* QU::DownloadArray(const T* d_array, unsigned num_items )
0189 {
0190 T* array = new T[num_items] ;
0191 QUDA_CHECK( cudaMemcpy( array, d_array, sizeof(T)*num_items, cudaMemcpyDeviceToHost ));
0192 return array ;
0193 }
0194
0195
0196 template float* QU::DownloadArray<float>(const float* d_array, unsigned num_items) ;
0197 template unsigned* QU::DownloadArray<unsigned>(const unsigned* d_array, unsigned num_items) ;
0198 template int* QU::DownloadArray<int>(const int* d_array, unsigned num_items) ;
0199 template quad4* QU::DownloadArray<quad4>(const quad4* d_array, unsigned num_items) ;
0200 template quad2* QU::DownloadArray<quad2>(const quad2* d_array, unsigned num_items) ;
0201 template XORWOW* QU::DownloadArray<XORWOW>(const XORWOW* d_array, unsigned num_items) ;
0202 template Philox* QU::DownloadArray<Philox>(const Philox* d_array, unsigned num_items) ;
0203 template qprop<float>* QU::DownloadArray<qprop<float>>(const qprop<float>* d_array, unsigned num_items) ;
0204 template qprop<double>* QU::DownloadArray<qprop<double>>(const qprop<double>* d_array, unsigned num_items) ;
0205
0206
0207 template <typename T>
0208 void QU::Download(std::vector<T>& vec, const T* d_array, unsigned num_items)
0209 {
0210 vec.resize( num_items);
0211 QUDA_CHECK( cudaMemcpy( static_cast<void*>( vec.data() ), d_array, num_items*sizeof(T), cudaMemcpyDeviceToHost));
0212 }
0213
0214
0215 template QUDARAP_API void QU::Download<float>( std::vector<float>& vec, const float* d_array, unsigned num_items);
0216 template QUDARAP_API void QU::Download<unsigned>(std::vector<unsigned>& vec, const unsigned* d_array, unsigned num_items);
0217 template QUDARAP_API void QU::Download<int>( std::vector<int>& vec, const int* d_array, unsigned num_items);
0218 template QUDARAP_API void QU::Download<uchar4>( std::vector<uchar4>& vec, const uchar4* d_array, unsigned num_items);
0219 template QUDARAP_API void QU::Download<float4>( std::vector<float4>& vec, const float4* d_array, unsigned num_items);
0220 template QUDARAP_API void QU::Download<quad4>( std::vector<quad4>& vec, const quad4* d_array, unsigned num_items);
0221
0222
0223
0224 template<typename T>
0225 void QU::device_free_and_alloc(T** dd, unsigned num_items )
0226 {
0227 size_t size = num_items*sizeof(T) ;
0228 LOG_IF(info, MEMCHECK) << " size " << size << " num_items " << num_items ;
0229
0230 QUDA_CHECK( cudaFree( reinterpret_cast<void*>( *dd ) ) );
0231 QUDA_CHECK( cudaMalloc(reinterpret_cast<void**>( dd ), size ));
0232 assert( *dd );
0233 }
0234
0235
0236 template QUDARAP_API void QU::device_free_and_alloc<float>(float** dd, unsigned num_items) ;
0237 template QUDARAP_API void QU::device_free_and_alloc<double>(double** dd, unsigned num_items) ;
0238 template QUDARAP_API void QU::device_free_and_alloc<unsigned>(unsigned** dd, unsigned num_items) ;
0239 template QUDARAP_API void QU::device_free_and_alloc<int>(int** dd, unsigned num_items) ;
0240 template QUDARAP_API void QU::device_free_and_alloc<quad>(quad** dd, unsigned num_items) ;
0241 template QUDARAP_API void QU::device_free_and_alloc<uchar4>(uchar4** dd, unsigned num_items) ;
0242 template QUDARAP_API void QU::device_free_and_alloc<float4>(float4** dd, unsigned num_items) ;
0243 template QUDARAP_API void QU::device_free_and_alloc<quad4>(quad4** dd, unsigned num_items) ;
0244
0245 const char* QU::_cudaMalloc_OOM_NOTES = R"( ;
0246 QU::_cudaMalloc_OOM_NOTES
0247 ==========================
0248
0249 When running with debug arrays, such as the record array, enabled
0250 it is necessary to set max_slot to something reasonable, otherwise with the
0251 default max_slot of zero, it gets set to a high value (eg M197 with 24GB)
0252 appropriate for production running with the available VRAM.
0253
0254 One million is typically reasonable for debugging::
0255
0256 export OPTICKS_MAX_SLOT=M1
0257
0258 )" ;
0259
0260
0261
0262
0263 void QU::_cudaMalloc( void** p2p, size_t size, const char* label )
0264 {
0265 cudaError_t err = cudaMalloc(p2p, size ) ;
0266 if( err != cudaSuccess )
0267 {
0268 const char* out = spath::Resolve("$DefaultOutputDir") ;
0269 salloc* estimate = SEventConfig::AllocEstimate();
0270
0271 std::stringstream ss;
0272 ss << "CUDA call (" << label << " ) failed with error: '"
0273 << cudaGetErrorString( err )
0274 << "' (" __FILE__ << ":" << __LINE__ << ")"
0275 << "\n\n"
0276 << "[SEventConfig::DescEventMode (use of DebugHeavy/DebugLite EventMode with high stats is typical cause of OOM errors)\n"
0277 << SEventConfig::DescEventMode()
0278 << "]SEventConfig::DescEventMode (use of DebugHeavy/DebugLite EventMode with high stats is typical cause of OOM errors)\n"
0279 << "\n\n"
0280 << "[alloc.desc\n"
0281 << ( alloc ? alloc->desc() : "no-alloc" )
0282 << "]alloc.desc\n"
0283 << "\n"
0284 << "[NOTES\n"
0285 << _cudaMalloc_OOM_NOTES
0286 << "]NOTES\n"
0287 << "\n\n"
0288 << "[SEventConfig::AllocEstimate\n"
0289 << ( estimate ? estimate->desc() : "no-estimate" )
0290 << "]SEventConfig::AllocEstimate\n"
0291 << "save salloc record to [" << out << "]\n" ;
0292 ;
0293
0294 std::string msg = ss.str();
0295 LOG(error) << msg ;
0296
0297 sdirectory::MakeDirs(out,0);
0298 alloc->save(out) ;
0299
0300 throw QUDA_Exception( msg.c_str() );
0301 }
0302 }
0303
0304
0305 template<typename T>
0306 T* QU::device_alloc( unsigned num_items, const char* label )
0307 {
0308 size_t size = num_items*sizeof(T) ;
0309
0310 LOG(LEVEL)
0311 << " num_items " << std::setw(10) << num_items
0312 << " size " << std::setw(10) << size
0313 << " label " << std::setw(15) << label
0314 ;
0315
0316 LOG_IF(info, MEMCHECK)
0317 << " num_items " << std::setw(10) << num_items
0318 << " size " << std::setw(10) << size
0319 << " label " << std::setw(15) << label
0320 ;
0321
0322
0323 alloc_add( label, num_items, sizeof(T) ) ;
0324
0325 T* d ;
0326 _cudaMalloc( reinterpret_cast<void**>( &d ), size, label );
0327
0328 return d ;
0329 }
0330
0331 template QUDARAP_API float* QU::device_alloc<float>(unsigned num_items, const char* label) ;
0332 template QUDARAP_API double* QU::device_alloc<double>(unsigned num_items, const char* label) ;
0333 template QUDARAP_API unsigned* QU::device_alloc<unsigned>(unsigned num_items, const char* label) ;
0334 template QUDARAP_API int* QU::device_alloc<int>(unsigned num_items, const char* label) ;
0335 template QUDARAP_API uchar4* QU::device_alloc<uchar4>(unsigned num_items, const char* label) ;
0336 template QUDARAP_API float4* QU::device_alloc<float4>(unsigned num_items, const char* label) ;
0337 template QUDARAP_API quad* QU::device_alloc<quad>(unsigned num_items, const char* label) ;
0338 template QUDARAP_API quad2* QU::device_alloc<quad2>(unsigned num_items, const char* label) ;
0339 template QUDARAP_API quad4* QU::device_alloc<quad4>(unsigned num_items, const char* label) ;
0340 template QUDARAP_API quad6* QU::device_alloc<quad6>(unsigned num_items, const char* label) ;
0341 template QUDARAP_API sevent* QU::device_alloc<sevent>(unsigned num_items, const char* label) ;
0342 template QUDARAP_API qdebug* QU::device_alloc<qdebug>(unsigned num_items, const char* label) ;
0343 template QUDARAP_API sstate* QU::device_alloc<sstate>(unsigned num_items, const char* label) ;
0344 template QUDARAP_API XORWOW* QU::device_alloc<XORWOW>(unsigned num_items, const char* label) ;
0345 template QUDARAP_API Philox* QU::device_alloc<Philox>(unsigned num_items, const char* label) ;
0346
0347 #ifndef PRODUCTION
0348 template QUDARAP_API srec* QU::device_alloc<srec>(unsigned num_items, const char* label) ;
0349 template QUDARAP_API sseq* QU::device_alloc<sseq>(unsigned num_items, const char* label) ;
0350 #endif
0351
0352 template QUDARAP_API sphoton* QU::device_alloc<sphoton>(unsigned num_items, const char* label) ;
0353 template QUDARAP_API sphotonlite* QU::device_alloc<sphotonlite>(unsigned num_items, const char* label) ;
0354
0355
0356 template<typename T>
0357 T* QU::device_alloc_zero(unsigned num_items, const char* label)
0358 {
0359 size_t size = num_items*sizeof(T) ;
0360
0361 LOG(LEVEL)
0362 << " num_items " << std::setw(10) << num_items
0363 << " sizeof(T) " << std::setw(10) << sizeof(T)
0364 << " size " << std::setw(10) << size
0365 << " label " << std::setw(15) << label
0366 ;
0367
0368 LOG_IF(info, MEMCHECK)
0369 << " num_items " << std::setw(10) << num_items
0370 << " sizeof(T) " << std::setw(10) << sizeof(T)
0371 << " size " << std::setw(10) << size
0372 << " label " << std::setw(15) << label
0373 ;
0374
0375
0376 alloc_add( label, num_items, sizeof(T) ) ;
0377
0378 T* d ;
0379 _cudaMalloc( reinterpret_cast<void**>( &d ), size, label );
0380
0381 int value = 0 ;
0382 QUDA_CHECK( cudaMemset(d, value, size ));
0383
0384 return d ;
0385 }
0386
0387 template QUDARAP_API sphoton* QU::device_alloc_zero<sphoton>(unsigned num_items, const char* label) ;
0388 template QUDARAP_API sphotonlite* QU::device_alloc_zero<sphotonlite>(unsigned num_items, const char* label) ;
0389 template QUDARAP_API quad2* QU::device_alloc_zero<quad2>( unsigned num_items, const char* label) ;
0390 template QUDARAP_API XORWOW* QU::device_alloc_zero<XORWOW>( unsigned num_items, const char* label) ;
0391 template QUDARAP_API Philox* QU::device_alloc_zero<Philox>( unsigned num_items, const char* label) ;
0392
0393 #ifndef PRODUCTION
0394 template QUDARAP_API srec* QU::device_alloc_zero<srec>( unsigned num_items, const char* label) ;
0395 template QUDARAP_API sseq* QU::device_alloc_zero<sseq>( unsigned num_items, const char* label) ;
0396 template QUDARAP_API stag* QU::device_alloc_zero<stag>( unsigned num_items, const char* label) ;
0397 template QUDARAP_API sflat* QU::device_alloc_zero<sflat>( unsigned num_items, const char* label) ;
0398 #endif
0399
0400
0401
0402
0403 template<typename T>
0404 void QU::device_memset( T* d, int value, unsigned num_items )
0405 {
0406 size_t size = num_items*sizeof(T) ;
0407
0408 LOG_IF(info, MEMCHECK)
0409 << " num_items " << std::setw(10) << num_items
0410 << " sizeof(T) " << std::setw(10) << sizeof(T)
0411 << " size " << std::setw(10) << size
0412 ;
0413
0414 QUDA_CHECK( cudaMemset(d, value, size ));
0415 }
0416
0417 template QUDARAP_API void QU::device_memset<int>(int*, int, unsigned ) ;
0418 template QUDARAP_API void QU::device_memset<quad4>(quad4*, int, unsigned ) ;
0419 template QUDARAP_API void QU::device_memset<quad6>(quad6*, int, unsigned ) ;
0420 template QUDARAP_API void QU::device_memset<sphoton>(sphoton*, int, unsigned ) ;
0421 template QUDARAP_API void QU::device_memset<sphotonlite>(sphotonlite*, int, unsigned ) ;
0422
0423
0424
0425
0426
0427
0428
0429
0430
0431
0432 template<typename T>
0433 void QU::device_free( T* d)
0434 {
0435 LOG_IF(info, MEMCHECK) ;
0436
0437
0438 QUDA_CHECK( cudaFree(d) );
0439 }
0440
0441 template QUDARAP_API void QU::device_free<float>(float*) ;
0442 template QUDARAP_API void QU::device_free<double>(double*) ;
0443 template QUDARAP_API void QU::device_free<unsigned>(unsigned*) ;
0444 template QUDARAP_API void QU::device_free<quad2>(quad2*) ;
0445 template QUDARAP_API void QU::device_free<quad4>(quad4*) ;
0446 template QUDARAP_API void QU::device_free<sphoton>(sphoton*) ;
0447 template QUDARAP_API void QU::device_free<sphotonlite>(sphotonlite*) ;
0448 template QUDARAP_API void QU::device_free<uchar4>(uchar4*) ;
0449 template QUDARAP_API void QU::device_free<XORWOW>(XORWOW*) ;
0450 template QUDARAP_API void QU::device_free<Philox>(Philox*) ;
0451
0452
0453 template<typename T>
0454 int QU::copy_device_to_host( T* h, T* d, unsigned num_items)
0455 {
0456 if( d == nullptr ) std::cerr
0457 << "QU::copy_device_to_host"
0458 << " ERROR : device pointer is null "
0459 << std::endl
0460 ;
0461
0462 if( d == nullptr ) return 1 ;
0463
0464 size_t size = num_items*sizeof(T) ;
0465 QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( h ), d , size, cudaMemcpyDeviceToHost ));
0466
0467 return 0 ;
0468 }
0469
0470
0471 template int QU::copy_device_to_host<int>( int* h, int* d, unsigned num_items);
0472 template int QU::copy_device_to_host<float>( float* h, float* d, unsigned num_items);
0473 template int QU::copy_device_to_host<double>( double* h, double* d, unsigned num_items);
0474 template int QU::copy_device_to_host<quad>( quad* h, quad* d, unsigned num_items);
0475 template int QU::copy_device_to_host<quad2>( quad2* h, quad2* d, unsigned num_items);
0476 template int QU::copy_device_to_host<quad4>( quad4* h, quad4* d, unsigned num_items);
0477 template int QU::copy_device_to_host<sphoton>( sphoton* h, sphoton* d, unsigned num_items);
0478 template int QU::copy_device_to_host<sphotonlite>( sphotonlite* h, sphotonlite* d, unsigned num_items);
0479 template int QU::copy_device_to_host<quad6>( quad6* h, quad6* d, unsigned num_items);
0480 template int QU::copy_device_to_host<sstate>( sstate* h, sstate* d, unsigned num_items);
0481 template int QU::copy_device_to_host<XORWOW>( XORWOW* h, XORWOW* d, unsigned num_items);
0482 template int QU::copy_device_to_host<Philox>( Philox* h, Philox* d, unsigned num_items);
0483 #ifndef PRODUCTION
0484 template int QU::copy_device_to_host<srec>( srec* h, srec* d, unsigned num_items);
0485 template int QU::copy_device_to_host<sseq>( sseq* h, sseq* d, unsigned num_items);
0486 template int QU::copy_device_to_host<stag>( stag* h, stag* d, unsigned num_items);
0487 template int QU::copy_device_to_host<sflat>( sflat* h, sflat* d, unsigned num_items);
0488 #endif
0489
0490
0491
0492
0493
0494
0495
0496
0497
0498
0499
0500
0501
0502
0503
0504
0505
0506
0507
0508
0509
0510
0511
0512
0513
0514
0515
0516
0517
0518
0519
0520
0521
0522
0523
0524
0525
0526
0527
0528
0529
0530
0531
0532
0533
0534
0535
0536
0537 template<typename T>
0538 void QU::copy_device_to_host_and_free( T* h, T* d, unsigned num_items, const char* label)
0539 {
0540 size_t size = num_items*sizeof(T) ;
0541 LOG(LEVEL)
0542 << "copy " << num_items
0543 << " sizeof(T) " << sizeof(T)
0544 << " label " << ( label ? label : "-" )
0545 ;
0546
0547 QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( h ), d , size, cudaMemcpyDeviceToHost ));
0548 QUDA_CHECK( cudaFree(d) );
0549 }
0550
0551
0552 template void QU::copy_device_to_host_and_free<float>( float* h, float* d, unsigned num_items, const char* label );
0553 template void QU::copy_device_to_host_and_free<double>( double* h, double* d, unsigned num_items, const char* label);
0554 template void QU::copy_device_to_host_and_free<quad>( quad* h, quad* d, unsigned num_items, const char* label);
0555 template void QU::copy_device_to_host_and_free<quad2>( quad2* h, quad2* d, unsigned num_items, const char* label);
0556 template void QU::copy_device_to_host_and_free<quad4>( quad4* h, quad4* d, unsigned num_items, const char* label);
0557 template void QU::copy_device_to_host_and_free<sphoton>( sphoton* h, sphoton* d, unsigned num_items, const char* label);
0558 template void QU::copy_device_to_host_and_free<sphotonlite>( sphotonlite* h, sphotonlite* d, unsigned num_items, const char* label);
0559 template void QU::copy_device_to_host_and_free<quad6>( quad6* h, quad6* d, unsigned num_items, const char* label);
0560 template void QU::copy_device_to_host_and_free<sstate>( sstate* h, sstate* d, unsigned num_items, const char* label);
0561
0562
0563
0564
0565
0566
0567
0568
0569
0570
0571
0572
0573 template<typename T>
0574 void QU::copy_host_to_device( T* d, const T* h, unsigned num_items)
0575 {
0576 size_t size = num_items*sizeof(T) ;
0577 QUDA_CHECK( cudaMemcpy(reinterpret_cast<void*>( d ), h , size, cudaMemcpyHostToDevice ));
0578 }
0579
0580 template void QU::copy_host_to_device<float>( float* d, const float* h, unsigned num_items);
0581 template void QU::copy_host_to_device<double>( double* d, const double* h, unsigned num_items);
0582 template void QU::copy_host_to_device<unsigned>( unsigned* d, const unsigned* h, unsigned num_items);
0583 template void QU::copy_host_to_device<sevent>( sevent* d, const sevent* h, unsigned num_items);
0584 template void QU::copy_host_to_device<quad4>( quad4* d, const quad4* h, unsigned num_items);
0585 template void QU::copy_host_to_device<sphoton>( sphoton* d, const sphoton* h, unsigned num_items);
0586 template void QU::copy_host_to_device<sphotonlite>( sphotonlite* d, const sphotonlite* h, unsigned num_items);
0587 template void QU::copy_host_to_device<quad6>( quad6* d, const quad6* h, unsigned num_items);
0588 template void QU::copy_host_to_device<quad2>( quad2* d, const quad2* h, unsigned num_items);
0589 template void QU::copy_host_to_device<XORWOW>( XORWOW* d, const XORWOW* h, unsigned num_items);
0590 template void QU::copy_host_to_device<Philox>( Philox* d, const Philox* h, unsigned num_items);
0591
0592
0593
0594
0595
0596
0597
0598
0599
0600
0601 template <typename T>
0602 unsigned QU::NumItems( const NP* a )
0603 {
0604 unsigned num_items = 0 ;
0605
0606 if( sizeof(T) == sizeof(float)*6*4 )
0607 {
0608 if(a->shape.size() == 3 )
0609 {
0610 assert( a->has_shape( -1, 6, 4) );
0611 num_items = a->shape[0] ;
0612 }
0613 }
0614 else if( sizeof(T) == sizeof(float)*4*4 )
0615 {
0616 if(a->shape.size() == 3 )
0617 {
0618 assert( a->has_shape( -1, 4, 4) );
0619 num_items = a->shape[0] ;
0620 }
0621 else if(a->shape.size() == 4 )
0622 {
0623 assert( a->shape[2] == 2 && a->shape[3] == 4 );
0624 num_items = a->shape[0]*a->shape[1] ;
0625 }
0626 }
0627 else if( sizeof(T) == sizeof(float)*4*2 )
0628 {
0629 if(a->shape.size() == 3 )
0630 {
0631 assert( a->has_shape( -1, 2, 4) );
0632 num_items = a->shape[0] ;
0633 }
0634 else if(a->shape.size() == 4 )
0635 {
0636 assert( a->shape[2] == 2 && a->shape[3] == 4 );
0637 num_items = a->shape[0]*a->shape[1] ;
0638 }
0639 }
0640 return num_items ;
0641 }
0642
0643 template unsigned QU::NumItems<quad2>(const NP* );
0644 template unsigned QU::NumItems<quad4>(const NP* );
0645 template unsigned QU::NumItems<quad6>(const NP* );
0646
0647
0648
0649
0650
0651
0652
0653
0654
0655
0656
0657
0658
0659
0660
0661 template <typename T>
0662 unsigned QU::copy_host_to_device( T* d, const NP* a)
0663 {
0664 unsigned num_items = NumItems<T>(a);
0665 if( num_items == 0 )
0666 {
0667 LOG(fatal) << " failed to devine num_items for array " << a->sstr() << " with template type where sizeof(T) " << sizeof(T) ;
0668 }
0669
0670 if( num_items > 0 )
0671 {
0672 copy_host_to_device( d, (T*)a->bytes(), num_items );
0673 }
0674 return num_items ;
0675 }
0676
0677 template unsigned QU::copy_host_to_device<quad2>( quad2* , const NP* );
0678 template unsigned QU::copy_host_to_device<quad4>( quad4* , const NP* );
0679 template unsigned QU::copy_host_to_device<quad6>( quad6* , const NP* );
0680
0681
0682
0683
0684
0685
0686
0687
0688
0689
0690
0691
0692
0693
0694 void QU::ConfigureLaunch( dim3& numBlocks, dim3& threadsPerBlock, unsigned width, unsigned height )
0695 {
0696 threadsPerBlock.x = 512 ;
0697 threadsPerBlock.y = 1 ;
0698 threadsPerBlock.z = 1 ;
0699
0700 numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x ;
0701 numBlocks.y = (height + threadsPerBlock.y - 1) / threadsPerBlock.y ;
0702 numBlocks.z = 1 ;
0703
0704
0705 }
0706
0707 void QU::ConfigureLaunch1D( dim3& numBlocks, dim3& threadsPerBlock, unsigned num, unsigned threads_per_block )
0708 {
0709 threadsPerBlock.x = threads_per_block ;
0710 threadsPerBlock.y = 1 ;
0711 threadsPerBlock.z = 1 ;
0712
0713 numBlocks.x = (num + threadsPerBlock.x - 1) / threadsPerBlock.x ;
0714 numBlocks.y = 1 ;
0715 numBlocks.z = 1 ;
0716 }
0717
0718
0719
0720 void QU::ConfigureLaunch2D( dim3& numBlocks, dim3& threadsPerBlock, unsigned width, unsigned height )
0721 {
0722 threadsPerBlock.x = 16 ;
0723 threadsPerBlock.y = 16 ;
0724 threadsPerBlock.z = 1 ;
0725
0726 numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x ;
0727 numBlocks.y = (height + threadsPerBlock.y - 1) / threadsPerBlock.y ;
0728 numBlocks.z = 1 ;
0729 }
0730
0731
0732 void QU::ConfigureLaunch16( dim3& numBlocks, dim3& threadsPerBlock )
0733 {
0734 threadsPerBlock.x = 16 ;
0735 threadsPerBlock.y = 1 ;
0736 threadsPerBlock.z = 1 ;
0737
0738 numBlocks.x = 1 ;
0739 numBlocks.y = 1 ;
0740 numBlocks.z = 1 ;
0741 }
0742
0743
0744 std::string QU::Desc(const dim3& d, int w)
0745 {
0746 std::stringstream ss ;
0747 ss << "( "
0748 << std::setw(w) << d.x
0749 << " "
0750 << std::setw(w) << d.y
0751 << " "
0752 << std::setw(w) << d.z
0753 << ")"
0754 ;
0755 std::string s = ss.str();
0756 return s ;
0757 }
0758
0759 std::string QU::DescLaunch( const dim3& numBlocks, const dim3& threadsPerBlock )
0760 {
0761 std::stringstream ss ;
0762 ss
0763 << " numBlocks " << Desc(numBlocks,4)
0764 << " threadsPerBlock " << Desc(threadsPerBlock, 4)
0765 ;
0766 std::string s = ss.str();
0767 return s ;
0768 }
0769