eic-opticks/qudarap/QSim.cc

0001
0002 #include <csignal>
0003
0004 #include "SLOG.hh"
0005
0006 #include "ssys.h"
0007 #include "sstamp.h"
0008 #include "spath.h"
0009 #include "SProf.hh"
0010
0011 #include "SComp.h"
0012 #include "SEvt.hh"
0013 #include "SSim.hh"
0014 #include "scuda.h"
0015 #include "squad.h"
0016 #include "salloc.h"
0017 #include "SEvent.hh"
0018 #include "SEventConfig.hh"
0019
0020 //#include "SCSGOptiX.h"
0021 #include "SSimulator.h"
0022
0023 #include "SGenstep.h"
0024 #include "sslice.h"
0025
0026 #include "NP.hh"
0027 #include "QUDA_CHECK.h"
0028 #include "QU.hh"
0029
0030 #include "qrng.h"
0031 #include "qsim.h"
0032 #include "qdebug.h"
0033
0034 #include "QBase.hh"
0035 #include "QEvt.hh"
0036 #include "QRng.hh"
0037 #include "QTex.hh"
0038 #include "QScint.hh"
0039 #include "QCerenkov.hh"
0040 #include "QBnd.hh"
0041 #include "QProp.hh"
0042 #include "QMultiFilm.hh"
0043 #include "QEvt.hh"
0044 #include "QOptical.hh"
0045 #include "QSimLaunch.hh"
0046 #include "QDebug.hh"
0047 #include "QPMT.hh"
0048
0049 #include "QSim.hh"
0050
0051 const plog::Severity QSim::LEVEL = SLOG::EnvLevel("QSim", "DEBUG");
0052
0053 const bool  QSim::REQUIRE_PMT = ssys::getenvbool(_QSim__REQUIRE_PMT);
0054 const int   QSim::SAVE_IGS_EVENTID = ssys::getenvint(_QSim__SAVE_IGS_EVENTID,-1) ;
0055 const char* QSim::SAVE_IGS_PATH = ssys::getenvvar(_QSim__SAVE_IGS_PATH, "$TMP/.opticks/igs.npy");
0056 const bool  QSim::CONCAT = ssys::getenvbool(_QSim__CONCAT);
0057 const bool  QSim::ALLOC  = ssys::getenvbool(_QSim__ALLOC);
0058
0059
0060
0061 QSim* QSim::INSTANCE = nullptr ;
0062 QSim* QSim::Get(){ return INSTANCE ; }
0063
0064 QSim* QSim::Create()
0065 {
0066     LOG_IF(fatal, INSTANCE != nullptr) << " a QSim INSTANCE already exists " ;
0067     assert( INSTANCE == nullptr ) ;
0068     return new QSim  ;
0069 }
0070
0071
0072
0073
0074 /**
0075 QSim::UploadComponents
0076 -----------------------
0077
0078 Canonical invokation from CSGOptiX::InitSim by CSGOptiX::Create
0079
0080 As the QBase instanciation done by QSim::UploadComponents
0081 is typically the first connection to the GPU device
0082 by an Opticks process it is prone to CUDA latency of
0083 order 1.5s per GPU if nvidia-persistenced is not running.
0084 Start the daemon to avoid this latency following:
0085
0086 * https://docs.nvidia.com/deploy/driver-persistence/index.html
0087
0088 Essentially::
0089
0090     N[blyth@localhost ~]$ sudo su
0091     N[root@localhost blyth]$ mkdir -p /var/run/nvidia-persistenced
0092     N[root@localhost blyth]$ chown blyth:blyth /var/run/nvidia-persistenced
0093     N[root@localhost blyth]$ which nvidia-persistenced
0094     /bin/nvidia-persistenced
0095     N[root@localhost blyth]$ nvidia-persistenced --user blyth
0096     N[root@localhost blyth]$ ls /var/run/nvidia-persistenced/
0097     nvidia-persistenced.pid  socket
0098
0099
0100 QSim::UploadComponents is invoked for example by CSGOptiX/tests/CSGOptiXSimtraceTest.cc
0101 prior to instanciating CSGOptiX
0102
0103 Uploading components is a once only action for a geometry, encompassing:
0104
0105 * random states
0106 * scintillation textures
0107 * boundary textures
0108 * property arrays
0109
0110 It is the simulation physics equivalent of uploading the CSGFoundry geometry.
0111
0112 The components are managed by separate singleton instances
0113 that subsequent QSim instanciation collects together.
0114 This structure is used to allow separate testing.
0115
0116 **/
0117
0118 void QSim::UploadComponents( const SSim* ssim  )
0119 {
0120     LOG(LEVEL) << "[ ssim " << ssim ;
0121     if(getenv("QSim__UploadComponents_SIGINT")) std::raise(SIGINT);
0122
0123     LOG(LEVEL) << "[ new QBase" ;
0124     QBase* base = new QBase ;
0125     LOG(LEVEL) << "] new QBase : latency here of about 0.3s from first device access, if latency of >1s need to start nvidia-persistenced " ;
0126     LOG(LEVEL) << base->desc();
0127
0128
0129     unsigned skipahead_event_offset = SEventConfig::EventSkipahead()  ;
0130     LOG(LEVEL) << "[ new QRng skipahead_event_offset : " << skipahead_event_offset << " " << SEventConfig::kEventSkipahead ;
0131     QRng* rng = new QRng(skipahead_event_offset)  ;  // loads and uploads RNG
0132     LOG(LEVEL) << "] new QRng " << rng->desc()  ;
0133
0134     LOG(LEVEL) << rng->desc();
0135
0136     const NP* optical = ssim->get(snam::OPTICAL);
0137     const NP* bnd = ssim->get(snam::BND);
0138
0139     if( optical == nullptr && bnd == nullptr )
0140     {
0141         LOG(error) << " optical and bnd null  snam::OPTICAL " << snam::OPTICAL << " snam::BND " << snam::BND  ;
0142     }
0143     else
0144     {
0145        // note that QOptical and QBnd are tightly coupled, perhaps add constraints to tie them together
0146         QOptical* qopt = new QOptical(optical);
0147         LOG(LEVEL) << qopt->desc();
0148
0149         QBnd* qbnd = new QBnd(bnd); // boundary texture with standard domain, used for standard fast property lookup
0150         LOG(LEVEL) << qbnd->desc();
0151     }
0152
0153     QDebug* debug_ = new QDebug ;
0154     LOG(LEVEL) << debug_->desc() ;
0155
0156     const NP* propcom = ssim->get(snam::PROPCOM);
0157     if( propcom )
0158     {
0159         LOG(LEVEL) << "[ QProp " ;
0160         QProp<float>* prop = new QProp<float>(propcom) ;
0161         // property interpolation with per-property domains, eg used for Cerenkov RINDEX sampling
0162         LOG(LEVEL) << "] QProp " ;
0163         LOG(LEVEL) << prop->desc();
0164     }
0165     else
0166     {
0167         LOG(LEVEL) << "  propcom null, snam::PROPCOM " <<  snam::PROPCOM ;
0168     }
0169
0170
0171     const NP* icdf = ssim->get(snam::ICDF);
0172     if( icdf == nullptr )
0173     {
0174         LOG(error) << " icdf null, snam::ICDF " << snam::ICDF ;
0175     }
0176     else
0177     {
0178         unsigned hd_factor = 20u ;  // 0,10,20
0179         QScint* scint = new QScint( icdf, hd_factor); // custom high-definition inverse CDF for scintillation generation
0180         LOG(LEVEL) << scint->desc();
0181     }
0182
0183
0184     // TODO: make this more like the others : acting on the available inputs rather than the mode
0185     bool is_simtrace = SEventConfig::IsRGModeSimtrace() ;
0186     if(is_simtrace == false )
0187     {
0188         QCerenkov* cerenkov = new QCerenkov  ;
0189         LOG(LEVEL) << cerenkov->desc();
0190     }
0191     else
0192     {
0193         LOG(LEVEL) << " skip QCerenkov for simtrace running " ;
0194     }
0195
0196
0197
0198
0199     const NPFold* spmt_f = ssim->get_spmt_f() ;
0200     QPMT<float>* qpmt = spmt_f ? new QPMT<float>(spmt_f) : nullptr ;
0201
0202     bool has_PMT = spmt_f != nullptr && qpmt != nullptr ;
0203     bool MISSING_PMT = REQUIRE_PMT == true && has_PMT == false ;
0204
0205     LOG_IF(fatal, MISSING_PMT )
0206         << " MISSING_PMT "
0207         << " has_PMT " << ( has_PMT ? "YES" : "NO " )
0208         << " REQUIRE_PMT " << ( REQUIRE_PMT ? "YES" : "NO " )
0209         << " MISSING_PMT " << ( MISSING_PMT ? "YES" : "NO " )
0210         << " spmt_f " << ( spmt_f ? "YES" : "NO " )
0211         << " qpmt " << ( qpmt ? "YES" : "NO " )
0212         ;
0213
0214     assert(MISSING_PMT == false) ;
0215     if(MISSING_PMT)  std::raise(SIGINT);
0216
0217
0218
0219     LOG(LEVEL)
0220         << QPMT<float>::Desc()
0221         << std::endl
0222         << " spmt_f " << ( spmt_f ? "YES" : "NO " )
0223         << " qpmt " << ( qpmt ? "YES" : "NO " )
0224         ;
0225
0226
0227
0228     const NP* multifilm = ssim->get_extra(snam::MULTIFILM);
0229     if(multifilm == nullptr)
0230     {
0231         LOG(LEVEL) << " multifilm null, snam::MULTIFILM " << snam::MULTIFILM ;
0232     }
0233     else
0234     {
0235         QMultiFilm* mul = new QMultiFilm( multifilm );
0236         LOG(LEVEL) << mul->desc();
0237     }
0238     LOG(LEVEL) << "] ssim " << ssim ;
0239
0240
0241
0242 }
0243
0244
0245
0246
0247
0248
0249 /**
0250 QSim:::QSim
0251 -------------
0252
0253 Canonical instance is instanciated with CSGOptiX::CSGOptiX in sim mode.
0254 Notice how the instanciation pulls together device pointers from
0255 the constituents into the CPU side *sim* and then uploads that to *d_sim*
0256 which is available as *sim* GPU side.
0257
0258 Prior to instanciating QSim invoke QSim::Init to prepare the
0259 singleton components.
0260
0261 **/
0262
0263 QSim::QSim()
0264     :
0265     base(QBase::Get()),
0266     qev(new QEvt),
0267     sev(qev->sev),
0268     rng(QRng::Get()),
0269     scint(QScint::Get()),
0270     cerenkov(QCerenkov::Get()),
0271     bnd(QBnd::Get()),
0272     debug_(QDebug::Get()),
0273     prop(QProp<float>::Get()),
0274     pmt(QPMT<float>::Get()),
0275     multifilm(QMultiFilm::Get()),
0276     sim(nullptr),
0277     d_sim(nullptr),
0278     dbg(debug_ ? debug_->dbg : nullptr),
0279     d_dbg(debug_ ? debug_->d_dbg : nullptr),
0280     cx(nullptr)
0281 {
0282     LOG(LEVEL) << desc() ;
0283     init();
0284 }
0285
0286
0287 /**
0288 QSim::init
0289 ------------
0290
0291 *sim* (qsim.h) is a host side instance that is populated
0292 with device side pointers and handles and then uploaded
0293 to the device *d_sim*.
0294
0295 Many device pointers and handles are then accessible from
0296 the qsim.h instance at the cost of only a single launch
0297 parameter argument: the d_sim pointer
0298 or with optix launches a single Params member.
0299
0300 The advantage of this approach is it avoids kernel
0301 launches having very long argument lists and provides a natural
0302 place (qsim.h) to add GPU side functionality.
0303
0304 **/
0305
0306
0307 void QSim::init()
0308 {
0309     sim = new qsim ;
0310     sim->base = base ? base->d_base : nullptr ;
0311     sim->evt = qev ? qev->getDevicePtr() : nullptr ;
0312     //sim->rng_state = rng ? rng->qr->uploaded_states : nullptr ;
0313     sim->rng = rng ? rng->d_qr : nullptr ;
0314
0315     sim->bnd = bnd ? bnd->d_qb : nullptr ;
0316     sim->multifilm = multifilm ? multifilm->d_multifilm : nullptr ;
0317     sim->cerenkov = cerenkov ? cerenkov->d_cerenkov : nullptr ;
0318     sim->scint = scint ? scint->d_scint : nullptr ;
0319     sim->pmt = pmt ? pmt->d_pmt : nullptr ;
0320
0321
0322     bool has_PMT = pmt != nullptr && sim->pmt != nullptr ;
0323     bool REQUIRE_PMT = ssys::getenvbool(_QSim__REQUIRE_PMT);
0324     bool MISSING_PMT = REQUIRE_PMT == true && has_PMT == false ;
0325
0326     LOG(LEVEL)
0327         << " MISSING_PMT " << ( MISSING_PMT ? "YES" : "NO " )
0328         << " has_PMT " << ( has_PMT ? "YES" : "NO " )
0329         << " QSim::pmt " << ( pmt ? "YES" : "NO " )
0330         << " QSim::pmt->d_pmt " << ( sim->pmt ? "YES" : "NO " )
0331         << " [" << _QSim__REQUIRE_PMT << "] " << ( REQUIRE_PMT ? "YES" : "NO " )
0332         ;
0333
0334     LOG_IF(fatal, MISSING_PMT )
0335         << " MISSING_PMT ABORT "
0336         << " MISSING_PMT " << ( MISSING_PMT ? "YES" : "NO " )
0337         << " has_PMT " << ( has_PMT ? "YES" : "NO " )
0338         << " QSim::pmt " << ( pmt ? "YES" : "NO " )
0339         << " QSim::pmt->d_pmt " << ( sim->pmt ? "YES" : "NO " )
0340         << " [" << _QSim__REQUIRE_PMT << "] " << ( REQUIRE_PMT ? "YES" : "NO " )
0341         ;
0342
0343     assert(MISSING_PMT == false) ;
0344     if(MISSING_PMT)  std::raise(SIGINT);
0345
0346     d_sim = QU::UploadArray<qsim>(sim, 1, "QSim::init.sim" );
0347
0348     INSTANCE = this ;
0349     LOG(LEVEL) << desc() ;
0350     LOG(LEVEL) << descComponents() ;
0351 }
0352
0353 /**
0354 QSim::setLauncher
0355 ------------------
0356
0357 Formerly used SCSGOptiX
0358
0359 **/
0360 void QSim::setLauncher(SSimulator* cx_ )
0361 {
0362     cx = cx_ ;
0363 }
0364
0365
0366 /**
0367 QSim::post_launch
0368 --------------------
0369
0370 launch is async, so need to sync before can gather ?
0371 HOW ELSE TO DO THIS : LIKELY BIG PERFORMANCE EFFECT
0372
0373 cudaDeviceSynchronize already done by CSGOptiX::launch via CUDA_SYNC_CHECK see CSG/CUDA_CHECK.h
0374
0375 void QSim::post_launch()
0376 {
0377     cudaDeviceSynchronize();
0378 }
0379 **/
0380
0381
0382 /**
0383 QSim::simulate
0384 ---------------
0385
0386 This expects gensteps to have been collected into SEvt vectors of quad6
0387 prior to being called.
0388
0389 Now only one canonical invokations, the higher level G4CXOpticks no longer
0390 directly uses QSim:
0391
0392 1. CSGOptiX::simulate for pure CSGOptiX level testing such as by ~/o/cxs_min.sh
0393
0394
0395 Collected genstep are uploaded and the CSGOptiX kernel is launched to generate and propagate.
0396
0397 NB the surprising fact that this calls CSGOptiX::simulate_launch (using a protocol),
0398 that seems funny dependency-wise but its needed for genstep preparation prior to
0399 the launch.
0400
0401 ::
0402
0403     QSim::simulate
0404
0405        EGPU.SEvt::beginOfEvent
0406
0407        determine slices over gensteps depending on VRAM
0408
0409        for each slice
0410
0411            QEvt::setGenstepUpload_NP
0412
0413            SSimulator::simulate_launch
0414
0415            SEvt::gather into *fold* below *topfold*
0416
0417        end for each slice
0418
0419        (NPFold)topfold->concat joining all arrays from the per-launch *fold* into the one *topfold*
0420
0421        EGPU.SEvt::endOfEvent [only when reset:true, not so in OJ running as need to defer until after hits collected]
0422
0423
0424 **/
0425
0426 bool QSim::KEEP_SUBFOLD = ssys::getenvbool(QSim__simulate_KEEP_SUBFOLD);
0427
0428 double QSim::simulate(int eventID, bool reset_)
0429 {
0430     SProf::SetTag(eventID, "A%0.3d_" ) ;
0431
0432     assert( SEventConfig::IsRGModeSimulate() );
0433
0434     //cudaStream_t stream ;  cudaStreamCreate(&stream);
0435     cudaStream_t stream = 0 ;
0436
0437
0438     int64_t tot_ph = 0 ;
0439
0440     double tot_dt = 0. ;
0441
0442     int64_t tot_idt = 0 ;
0443     int64_t tot_gdt = 0 ;
0444
0445     int64_t t_HEAD = SProf::Add("QSim__simulate_HEAD");
0446
0447     LOG_IF(info, SEvt::LIFECYCLE) << "[ eventID " << eventID ;
0448     if( qev == nullptr ) return -1. ;
0449
0450
0451     sev->beginOfEvent(eventID);  // set SEvt index and tees up frame gensteps for simtrace and input photon simulate running
0452
0453     NP* igs = sev->makeGenstepArrayFromVector();
0454
0455     MaybeSaveIGS(eventID, igs);
0456
0457     std::vector<sslice> igs_slice ;
0458     int64_t tot_ph_0 = SGenstep::GetGenstepSlices( igs_slice, igs, SEventConfig::MaxSlot() );
0459
0460     //bool xxl = tot_ph_0 > SGenstep::MAX_SLOT_PER_SLICE ;
0461     bool xxl = tot_ph_0 > 100*M ;
0462
0463     int num_slice = igs_slice.size();
0464
0465     LOG(xxl ? info : LEVEL)
0466         << " eventID " << std::setw(6) << eventID
0467         << " igs " << ( igs ? igs->sstr() : "-" )
0468         << " tot_ph_0 " << tot_ph_0
0469         << " tot_ph_0/M " << tot_ph_0/M
0470         << " xxl " << ( xxl ? "YES" : "NO " )
0471         << " MaxSlot " << SEventConfig::MaxSlot()
0472         << " MaxSlot/M " << SEventConfig::MaxSlot()/M
0473         << " sslice::Desc(igs_slice)\n"
0474         << sslice::Desc(igs_slice)
0475         << " num_slice " << num_slice
0476         ;
0477
0478
0479     int64_t t_LBEG = SProf::Add("QSim__simulate_LBEG");
0480
0481     for(int i=0 ; i < num_slice ; i++)
0482     {
0483         SProf::Add("QSim__simulate_PRUP");
0484
0485         const sslice& sl = igs_slice[i] ;
0486
0487         LOG(LEVEL) << sl.idx_desc(i) ;
0488
0489         int rc = qev->setGenstepUpload_NP(igs, &sl ) ;
0490         LOG_IF(error, rc != 0) << " QEvt::setGenstep ERROR : have qev but no gensteps collected : will skip cx.simulate " ;
0491
0492         LOG_IF(info, ALLOC)
0493             << " [" << _QSim__ALLOC << "] "
0494             << " i " << std::setw(5) << i
0495             << " SEventConfig::ALLOC " << ( SEventConfig::ALLOC  ? "YES" : "NO " )
0496             << ( SEventConfig::ALLOC ? SEventConfig::ALLOC->desc() : "-" )
0497             ;
0498
0499
0500         SProf::Add("QSim__simulate_PREL");
0501
0502         sev->t_PreLaunch = sstamp::Now() ;
0503
0504         double dt = rc == 0 && cx != nullptr ? cx->simulate_launch() : -1. ;  //SSimulator protocol
0505
0506         sev->t_PostLaunch = sstamp::Now() ;
0507         sev->t_Launch = dt ;
0508
0509         tot_idt += ( sev->t_PostLaunch - sev->t_PreLaunch ) ;
0510         tot_dt += dt ;
0511         tot_ph += sl.ph_count ;
0512
0513         LOG( xxl ? info : LEVEL )
0514             << " eventID " << eventID
0515             << " xxl " << ( xxl ? "YES" : "NO " )
0516             << " i " << std::setw(4) << i
0517             << " dt " << std::setw(11) << std::fixed << std::setprecision(6) << dt
0518             << " slice " << sl.idx_desc(i)
0519             ;
0520
0521         int64_t t_POST = SProf::Add("QSim__simulate_POST");
0522
0523         sev->gather();  // gather into *fold* just added to *topfold*
0524
0525         int64_t t_DOWN = SProf::Add("QSim__simulate_DOWN");
0526
0527         tot_gdt += ( t_DOWN - t_POST ) ;
0528     }
0529
0530
0531     size_t max_slot_M = SEventConfig::MaxSlot()/M;
0532     std::string anno = SProf::Annotation("slice",num_slice, "max_slot_M", max_slot_M);
0533     int64_t t_LEND = SProf::Add("QSim__simulate_LEND", anno.c_str());
0534
0535     std::stringstream ss ;
0536     std::ostream* out = CONCAT ? &ss : nullptr ;
0537     int concat_rc = sev->topfold->concat(out);
0538
0539     LOG_IF(info, CONCAT) << ss.str() ;
0540     LOG_IF(fatal, concat_rc != 0) << " sev->topfold->concat FAILED " ;
0541     assert(concat_rc == 0);
0542
0543     bool has_hlm = sev->topfold->has_key(SComp::HITLITEMERGED_);
0544     bool has_hm  = sev->topfold->has_key(SComp::HITMERGED_);
0545     bool do_final_merge = num_slice > 1 && ( has_hlm || has_hm ) ;
0546     LOG(LEVEL)
0547          << " num_slice " << num_slice
0548          << " has_hm " << ( has_hm ? "YES" : "NO " )
0549          << " has_hlm " << ( has_hlm ? "YES" : "NO " )
0550          << " do_final_merge " << ( do_final_merge ? "YES" : "NO " )
0551          ;
0552     if(do_final_merge) simulate_final_merge(tot_ph, stream);
0553
0554
0555     if(!KEEP_SUBFOLD) sev->topfold->clear_subfold();
0556
0557     int64_t t_PCAT = SProf::Add("QSim__simulate_PCAT");
0558
0559     int tot_ht = sev->getNumHit() ;  // NB from fold, so requires hits array gathering to be configured to get non-zero
0560     std::string counts = sev->getCounts();  // collect counts before reset
0561
0562     LOG_IF(info, SEvt::MINIMAL)
0563         << " eventID " << eventID
0564         << " tot_dt " << std::setw(11) << std::fixed << std::setprecision(6) << tot_dt
0565         << " tot_ph " << std::setw(10) << tot_ph
0566         << " tot_ph/M " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_ph)/float(M)
0567         << " tot_ht " << std::setw(10) << tot_ht
0568         << " tot_ht/M " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_ht)/float(M)
0569         << " tot_ht/tot_ph " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_ht)/float(tot_ph)
0570         << " reset_ " << ( reset_ ? "YES" : "NO " )
0571         ;
0572
0573
0574     assert( tot_ph == tot_ph_0 );
0575
0576     int64_t t_BRES  = SProf::Add("QSim__simulate_BRES", counts.c_str() );
0577     if(reset_) reset(eventID) ;
0578
0579     int64_t t_TAIL  = SProf::Add("QSim__simulate_TAIL");
0580
0581     SProf::Write(); // per-event write, so have something in case of crash
0582
0583     LOG_IF(info, SEvt::MINTIME) << "\n"
0584         << SEvt::SEvt__MINTIME
0585         << "\n"
0586         << " (TAIL - HEAD)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float( t_TAIL - t_HEAD )/M
0587         << " (head to tail of QSim::simulate method) "
0588         << "\n"
0589         << " (LEND - LBEG)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float( t_LEND - t_LBEG )/M
0590         << " (multilaunch loop begin to end) "
0591         << "\n"
0592         << " (PCAT - LEND)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float( t_PCAT - t_LEND )/M
0593         << " (topfold concat and clear subfold) "
0594         << "\n"
0595         << " (TAIL - BRES)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float( t_TAIL - t_BRES )/M
0596         << " (QSim::reset which saves hits) "
0597         << "\n"
0598         << " tot_idt/M       " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_idt)/M
0599         << " (sum of kernel execution int64_t stamp differences in microseconds)"
0600         << "\n"
0601         << " tot_dt          " << std::setw(10) << std::fixed << std::setprecision(6) << tot_dt
0602         << " int(tot_dt*M)   " << std::setw(10) << int64_t(tot_dt*M)
0603         << " (sum of kernel execution double chrono stamp differences in seconds, and scaled to ms) "
0604         << "\n"
0605         << " tot_gdt/M       " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_gdt)/M
0606         << " (sum of SEvt::gather int64_t stamp differences in microseconds)"
0607         << "\n"
0608         ;
0609
0610     return tot_dt ;
0611 }
0612
0613
0614 /**
0615 QSim::simulate_final_merge
0616 ---------------------------
0617
0618 * NB this normally only runs for large events with more
0619   photons than fit in VRAM that cause multiple launches
0620   to be done
0621
0622 * requires topfold to have "hitlitemerged" array,
0623   expected to be the result of simple concatenation
0624   of individual launch "hitlitemerged" arrays
0625
0626 * does CUDA thrust implemented hitlitemerged final merging
0627
0628
0629 TODO: use QEvt::FinalMerge_async once that makes sense
0630
0631 **/
0632
0633 void QSim::simulate_final_merge(int64_t tot_ph, cudaStream_t stream)
0634 {
0635     bool has_hlm = sev->topfold->has_key(SComp::HITLITEMERGED_);
0636     bool has_hm  = sev->topfold->has_key(SComp::HITMERGED_);
0637
0638     if( has_hlm )
0639     {
0640         const NP* hlm = sev->topfold->get(SComp::HITLITEMERGED_);
0641         NP*       fin = QEvt::FinalMerge<sphotonlite>(hlm, stream);
0642
0643         float     hlm_frac = float(hlm->num_items())/float(tot_ph) ;
0644         float     fin_frac = float(fin->num_items())/float(hlm->num_items()) ;
0645
0646         std::stringstream ss ;
0647         ss
0648             << " tot_ph " << tot_ph
0649             << " hlm " << ( hlm ? hlm->sstr() : "-" )
0650             << " fin " << ( fin ? fin->sstr() : "-" )
0651             << " hlm/tot " << std::setw(7) << std::fixed << std::setprecision(4) << hlm_frac
0652             << " fin/hlm " << std::setw(7) << std::fixed << std::setprecision(4) << fin_frac
0653             ;
0654
0655         std::string note = ss.str();
0656         fin->set_meta<std::string>("QSim__simulate_final_merge", note );
0657
0658         sev->topfold->set(SComp::HITLITEMERGED_, fin );
0659
0660         LOG(info) << note ;
0661     }
0662     if( has_hm )
0663     {
0664         const NP* hm = sev->topfold->get(SComp::HITMERGED_);
0665         NP*       fi = QEvt::FinalMerge<sphoton>(hm, stream);
0666
0667         float     hm_frac = float(hm->num_items())/float(tot_ph) ;
0668         float     fi_frac = float(fi->num_items())/float(hm->num_items()) ;
0669
0670         std::stringstream ss ;
0671         ss
0672             << " tot_ph " << tot_ph
0673             << " hm " << ( hm ? hm->sstr() : "-" )
0674             << " fi " << ( fi ? fi->sstr() : "-" )
0675             << " hm/tot " << std::setw(7) << std::fixed << std::setprecision(4) << hm_frac
0676             << " fi/hm "  << std::setw(7) << std::fixed << std::setprecision(4) << fi_frac
0677             ;
0678
0679         std::string note = ss.str();
0680         fi->set_meta<std::string>("QSim__simulate_final_merge", note );
0681
0682         sev->topfold->set(SComp::HITMERGED_, fi );
0683         LOG(info) << note ;
0684     }
0685 }
0686
0687
0688 /**
0689 QSim::simulate
0690 ----------------
0691
0692 High level API intending to be used from CSGOptiXService
0693
0694 1. access eventID from genstep metadata
0695 2. add the genstep to the EGPU SEvt
0696 3. invoke simulate doing the kernel launch and pullback
0697 4. copy global hits array (current thinking that localization
0698    should happen within the client to avoid ~doubling hits transfer size)
0699 5. reset the SEvt doing dealloc
0700
0701 Thus is used from language crossing stack::
0702
0703     async def simulate(request: Request)  [CSGOptiX/tests/CSGOptiXService_FastAPI_test/main.py]
0704     inline nb::ndarray<nb::numpy> _CSGOptiXService::simulate( nb::ndarray<nb::numpy> _gs, int eventID )
0705     inline NP* CSGOptiXService::simulate( NP* gs, int eventID )
0706     NP* CSGOptiX::simulate(const NP* gs, int eventID)
0707
0708 **/
0709
0710
0711 NP* QSim::simulate(const NP* gs, int eventID )
0712 {
0713     bool eventID_expected = eventID > -1;
0714     if(!eventID_expected) std::cerr << "QSim::simulate gs lacks needed eventID metadata [" << eventID << "]\n" ;
0715     assert(eventID_expected);
0716
0717     assert( sev == SEvt::Get_EGPU() );
0718     sev->addGenstep(gs);
0719
0720     bool reset_ = false ;
0721     double tot_dt = simulate(eventID, reset_);
0722
0723     const NP* _ht = sev->getHit();
0724     NP* ht = _ht ? _ht->copy() : nullptr ;  // copy global hits from SEvt before reset
0725     ht->set_meta<double>("QSim__simulate_tot_dt", tot_dt);
0726
0727     LOG(info)
0728         << " eventID " << std::setw(6) << eventID
0729         << " gs " << ( gs ? gs->sstr() : "-" )
0730         << " ht " << ( ht ? ht->sstr() : "-" )
0731         << " tot_dt " << std::fixed << std::setw(10) << std::setprecision(6) << tot_dt
0732         ;
0733     reset(eventID);
0734
0735     return ht ;
0736 }
0737
0738
0739
0740 /**
0741 QSim::MaybeSaveIGS
0742 --------------------
0743
0744 Invoked from QSim::simulate just prior to the launch loop
0745 over genstep slices. To configure saving of the gensteps
0746 use the below envvars::
0747
0748     export QSim__SAVE_IGS_EVENTID=6                   ## default -1 does not save
0749     export QSim__SAVE_IGS_PATH=$TMP/.opticks/igs.npy  ## default path below $TMP
0750
0751 Saving gensteps prior to launch is useful for eventID that cause crashes,
0752 as the saved gensteps can then be used with input genstep running::
0753
0754     export OPTICKS_RUNNING_MODE=SRM_INPUT_GENSTEP
0755
0756 That enables fast cycle rerunning without Geant4 initalization, see eg::
0757
0758     TEST=input_genstep_muon cxs_min.sh
0759
0760 A potential cause of crashes is the heuristic OPTICKS_MAX_SLOT
0761 determined based on available VRAM is too optimistically large.
0762 Try manually reducing slots to see if memory limits are the cause::
0763
0764     export OPTICKS_MAX_SLOT=M100
0765
0766 **/
0767
0768 void QSim::MaybeSaveIGS(int eventID, NP* igs) // static
0769 {
0770     bool igs_null = igs == nullptr ;
0771     const char* igs_path = SAVE_IGS_PATH ? spath::Resolve(SAVE_IGS_PATH) : nullptr ;
0772     bool save_igs = igs && SAVE_IGS_EVENTID == eventID && igs_path ;
0773     LOG(LEVEL)
0774         << " eventID " << eventID
0775         << " igs " << ( igs ? igs->sstr() : "-" )
0776         << " igs_null " << ( igs_null ? "YES" : "NO " )
0777         << " [" << _QSim__SAVE_IGS_EVENTID << "] " <<  SAVE_IGS_EVENTID
0778         << " [" << _QSim__SAVE_IGS_PATH    << "] " << ( SAVE_IGS_PATH ? SAVE_IGS_PATH : "-" )
0779         << " igs_path [" << ( igs_path ? igs_path : "-" ) << "]"
0780         << " save_igs " << ( save_igs ? "YES" : "NO " )
0781         ;
0782
0783     if(!save_igs) return ;
0784     igs->save(igs_path);
0785 }
0786
0787
0788
0789 /**
0790 QSim::getPhotonSlotOffset
0791 ---------------------------
0792
0793 The photon slot offset is available
0794 from QEvt after setGenstepUpload_NP
0795
0796 The first genstep slice yields an offset of zero.
0797 So this only yields non-zero offsets for multi-launch
0798 running,
0799
0800
0801 The photon_slot_offset is uploaded to Params on device prior
0802 to simulate launches by CSGOptiX::prepareParamSimulate
0803
0804 The offset is used at head of CSGOptiX7.cu:simulate  to offset the
0805 launch index.
0806
0807 Hence the sum of the offset and the
0808 number of photons in the launch must be less than
0809 or equal the number of states uploaded.
0810
0811 **/
0812
0813
0814
0815 unsigned long long QSim::get_photon_slot_offset() const
0816 {
0817     return qev->get_photon_slot_offset() ;
0818 }
0819
0820
0821 /**
0822 QSim::reset
0823 ------------
0824
0825 When *QSim::simulate* is called with argument *reset:true* the
0826 *QSim::reset* method is called which invokes SEvt::endOfEvent in order
0827 to clean up the SEvt. Normally that would be done after saving
0828 any Opticks configured arrays.
0829
0830 When *QSim::simulate* is called with argument *reset:false*
0831 the *QSim::reset* method must be called separately in order to avoid a memory leak.
0832 Using *reset:false* is typically done in order to keep arrays alive longer
0833 to enable copying from the gathered arrays into non-Opticks collections.
0834
0835 **/
0836 void QSim::reset(int eventID)
0837 {
0838     SProf::Add("QSim__reset_HEAD");
0839     qev->clear();
0840     sev->endOfEvent(eventID);
0841     LOG_IF(info, SEvt::LIFECYCLE) << "] eventID " << eventID ;
0842     SProf::Add("QSim__reset_TAIL");
0843 }
0844
0845
0846
0847 /**
0848 QSim::simtrace
0849 ---------------
0850
0851 Canonically invoked from G4CXOpticks::simtrace
0852 Collected genstep are uploaded and the CSGOptiX kernel is launched to generate and propagate.
0853
0854 **/
0855
0856
0857 double QSim::simtrace(int eventID)
0858 {
0859     assert( SEventConfig::IsRGModeSimtrace() );
0860
0861
0862     sev->beginOfEvent(eventID);
0863
0864     NP* igs = sev->makeGenstepArrayFromVector();
0865
0866     LOG_IF(fatal, igs==nullptr)
0867          << " igs NULL "
0868          << " sev.descGenstepArrayFromVector " << sev->descGenstepArrayFromVector()
0869          ;
0870
0871     assert(igs);
0872     int rc = qev->setGenstepUpload_NP(igs) ;
0873
0874     LOG_IF(error, rc != 0) << " QEvt::setGenstep ERROR : no gensteps collected : will skip cx.simtrace " ;
0875
0876     sev->t_PreLaunch = sstamp::Now() ;
0877     double dt = rc == 0 && cx != nullptr ? cx->simtrace_launch() : -1. ;
0878     sev->t_PostLaunch = sstamp::Now() ;
0879     sev->t_Launch = dt ;
0880
0881     // see ~/o/notes/issues/cxt_min_simtrace_revival.rst
0882     sev->gather();
0883
0884     sev->topfold->concat();
0885     sev->topfold->clear_subfold();
0886
0887     sev->endOfEvent(eventID);
0888
0889     return dt ;
0890 }
0891
0892
0893 qsim* QSim::getDevicePtr() const
0894 {
0895     return d_sim ;
0896 }
0897
0898
0899 char QSim::getScintTexFilterMode() const
0900 {
0901     return scint->tex->getFilterMode() ;
0902 }
0903
0904 std::string QSim::desc() const
0905 {
0906     std::stringstream ss ;
0907     ss << "QSim::desc"
0908        << std::endl
0909        << " this 0x"            << std::hex << std::uint64_t(this)     << std::dec
0910        << " INSTANCE 0x"        << std::hex << std::uint64_t(INSTANCE) << std::dec
0911        << " QEvt.hh:qev 0x" << std::hex << std::uint64_t(qev)    << std::dec
0912        << " qsim.h:sim 0x"      << std::hex << std::uint64_t(sim)      << std::dec
0913        ;
0914     std::string s = ss.str();
0915     return s ;
0916 }
0917
0918 std::string QSim::descFull() const
0919 {
0920     std::stringstream ss ;
0921     ss
0922        << std::endl
0923        << "QSim::descFull"
0924        << std::endl
0925        << " this 0x"            << std::hex << std::uint64_t(this)     << std::dec
0926        << " INSTANCE 0x"        << std::hex << std::uint64_t(INSTANCE) << std::dec
0927        << " QEvt.hh:qev 0x" << std::hex << std::uint64_t(qev)    << std::dec
0928        << " qsim.h:sim 0x"      << std::hex << std::uint64_t(sim)      << std::dec
0929        << " qsim.h:d_sim 0x"    << std::hex << std::uint64_t(d_sim)    << std::dec
0930        //<< " sim->rng_state 0x"   << std::hex << std::uint64_t(sim->rng_state) << std::dec  // tending to SEGV on some systems
0931        << " sim->base 0x"       << std::hex << std::uint64_t(sim->base)  << std::dec
0932        << " sim->bnd 0x"        << std::hex << std::uint64_t(sim->bnd)   << std::dec
0933        << " sim->scint 0x"      << std::hex << std::uint64_t(sim->scint) << std::dec
0934        << " sim->cerenkov 0x"   << std::hex << std::uint64_t(sim->cerenkov) << std::dec
0935        ;
0936     std::string s = ss.str();
0937     return s ;
0938 }
0939
0940 std::string QSim::descComponents() const
0941 {
0942     std::stringstream ss ;
0943     ss << std::endl
0944        << "QSim::descComponents"
0945        << std::endl
0946        << " (QBase)base             " << ( base      ? "YES" : "NO " )  << std::endl
0947        << " (QEvt)qev           " << ( qev     ? "YES" : "NO " )  << std::endl
0948        << " (SEvt)sev               " << ( sev       ? "YES" : "NO " )  << std::endl
0949        << " (QRng)rng               " << ( rng       ? "YES" : "NO " )  << std::endl
0950        << " (QScint)scint           " << ( scint     ? "YES" : "NO " )  << std::endl
0951        << " (QCerenkov)cerenkov     " << ( cerenkov  ? "YES" : "NO " )  << std::endl
0952        << " (QBnd)bnd               " << ( bnd       ? "YES" : "NO " )  << std::endl
0953        << " (QOptical)optical       " << ( optical   ? "YES" : "NO " )  << std::endl
0954        << " (QDebug)debug_          " << ( debug_    ? "YES" : "NO " )  << std::endl
0955        << " (QProp)prop             " << ( prop      ? "YES" : "NO " )  << std::endl
0956        << " (QPMT)pmt               " << ( pmt       ? "YES" : "NO " )  << std::endl
0957        << " (QMultiFilm)multifilm   " << ( multifilm ? "YES" : "NO " )  << std::endl
0958        << " (qsim)sim               " << ( sim       ? "YES" : "NO " )  << std::endl
0959        << " (qsim)d_sim             " << ( d_sim     ? "YES" : "NO " )  << std::endl
0960        << " (qdebug)dbg             " << ( dbg       ? "YES" : "NO " )  << std::endl
0961        << " (qdebug)d_dbg           " << ( d_dbg     ? "YES" : "NO " )  << std::endl
0962        ;
0963     std::string s = ss.str();
0964     return s ;
0965 }
0966
0967
0968
0969
0970
0971 void QSim::configureLaunch(unsigned width, unsigned height )
0972 {
0973     QU::ConfigureLaunch(numBlocks, threadsPerBlock, width, height);
0974 }
0975
0976 void QSim::configureLaunch2D(unsigned width, unsigned height )
0977 {
0978     QU::ConfigureLaunch2D(numBlocks, threadsPerBlock, width, height);
0979 }
0980
0981 void QSim::configureLaunch16()
0982 {
0983     QU::ConfigureLaunch16(numBlocks, threadsPerBlock);
0984 }
0985
0986 void QSim::configureLaunch1D(unsigned num, unsigned threads_per_block)
0987 {
0988     QU::ConfigureLaunch1D(numBlocks, threadsPerBlock, num, threads_per_block);
0989 }
0990
0991
0992 std::string QSim::descLaunch() const
0993 {
0994     return QU::DescLaunch(numBlocks, threadsPerBlock);
0995 }
0996
0997
0998
0999
1000
1001
1002
1003
1004
1005 /**
1006 QSim::rng_sequence mass production with multiple launches...
1007 --------------------------------------------------------------
1008
1009 The output files are split too::
1010
1011     epsilon:opticks blyth$ np.py *.npy
1012     a :                                            TRngBufTest_0.npy :      (10000, 16, 16) : 8f9b27c9416a0121574730baa742b5c9 : 20210715-1227
1013     epsilon:opticks blyth$ du -h TRngBufTest_0.npy
1014      20M    TRngBufTest_0.npy
1015
1016     In [6]: (16*16*4*2*10000)/1e6
1017     Out[6]: 20.48
1018
1019 Upping to 1M would be 100x 20M = 2000M  2GB
1020
1021 * using floats would half storage to 1GB, just promote to double in cks/OpticksRandom::flat
1022   THIS MAKES SENSE AS curand_uniform IS GENERATING float anyhow
1023 * 100k blocks would mean 10*100Mb files for 1M : can do in 10 launches to work on any GPU
1024
1025 **/
1026
1027
1028 template <typename T>
1029 extern void QSim_rng_sequence(  dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, T* seq, unsigned ni, unsigned nj, unsigned id_offset );
1030
1031
1032 /**
1033 QSim::rng_sequence generate randoms in single CUDA launch
1034 -------------------------------------------------------------
1035
1036 This is invoked for each tranche by the below rng_sequence method.
1037 Each tranche launch generates ni_tranche*nv randoms writing them into seq
1038
1039 ni_tranche : item tranche size
1040
1041 nv : number randoms to generate for each item
1042
1043 id_offset : acts on the rng_state array
1044
1045 skipahead : used curand skipahead offsets depending on sim->evt->index and OPTICKS_EVENT_SKIPAHEAD
1046
1047
1048 **/
1049
1050 template <typename T>
1051 void QSim::rng_sequence( T* seq, unsigned ni_tranche, unsigned nv, unsigned id_offset )
1052 {
1053     configureLaunch(ni_tranche, 1 );
1054
1055     unsigned num_rng = ni_tranche*nv ;
1056
1057     const char* label = "QSim::rng_sequence:num_rng" ;
1058
1059     T* d_seq = QU::device_alloc<T>(num_rng, label );
1060
1061     QSim_rng_sequence<T>( numBlocks, threadsPerBlock, d_sim, d_seq, ni_tranche, nv, id_offset );
1062
1063     QU::copy_device_to_host_and_free<T>( seq, d_seq, num_rng, label );
1064 }
1065
1066
1067
1068 const char* QSim::PREFIX = "rng_sequence" ;
1069
1070 /**
1071 QSim::rng_sequence
1072 ---------------------
1073
1074 *ni* is the total number of items across all launches that is
1075 split into tranches of *ni_tranche_size* each.
1076
1077 As separate CPU and GPU memory allocations, launches and output files are made
1078 for each tranche the total generated size may exceed the total available memory of the GPU.
1079 Splitting the output files also eases management by avoiding huge files.
1080
1081 Default *dir* is $TMP/QSimTest/rng_sequence leading to npy paths like::
1082
1083     /tmp/blyth/opticks/QSimTest/rng_sequence/rng_sequence_f_ni1000000_nj16_nk16_tranche100000/rng_sequence_f_ni100000_nj16_nk16_ioffset000000.npy
1084
1085 **/
1086
1087 template <typename T>
1088 void QSim::rng_sequence( const char* dir, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size )
1089 {
1090     assert( ni >= ni_tranche_size && ni % ni_tranche_size == 0 );   // total size *ni* must be integral multiple of *ni_tranche_size*
1091     unsigned num_tranche = ni/ni_tranche_size ;
1092     unsigned nv = nj*nk ;
1093
1094     unsigned size = ni_tranche_size*nv ;   // number of randoms to be generated in each launch
1095     std::string reldir = QU::rng_sequence_reldir<T>(PREFIX, ni, nj, nk, ni_tranche_size  ) ;
1096
1097     LOG(info)
1098         << " ni " << ni
1099         << " ni_tranche_size " << ni_tranche_size
1100         << " num_tranche " << num_tranche
1101         << " reldir " << reldir.c_str()
1102         << " nj " << nj
1103         << " nk " << nk
1104         << " nv(nj*nk) " << nv
1105         << " size(ni_tranche_size*nv) " << size
1106         << " typecode " << QU::typecode<T>()
1107         ;
1108
1109
1110     // NB seq array memory gets reused for each launch and saved to different paths
1111     NP* seq = NP::Make<T>(ni_tranche_size, nj, nk) ;
1112     T* seq_values = seq->values<T>();
1113     NP::INT seq_nv = seq->num_values();
1114
1115
1116     LOG(info)
1117         << " seq " << ( seq ? seq->sstr() : "-" )
1118         << " seq_values " << seq_values
1119         << " seq_nv " << seq_nv
1120         << " seq_values[0] " << seq_values[0]
1121         << " seq_values[seq_nv-1] " << seq_values[seq_nv-1]
1122         ;
1123
1124
1125
1126     for(unsigned t=0 ; t < num_tranche ; t++)
1127     {
1128         // *id_offset* controls which rng_state/RNG to use
1129         unsigned id_offset = ni_tranche_size*t ;
1130         std::string name = QU::rng_sequence_name<T>(PREFIX, ni_tranche_size, nj, nk, id_offset ) ;
1131
1132         std::cout
1133             << std::setw(3) << t
1134             << std::setw(10) << id_offset
1135             << std::setw(100) << name.c_str()
1136             << std::endl
1137             ;
1138
1139         rng_sequence( seq_values, ni_tranche_size, nv, id_offset );
1140
1141         const char* path = spath::Resolve(dir, reldir.c_str(), name.c_str() );
1142         seq->save(path);
1143     }
1144 }
1145
1146
1147
1148 template void QSim::rng_sequence<float>(  const char* dir, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size );
1149 template void QSim::rng_sequence<double>( const char* dir, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size );
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160 /**
1161 QSim::scint_wavelength
1162 ----------------------------------
1163
1164 Setting envvar QSIM_DISABLE_HD disables multiresolution handling
1165 and causes the returned hd_factor to be zero rather then
1166 the typical values of 10 or 20 which depend on the buffer creation.
1167
1168 **/
1169
1170 extern void QSim_scint_wavelength(   dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, float* wavelength, unsigned num_wavelength );
1171
1172 NP* QSim::scint_wavelength(unsigned num_wavelength, unsigned& hd_factor )
1173 {
1174
1175     bool qsim_disable_hd = ssys::getenvbool("QSIM_DISABLE_HD");
1176     hd_factor = qsim_disable_hd ? 0u : scint->tex->getHDFactor() ;
1177     // HMM: perhaps get this from sim rather than occupying an argument slot
1178     LOG(LEVEL) << "[" << " qsim_disable_hd " << qsim_disable_hd << " hd_factor " << hd_factor ;
1179
1180     configureLaunch(num_wavelength, 1 );
1181
1182     float* d_wavelength = QU::device_alloc<float>(num_wavelength, "QSim::scint_wavelength/num_wavelength");
1183
1184     QSim_scint_wavelength(numBlocks, threadsPerBlock, d_sim, d_wavelength, num_wavelength );
1185
1186     NP* w = NP::Make<float>(num_wavelength) ;
1187
1188     QU::copy_device_to_host_and_free<float>( (float*)w->bytes(), d_wavelength, num_wavelength, "QSim::scint_wavelength" );
1189
1190     LOG(LEVEL) << "]" ;
1191
1192     return w ;
1193 }
1194
1195
1196 extern void QSim_RandGaussQ_shoot(  dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, float* v, unsigned num_v );
1197
1198 NP* QSim::RandGaussQ_shoot(unsigned num_v )
1199 {
1200     const char* label = "QSim::RandGaussQ_shoot/num" ;
1201     configureLaunch(num_v, 1 );
1202     std::cout << label << " " << num_v << std::endl;
1203
1204     float* d_v = QU::device_alloc<float>(num_v, label );
1205
1206     QSim_RandGaussQ_shoot(numBlocks, threadsPerBlock, d_sim, d_v, num_v );
1207
1208     cudaDeviceSynchronize();
1209
1210     NP* v = NP::Make<float>(num_v) ;
1211     QU::copy_device_to_host_and_free<float>( (float*)v->bytes(), d_v, num_v, label );
1212
1213     return v ;
1214 }
1215
1216
1217
1218
1219 void QSim::dump_wavelength( float* wavelength, unsigned num_wavelength, unsigned edgeitems )
1220 {
1221     LOG(LEVEL);
1222     for(unsigned i=0 ; i < num_wavelength ; i++)
1223     {
1224         if( i < edgeitems || i > num_wavelength - edgeitems)
1225         {
1226             std::cout
1227                 << std::setw(10) << i
1228                 << std::setw(10) << std::fixed << std::setprecision(3) << wavelength[i]
1229                 << std::endl
1230                 ;
1231         }
1232     }
1233 }
1234
1235
1236 extern void QSim_dbg_gs_generate(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, qdebug* dbg, sphoton* photon, unsigned num_photon, unsigned type ) ;
1237
1238
1239 NP* QSim::dbg_gs_generate(unsigned num_photon, unsigned type )
1240 {
1241     assert( type == SCINT_GENERATE || type == CERENKOV_GENERATE );
1242
1243     configureLaunch( num_photon, 1 );
1244     sphoton* d_photon = QU::device_alloc<sphoton>(num_photon, "QSim::dbg_gs_generate:num_photon") ;
1245     QU::device_memset<sphoton>(d_photon, 0, num_photon);
1246
1247     QSim_dbg_gs_generate(numBlocks, threadsPerBlock, d_sim, d_dbg, d_photon, num_photon, type );
1248
1249     NP* p = NP::Make<float>(num_photon, 4, 4);
1250     const char* label = "QSim::dbg_gs_generate" ;
1251
1252     QU::copy_device_to_host_and_free<sphoton>( (sphoton*)p->bytes(), d_photon, num_photon, label );
1253     return p ;
1254 }
1255
1256
1257
1258 extern void QSim_generate_photon(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim )  ;
1259
1260 /**
1261 QSim::generate_photon
1262 -----------------------
1263
1264 **/
1265
1266
1267 void QSim::generate_photon()
1268 {
1269     LOG(LEVEL) << "[" ;
1270
1271     unsigned num_photon = qev->getNumPhoton() ;
1272     LOG(info) << " num_photon " << num_photon ;
1273
1274     LOG_IF(fatal, num_photon == 0 )
1275         << " num_photon zero : MUST QEvt::setGenstep before QSim::generate_photon "
1276         ;
1277
1278     assert( num_photon > 0 );
1279     assert( d_sim );
1280
1281     configureLaunch( num_photon, 1 );
1282
1283     LOG(info) << "QSim_generate_photon... " ;
1284
1285     QSim_generate_photon(numBlocks, threadsPerBlock, d_sim );
1286
1287     LOG(LEVEL) << "]" ;
1288 }
1289
1290
1291
1292
1293
1294
1295 extern void QSim_fill_state_0(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, quad6* state, unsigned num_state, qdebug* dbg );
1296
1297 void QSim::fill_state_0(quad6* state, unsigned num_state)
1298 {
1299     assert( d_sim );
1300     assert( d_dbg );
1301
1302     quad6* d_state = QU::device_alloc<quad6>(num_state, "QSim::fill_state_0:num_state") ;
1303
1304
1305     unsigned threads_per_block = 32 ;
1306     configureLaunch1D( num_state, threads_per_block );
1307
1308     LOG(info)
1309          << " num_state " << num_state
1310          << " threads_per_block  " << threads_per_block
1311          << " descLaunch " << descLaunch()
1312          ;
1313
1314     QSim_fill_state_0(numBlocks, threadsPerBlock, d_sim, d_state, num_state, d_dbg  );
1315
1316     const char* label = "QSim::fill_state_0" ;
1317     QU::copy_device_to_host_and_free<quad6>( state, d_state, num_state, label );
1318 }
1319
1320
1321 extern void QSim_fill_state_1(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, sstate* state, unsigned num_state, qdebug* dbg );
1322
1323 void QSim::fill_state_1(sstate* state, unsigned num_state)
1324 {
1325     assert( d_sim );
1326     assert( d_dbg );
1327
1328     sstate* d_state = QU::device_alloc<sstate>(num_state, "QSim::fill_state_1:num_state") ;
1329
1330     unsigned threads_per_block = 64 ;
1331     configureLaunch1D( num_state, threads_per_block );
1332
1333     LOG(info)
1334          << " num_state " << num_state
1335          << " threads_per_block  " << threads_per_block
1336          << " descLaunch " << descLaunch()
1337          ;
1338
1339     QSim_fill_state_1(numBlocks, threadsPerBlock, d_sim, d_state, num_state, d_dbg );
1340
1341     const char* label = "QSim::fill_state_1" ;
1342     QU::copy_device_to_host_and_free<sstate>( state, d_state, num_state, label );
1343 }
1344
1345
1346
1347
1348
1349
1350
1351
1352 /**
1353 extern QSim_quad_launch
1354 --------------------------
1355
1356 This function is implemented in QSim.cu and it used by *quad_launch_generate*
1357
1358 **/
1359
1360 extern void QSim_quad_launch(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, quad* q, unsigned num_quad, qdebug* dbg, unsigned type  );
1361
1362
1363
1364 NP* QSim::quad_launch_generate(unsigned num_quad, unsigned type )
1365 {
1366     assert( d_sim );
1367     assert( d_dbg );
1368
1369     const char* label = "QSim::quad_launch_generate:num_quad" ;
1370
1371     quad* d_q = QU::device_alloc<quad>(num_quad, label ) ;
1372
1373     unsigned threads_per_block = 512 ;
1374     configureLaunch1D( num_quad, threads_per_block );
1375
1376     QSim_quad_launch(numBlocks, threadsPerBlock, d_sim, d_q, num_quad, d_dbg, type );
1377
1378     NP* q = NP::Make<float>( num_quad, 4 );
1379     quad* qq = (quad*)q->bytes();
1380
1381     QU::copy_device_to_host_and_free<quad>( qq, d_q, num_quad, label );
1382
1383     if( type == QGEN_SMEAR_NORMAL_SIGMA_ALPHA || type == QGEN_SMEAR_NORMAL_POLISH )
1384     {
1385         q->set_meta<std::string>("normal", scuda::serialize(dbg->normal) );
1386         q->set_meta<std::string>("direction", scuda::serialize(dbg->direction) );
1387         q->set_meta<float>("value", dbg->value );
1388         q->set_meta<std::string>("valuename", type == QGEN_SMEAR_NORMAL_SIGMA_ALPHA ? "sigma_alpha" : "polish" );
1389     }
1390
1391     return q ;
1392 }
1393
1394
1395
1396
1397 /**
1398 extern QSim_photon_launch
1399 --------------------------
1400
1401 This function is implemented in QSim.cu and it used by BOTH *photon_launch_generate* and *photon_launch_mutate*
1402
1403 **/
1404
1405 extern void QSim_photon_launch(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, sphoton* photon, unsigned num_photon, qdebug* dbg, unsigned type  );
1406
1407
1408 /**
1409 QSim::photon_launch_generate
1410 ------------------------------
1411
1412 This allocates a photon array on the device, generates photons into it on device and
1413 then downloads the generated photons into the host array. Contrast with *photon_launch_mutate*.
1414
1415 **/
1416
1417 NP* QSim::photon_launch_generate(unsigned num_photon, unsigned type )
1418 {
1419     assert( d_sim );
1420     assert( d_dbg );
1421
1422     const char* label = "QSim::photon_launch_generate:num_photon" ;
1423
1424     sphoton* d_photon = QU::device_alloc<sphoton>(num_photon, label ) ;
1425     QU::device_memset<sphoton>(d_photon, 0, num_photon);
1426
1427     unsigned threads_per_block = 512 ;
1428     configureLaunch1D( num_photon, threads_per_block );
1429
1430     QSim_photon_launch(numBlocks, threadsPerBlock, d_sim, d_photon, num_photon, d_dbg, type );
1431
1432     NP* p = NP::Make<float>(num_photon, 4, 4);
1433     sphoton* photon = (sphoton*)p->bytes() ;
1434
1435     QU::copy_device_to_host_and_free<sphoton>( photon, d_photon, num_photon, label );
1436
1437     return p ;
1438 }
1439
1440
1441
1442
1443 /**
1444 QSim::photon_launch_mutate
1445 ---------------------------
1446
1447 This uploads the photon array provided, mutates it and then downloads the changed array.
1448
1449 **/
1450
1451 void QSim::photon_launch_mutate(sphoton* photon, unsigned num_photon, unsigned type )
1452 {
1453     assert( d_sim );
1454     assert( d_dbg );
1455
1456     const char* label_0 = "QSim::photon_launch_mutate/d_photon" ;
1457     sphoton* d_photon = QU::UploadArray<sphoton>(photon, num_photon, label_0 );
1458
1459     unsigned DEBUG_NUM_PHOTON = ssys::getenvunsigned(_QSim__photon_launch_mutate_DEBUG_NUM_PHOTON, 0 );
1460     bool DEBUG_NUM_PHOTON_valid = DEBUG_NUM_PHOTON > 0 && DEBUG_NUM_PHOTON <= num_photon ;
1461     unsigned u_num_photon = DEBUG_NUM_PHOTON_valid ? DEBUG_NUM_PHOTON  : num_photon ;
1462     bool SKIP_LAUNCH = ssys::getenvbool(_QSim__photon_launch_mutate_SKIP_LAUNCH) ;
1463
1464     LOG_IF( error, DEBUG_NUM_PHOTON_valid || true )
1465         << _QSim__photon_launch_mutate_DEBUG_NUM_PHOTON
1466         << " DEBUG_NUM_PHOTON " << DEBUG_NUM_PHOTON
1467         << " num_photon " << num_photon
1468         << " u_num_photon " << u_num_photon
1469         << _QSim__photon_launch_mutate_SKIP_LAUNCH
1470         << " " << ( SKIP_LAUNCH ? "YES" : "NO " )
1471         ;
1472
1473
1474     if( SKIP_LAUNCH == false )
1475     {
1476         unsigned threads_per_block = 512 ;
1477         configureLaunch1D( u_num_photon, threads_per_block );
1478         QSim_photon_launch(numBlocks, threadsPerBlock, d_sim, d_photon, u_num_photon, d_dbg, type );
1479     }
1480
1481
1482     const char* label_1 = "QSim::photon_launch_mutate" ;
1483     QU::copy_device_to_host_and_free<sphoton>( photon, d_photon, u_num_photon, label_1 );
1484 }
1485
1486
1487
1488
1489
1490 /**
1491 QSim::UploadFakePRD (formerly "UploadMockPRD" )
1492 ----------------------------------------------------
1493
1494 Caution this returns a device pointer.
1495 **/
1496
1497 quad2* QSim::UploadFakePRD(const NP* ip, const NP* prd) // static
1498 {
1499     assert(ip);
1500     int num_ip = ip->shape[0] ;
1501     assert( num_ip > 0 );
1502
1503     assert( prd->has_shape( num_ip, -1, 2, 4 ) );    // TODO: evt->max_record checking
1504     assert( prd->shape.size() == 4 && prd->shape[2] == 2 && prd->shape[3] == 4 );
1505     int num_prd = prd->shape[0]*prd->shape[1] ;
1506
1507     LOG(LEVEL)
1508          << "["
1509          << " num_ip " << num_ip
1510          << " num_prd " << num_prd
1511          << " prd " << prd->sstr()
1512          ;
1513
1514     const char* label = "QSim::UploadFakePRD/d_prd" ;
1515     quad2* d_prd = QU::UploadArray<quad2>( (quad2*)prd->bytes(), num_prd, label );
1516
1517     // prd is non-standard so it is appropriate to adhoc upload here
1518
1519     return d_prd ;
1520 }
1521
1522 #if !defined(PRODUCTION)
1523 extern void QSim_fake_propagate_launch(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, quad2* prd );
1524 #endif
1525
1526 /**
1527
1528 QSim::fake_propagate (formerly mock_propagate)
1529 -----------------------------------------------
1530
1531 This is only invoked from QSimTest::fake_propagate
1532
1533
1534 Was renamed from "mock" to "fake" as is within Opticks "mock" is
1535 used to mean without using CUDA.
1536
1537 * number of prd must be a multiple of the number of photon, ratio giving bounce_max
1538 * number of record must be a multiple of the number of photon, ratio giving record_max
1539 * HMM: this is an obtuse way to get bounce_max and record_max
1540
1541 Aiming to replace the above with a simpler and less duplicitous version by
1542 using common QEvt functionality
1543
1544 **/
1545
1546 void QSim::fake_propagate( const NP* prd, unsigned type )
1547 {
1548 #if defined(PRODUCTION)
1549     (void)prd;
1550     (void)type;
1551     LOG(fatal) << "QSim::fake_propagate is disabled in PRODUCTION builds";
1552     std::raise(SIGINT);
1553 #else
1554     const NP* ip = sev->getInputPhoton();
1555     int num_ip = ip ? ip->shape[0] : 0 ;
1556     assert( num_ip > 0 );
1557
1558     quad2* d_prd = UploadFakePRD(ip, prd) ;
1559
1560     NP* igs = sev->makeGenstepArrayFromVector();
1561
1562     int rc = qev->setGenstepUpload_NP(igs);
1563     assert( rc == 0 );
1564     if(rc!=0) std::raise(SIGINT);
1565
1566     sev->add_array("prd0", prd );
1567     // NB SEvt::beginOfEvent calls SEvt/clear so this addition
1568     // must be after that to succeed in being added to SEvt saved arrays
1569
1570     int num_photon = qev->getNumPhoton();
1571     bool consistent_num_photon = num_photon == num_ip ;
1572
1573     LOG_IF(fatal, !consistent_num_photon)
1574          << "["
1575          << " num_ip " << num_ip
1576          << " QEvt::getNumPhoton " << num_photon
1577          << " consistent_num_photon " << ( consistent_num_photon ? "YES" : "NO " )
1578          << " prd " << prd->sstr()
1579          ;
1580     assert(consistent_num_photon);
1581
1582     assert( qev->upload_count > 0 );
1583
1584     unsigned threads_per_block = 512 ;
1585     configureLaunch1D( num_photon, threads_per_block );
1586
1587     QSim_fake_propagate_launch(numBlocks, threadsPerBlock, d_sim, d_prd );
1588
1589     cudaDeviceSynchronize();
1590
1591
1592     LOG(LEVEL) << "]" ;
1593 #endif
1594 }
1595
1596
1597
1598 extern void QSim_boundary_lookup_all(    dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, quad* lookup, unsigned width, unsigned height );
1599
1600 NP* QSim::boundary_lookup_all(unsigned width, unsigned height )
1601 {
1602     LOG(LEVEL) << "[" ;
1603     assert( bnd );
1604     assert( width <= getBoundaryTexWidth()  );
1605     assert( height <= getBoundaryTexHeight()  );
1606
1607     unsigned num_lookup = width*height ;
1608     LOG(LEVEL)
1609         << " width " << width
1610         << " height " << height
1611         << " num_lookup " << num_lookup
1612         ;
1613
1614
1615     configureLaunch(width, height );
1616
1617     const char* label = "QSim::boundary_lookup_all:num_lookup" ;
1618
1619     quad* d_lookup = QU::device_alloc<quad>(num_lookup, label ) ;
1620     QSim_boundary_lookup_all(numBlocks, threadsPerBlock, d_sim, d_lookup, width, height );
1621
1622     assert( height % 8 == 0 );
1623     unsigned num_bnd = height/8 ;
1624
1625     NP* l = NP::Make<float>( num_bnd, 4, 2, width, 4 );
1626     QU::copy_device_to_host_and_free<quad>( (quad*)l->bytes(), d_lookup, num_lookup, label );
1627
1628     LOG(LEVEL) << "]" ;
1629
1630     return l ;
1631
1632 }
1633
1634 extern void QSim_boundary_lookup_line(    dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, quad* lookup, float* domain, unsigned num_lookup, unsigned line, unsigned k );
1635
1636
1637 NP* QSim::boundary_lookup_line( float* domain, unsigned num_lookup, unsigned line, unsigned k )
1638 {
1639     LOG(LEVEL)
1640         << "["
1641         << " num_lookup " << num_lookup
1642         << " line " << line
1643         << " k " << k
1644         ;
1645
1646     configureLaunch(num_lookup, 1  );
1647
1648     float* d_domain = QU::device_alloc<float>(num_lookup, "QSim::boundary_lookup_line:num_lookup") ;
1649
1650     QU::copy_host_to_device<float>( d_domain, domain, num_lookup );
1651
1652     const char* label = "QSim::boundary_lookup_line:num_lookup" ;
1653
1654     quad* d_lookup = QU::device_alloc<quad>(num_lookup, label ) ;
1655
1656     QSim_boundary_lookup_line(numBlocks, threadsPerBlock, d_sim, d_lookup, d_domain, num_lookup, line, k );
1657
1658
1659     NP* l = NP::Make<float>( num_lookup, 4 );
1660
1661     QU::copy_device_to_host_and_free<quad>( (quad*)l->bytes(), d_lookup, num_lookup, label  );
1662
1663     QU::device_free<float>( d_domain );
1664
1665     LOG(LEVEL) << "]" ;
1666
1667     return l ;
1668 }
1669
1670
1671
1672
1673
1674 /**
1675 QSim::prop_lookup
1676 --------------------
1677
1678 suspect problem when have fine domain and many pids due to 2d launch config,
1679 BUT when have 1d launch there is no problem to launch millions of threads : hence the
1680 below *prop_lookup_onebyone*
1681
1682 **/
1683
1684
1685 template <typename T>
1686 extern void QSim_prop_lookup( dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, T* lookup, const T* domain, unsigned domain_width, unsigned* pids, unsigned num_pids );
1687
1688 template <typename T>
1689 void QSim::prop_lookup( T* lookup, const T* domain, unsigned domain_width, const std::vector<unsigned>& pids )
1690 {
1691     unsigned num_pids = pids.size() ;
1692     unsigned num_lookup = num_pids*domain_width ;
1693     LOG(LEVEL)
1694         << "["
1695         << " num_pids " << num_pids
1696         << " domain_width " << domain_width
1697         << " num_lookup " << num_lookup
1698         ;
1699
1700     configureLaunch(domain_width, num_pids  );
1701
1702     unsigned* d_pids = QU::device_alloc<unsigned>(num_pids, "QSim::prop_lookup:num_pids") ;
1703     T* d_domain = QU::device_alloc<T>(domain_width, "QSim::prop_lookup:domain_width") ;
1704     T* d_lookup = QU::device_alloc<T>(num_lookup  , "QSim::prop_lookup:num_lookup") ;
1705
1706     QU::copy_host_to_device<T>( d_domain, domain, domain_width );
1707     QU::copy_host_to_device<unsigned>( d_pids, pids.data(), num_pids );
1708
1709     QSim_prop_lookup(numBlocks, threadsPerBlock, d_sim, d_lookup, d_domain, domain_width, d_pids, num_pids );
1710
1711     QU::copy_device_to_host_and_free<T>( lookup, d_lookup, num_lookup );
1712     QU::device_free<T>( d_domain );
1713     QU::device_free<unsigned>( d_pids );
1714
1715     LOG(LEVEL) << "]" ;
1716 }
1717
1718
1719
1720 /**
1721 Hmm doing lookups like this is a very common pattern, could do with
1722 a sub context to carry the pieces to simplify doing that.
1723 **/
1724
1725 template <typename T>
1726 extern void QSim_prop_lookup_one(
1727     dim3 numBlocks,
1728     dim3 threadsPerBlock,
1729     qsim* sim,
1730     T* lookup,
1731     const T* domain,
1732     unsigned domain_width,
1733     unsigned num_pids,
1734     unsigned pid,
1735     unsigned ipid
1736 );
1737
1738 /**
1739 QSim::prop_lookup_onebyone
1740 ---------------------------
1741
1742 The *lookup* output array holds across domain lookups for multiple property ids.
1743 Separate launches are made for each property id, all writing to the same buffer.
1744
1745 On device uses::
1746
1747     sim->prop->interpolate
1748
1749 **/
1750
1751 template <typename T>
1752 void QSim::prop_lookup_onebyone( T* lookup, const T* domain, unsigned domain_width, const std::vector<unsigned>& pids )
1753 {
1754     unsigned num_pids = pids.size() ;
1755     unsigned num_lookup = num_pids*domain_width ;
1756     LOG(LEVEL)
1757         << "["
1758         << " num_pids " << num_pids
1759         << " domain_width " << domain_width
1760         << " num_lookup " << num_lookup
1761         ;
1762
1763     configureLaunch(domain_width, 1  );
1764
1765     T* d_domain = QU::device_alloc<T>(domain_width, "QSim::prop_lookup_onebyone:domain_width") ;
1766     QU::copy_host_to_device<T>( d_domain, domain, domain_width );
1767
1768     const char* label = "QSim::prop_lookup_onebyone:num_lookup" ;
1769
1770     T* d_lookup = QU::device_alloc<T>(num_lookup, label ) ;
1771
1772     // separate launches for each pid
1773     for(unsigned ipid=0 ; ipid < num_pids ; ipid++)
1774     {
1775         unsigned pid = pids[ipid] ;
1776         QSim_prop_lookup_one<T>(numBlocks, threadsPerBlock, d_sim, d_lookup, d_domain, domain_width, num_pids, pid, ipid );
1777     }
1778
1779     QU::copy_device_to_host_and_free<T>( lookup, d_lookup, num_lookup, label  );
1780
1781     QU::device_free<T>( d_domain );
1782
1783     LOG(LEVEL) << "]" ;
1784 }
1785
1786
1787 template void QSim::prop_lookup_onebyone( float*, const float* ,   unsigned, const std::vector<unsigned>& );
1788 template void QSim::prop_lookup_onebyone( double*, const double* , unsigned, const std::vector<unsigned>& );
1789
1790
1791
1792
1793
1794
1795 extern void QSim_multifilm_lookup_all(    dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, quad2* sample, quad2* result,  unsigned width, unsigned height );
1796
1797 void QSim::multifilm_lookup_all( quad2 * sample , quad2 * result ,  unsigned width, unsigned height )
1798 {
1799     LOG(LEVEL) << "[" ;
1800     unsigned num_lookup = width*height ;
1801     unsigned size = num_lookup ;
1802
1803     LOG(LEVEL)
1804         << " width " << width
1805         << " height " << height
1806         << " num_lookup " << num_lookup
1807         << " size "<<size
1808         ;
1809
1810     configureLaunch2D(width, height );
1811
1812     //const float * c_sample = sample;
1813     quad2* d_sample = QU::device_alloc<quad2>(size, "QSim::multifilm_lookup_all:size" ) ;
1814
1815     const char* label = "QSim::multifilm_lookup_all:size" ;
1816
1817     quad2* d_result = QU::device_alloc<quad2>(size, label ) ;
1818     LOG(LEVEL)
1819        <<" copy_host_to_device<quad2>( d_sample, sample , size) before";
1820     QU::copy_host_to_device<quad2>( d_sample, sample , size);
1821     LOG(LEVEL)
1822        <<" copy_host_to_device<quad2>( d_sample, sample , size) after";
1823
1824     QSim_multifilm_lookup_all(numBlocks, threadsPerBlock, d_sim, d_sample, d_result, width, height );
1825     QU::copy_device_to_host_and_free<quad2>( result , d_result , size, label );
1826     QU::device_free<quad2>(d_sample);
1827
1828     cudaDeviceSynchronize();
1829     LOG(LEVEL) << "]" ;
1830 }
1831
1832
1833
1834
1835 unsigned QSim::getBoundaryTexWidth() const
1836 {
1837     return bnd->tex->width ;
1838 }
1839 unsigned QSim::getBoundaryTexHeight() const
1840 {
1841     return bnd->tex->height ;
1842 }
1843 const NP* QSim::getBoundaryTexSrc() const
1844 {
1845     return bnd->src ;
1846 }
1847
1848 void QSim::dump_photon( quad4* photon, unsigned num_photon, const char* opt_, unsigned edgeitems )
1849 {
1850     LOG(LEVEL);
1851
1852     std::string opt = opt_ ;
1853
1854     bool f0 = opt.find("f0") != std::string::npos ;
1855     bool f1 = opt.find("f1") != std::string::npos ;
1856     bool f2 = opt.find("f2") != std::string::npos ;
1857     bool f3 = opt.find("f3") != std::string::npos ;
1858
1859     bool i0 = opt.find("i0") != std::string::npos ;
1860     bool i1 = opt.find("i1") != std::string::npos ;
1861     bool i2 = opt.find("i2") != std::string::npos ;
1862     bool i3 = opt.find("i3") != std::string::npos ;
1863
1864     int wi = 7 ;
1865     int pr = 2 ;
1866
1867     for(unsigned i=0 ; i < num_photon ; i++)
1868     {
1869         if( i < edgeitems || i > num_photon - edgeitems)
1870         {
1871             const quad4& p = photon[i] ;
1872
1873             std::cout
1874                 << std::setw(wi) << i
1875                 ;
1876
1877             if(f0) std::cout
1878                 << " f0 "
1879                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q0.f.x
1880                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q0.f.y
1881                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q0.f.z
1882                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q0.f.w
1883                 ;
1884
1885             if(f1) std::cout
1886                 << " f1 "
1887                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q1.f.x
1888                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q1.f.y
1889                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q1.f.z
1890                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q1.f.w
1891                 ;
1892
1893             if(f2) std::cout
1894                 << " f2 "
1895                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q2.f.x
1896                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q2.f.y
1897                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q2.f.z
1898                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q2.f.w
1899                 ;
1900
1901             if(f3) std::cout
1902                 << " f3 "
1903                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q3.f.x
1904                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q3.f.y
1905                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q3.f.z
1906                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q3.f.w
1907                 ;
1908
1909             if(i0) std::cout
1910                 << " i0 "
1911                 << std::setw(wi) << p.q0.i.x
1912                 << std::setw(wi) << p.q0.i.y
1913                 << std::setw(wi) << p.q0.i.z
1914                 << std::setw(wi) << p.q0.i.w
1915                 ;
1916
1917             if(i1) std::cout
1918                 << " i1 "
1919                 << std::setw(wi) << p.q1.i.x
1920                 << std::setw(wi) << p.q1.i.y
1921                 << std::setw(wi) << p.q1.i.z
1922                 << std::setw(wi) << p.q1.i.w
1923                 ;
1924
1925             if(i2) std::cout
1926                 << " i2 "
1927                 << std::setw(wi) << p.q2.i.x
1928                 << std::setw(wi) << p.q2.i.y
1929                 << std::setw(wi) << p.q2.i.z
1930                 << std::setw(wi) << p.q2.i.w
1931                 ;
1932
1933             if(i3) std::cout
1934                 << " i3 "
1935                 << std::setw(wi) << p.q3.i.x
1936                 << std::setw(wi) << p.q3.i.y
1937                 << std::setw(wi) << p.q3.i.z
1938                 << std::setw(wi) << p.q3.i.w
1939                 ;
1940
1941             std::cout
1942                 << std::endl
1943                 ;
1944         }
1945     }
1946 }
1947
1948
1949 /**
1950 QSim::Desc
1951 ------------
1952
1953 Generated with::
1954
1955    ~/opticks/qudarap/QSim__Desc.sh
1956
1957 Dump flags with::
1958
1959    QSimDescTest
1960    ssys_test
1961
1962 **/
1963 std::string QSim::Desc(char delim)  // static
1964 {
1965     std::stringstream ss ;
1966     ss << ( delim == ',' ? "" : "QSim::Desc\n" )
1967 #ifdef CONFIG_Debug
1968        << "CONFIG_Debug"
1969 #else
1970        << "NOT-CONFIG_Debug"
1971 #endif
1972        << delim
1973 #ifdef CONFIG_RelWithDebInfo
1974        << "CONFIG_RelWithDebInfo"
1975 #else
1976        << "NOT-CONFIG_RelWithDebInfo"
1977 #endif
1978        << delim
1979 #ifdef CONFIG_Release
1980        << "CONFIG_Release"
1981 #else
1982        << "NOT-CONFIG_Release"
1983 #endif
1984        << delim
1985 #ifdef CONFIG_MinSizeRel
1986        << "CONFIG_MinSizeRel"
1987 #else
1988        << "NOT-CONFIG_MinSizeRel"
1989 #endif
1990        << delim
1991 #ifdef PRODUCTION
1992        << "PRODUCTION"
1993 #else
1994        << "NOT-PRODUCTION"
1995 #endif
1996        << delim
1997 #ifdef WITH_CHILD
1998        << "WITH_CHILD"
1999 #else
2000        << "NOT-WITH_CHILD"
2001 #endif
2002        << delim
2003 #ifdef WITH_CUSTOM4
2004        << "WITH_CUSTOM4"
2005 #else
2006        << "NOT-WITH_CUSTOM4"
2007 #endif
2008        << delim
2009 #ifdef PLOG_LOCAL
2010        << "PLOG_LOCAL"
2011 #else
2012        << "NOT-PLOG_LOCAL"
2013 #endif
2014        << delim
2015 #ifdef DEBUG_PIDX
2016        << "DEBUG_PIDX"
2017 #else
2018        << "NOT-DEBUG_PIDX"
2019 #endif
2020        << delim
2021 #ifdef DEBUG_TAG
2022        << "DEBUG_TAG"
2023 #else
2024        << "NOT-DEBUG_TAG"
2025 #endif
2026        << delim
2027 #ifdef RNG_XORWOW
2028        << "RNG_XORWOW"
2029 #else
2030        << "NOT-RNG_XORWOW"
2031 #endif
2032        << delim
2033 #ifdef RNG_PHILOX
2034        << "RNG_PHILOX"
2035 #else
2036        << "NOT-RNG_PHILOX"
2037 #endif
2038        << delim
2039 #ifdef RNG_PHILITEOX
2040        << "RNG_PHILITEOX"
2041 #else
2042        << "NOT-RNG_PHILITEOX"
2043 #endif
2044        << delim
2045        ;
2046     std::string str = ss.str() ;
2047     return str ;
2048 }
2049
2050
2051
2052 std::string QSim::Switches()  // static
2053 {
2054     return Desc(',');
2055 }
2056
2057