Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-09 07:49:10

0001 
0002 #include <csignal>
0003 
0004 #include "SLOG.hh"
0005 
0006 #include "ssys.h"
0007 #include "sstamp.h"
0008 #include "spath.h"
0009 #include "SProf.hh"
0010 
0011 #include "SComp.h"
0012 #include "SEvt.hh"
0013 #include "SSim.hh"
0014 #include "scuda.h"
0015 #include "squad.h"
0016 #include "salloc.h"
0017 #include "SEvent.hh"
0018 #include "SEventConfig.hh"
0019 
0020 //#include "SCSGOptiX.h"
0021 #include "SSimulator.h"
0022 
0023 #include "SGenstep.h"
0024 #include "sslice.h"
0025 
0026 #include "NP.hh"
0027 #include "QUDA_CHECK.h"
0028 #include "QU.hh"
0029 
0030 #include "qrng.h"
0031 #include "qsim.h"
0032 #include "qdebug.h"
0033 
0034 #include "QBase.hh"
0035 #include "QEvt.hh"
0036 #include "QRng.hh"
0037 #include "QTex.hh"
0038 #include "QScint.hh"
0039 #include "QCerenkov.hh"
0040 #include "QBnd.hh"
0041 #include "QProp.hh"
0042 #include "QMultiFilm.hh"
0043 #include "QEvt.hh"
0044 #include "QOptical.hh"
0045 #include "QSimLaunch.hh"
0046 #include "QDebug.hh"
0047 #include "QPMT.hh"
0048 
0049 #include "QSim.hh"
0050 
0051 const plog::Severity QSim::LEVEL = SLOG::EnvLevel("QSim", "DEBUG");
0052 
0053 const bool  QSim::REQUIRE_PMT = ssys::getenvbool(_QSim__REQUIRE_PMT);
0054 const int   QSim::SAVE_IGS_EVENTID = ssys::getenvint(_QSim__SAVE_IGS_EVENTID,-1) ;
0055 const char* QSim::SAVE_IGS_PATH = ssys::getenvvar(_QSim__SAVE_IGS_PATH, "$TMP/.opticks/igs.npy");
0056 const bool  QSim::CONCAT = ssys::getenvbool(_QSim__CONCAT);
0057 const bool  QSim::ALLOC  = ssys::getenvbool(_QSim__ALLOC);
0058 
0059 
0060 
0061 QSim* QSim::INSTANCE = nullptr ;
0062 QSim* QSim::Get(){ return INSTANCE ; }
0063 
0064 QSim* QSim::Create()
0065 {
0066     LOG_IF(fatal, INSTANCE != nullptr) << " a QSim INSTANCE already exists " ;
0067     assert( INSTANCE == nullptr ) ;
0068     return new QSim  ;
0069 }
0070 
0071 
0072 
0073 
0074 /**
0075 QSim::UploadComponents
0076 -----------------------
0077 
0078 Canonical invokation from CSGOptiX::InitSim by CSGOptiX::Create
0079 
0080 As the QBase instanciation done by QSim::UploadComponents
0081 is typically the first connection to the GPU device
0082 by an Opticks process it is prone to CUDA latency of
0083 order 1.5s per GPU if nvidia-persistenced is not running.
0084 Start the daemon to avoid this latency following:
0085 
0086 * https://docs.nvidia.com/deploy/driver-persistence/index.html
0087 
0088 Essentially::
0089 
0090     N[blyth@localhost ~]$ sudo su
0091     N[root@localhost blyth]$ mkdir -p /var/run/nvidia-persistenced
0092     N[root@localhost blyth]$ chown blyth:blyth /var/run/nvidia-persistenced
0093     N[root@localhost blyth]$ which nvidia-persistenced
0094     /bin/nvidia-persistenced
0095     N[root@localhost blyth]$ nvidia-persistenced --user blyth
0096     N[root@localhost blyth]$ ls /var/run/nvidia-persistenced/
0097     nvidia-persistenced.pid  socket
0098 
0099 
0100 QSim::UploadComponents is invoked for example by CSGOptiX/tests/CSGOptiXSimtraceTest.cc
0101 prior to instanciating CSGOptiX
0102 
0103 Uploading components is a once only action for a geometry, encompassing:
0104 
0105 * random states
0106 * scintillation textures
0107 * boundary textures
0108 * property arrays
0109 
0110 It is the simulation physics equivalent of uploading the CSGFoundry geometry.
0111 
0112 The components are managed by separate singleton instances
0113 that subsequent QSim instanciation collects together.
0114 This structure is used to allow separate testing.
0115 
0116 **/
0117 
0118 void QSim::UploadComponents( const SSim* ssim  )
0119 {
0120     LOG(LEVEL) << "[ ssim " << ssim ;
0121     if(getenv("QSim__UploadComponents_SIGINT")) std::raise(SIGINT);
0122 
0123     LOG(LEVEL) << "[ new QBase" ;
0124     QBase* base = new QBase ;
0125     LOG(LEVEL) << "] new QBase : latency here of about 0.3s from first device access, if latency of >1s need to start nvidia-persistenced " ;
0126     LOG(LEVEL) << base->desc();
0127 
0128 
0129     unsigned skipahead_event_offset = SEventConfig::EventSkipahead()  ;
0130     LOG(LEVEL) << "[ new QRng skipahead_event_offset : " << skipahead_event_offset << " " << SEventConfig::kEventSkipahead ;
0131     QRng* rng = new QRng(skipahead_event_offset)  ;  // loads and uploads RNG
0132     LOG(LEVEL) << "] new QRng " << rng->desc()  ;
0133 
0134     LOG(LEVEL) << rng->desc();
0135 
0136     const NP* optical = ssim->get(snam::OPTICAL);
0137     const NP* bnd = ssim->get(snam::BND);
0138 
0139     if( optical == nullptr && bnd == nullptr )
0140     {
0141         LOG(error) << " optical and bnd null  snam::OPTICAL " << snam::OPTICAL << " snam::BND " << snam::BND  ;
0142     }
0143     else
0144     {
0145        // note that QOptical and QBnd are tightly coupled, perhaps add constraints to tie them together
0146         QOptical* qopt = new QOptical(optical);
0147         LOG(LEVEL) << qopt->desc();
0148 
0149         QBnd* qbnd = new QBnd(bnd); // boundary texture with standard domain, used for standard fast property lookup
0150         LOG(LEVEL) << qbnd->desc();
0151     }
0152 
0153     QDebug* debug_ = new QDebug ;
0154     LOG(LEVEL) << debug_->desc() ;
0155 
0156     const NP* propcom = ssim->get(snam::PROPCOM);
0157     if( propcom )
0158     {
0159         LOG(LEVEL) << "[ QProp " ;
0160         QProp<float>* prop = new QProp<float>(propcom) ;
0161         // property interpolation with per-property domains, eg used for Cerenkov RINDEX sampling
0162         LOG(LEVEL) << "] QProp " ;
0163         LOG(LEVEL) << prop->desc();
0164     }
0165     else
0166     {
0167         LOG(LEVEL) << "  propcom null, snam::PROPCOM " <<  snam::PROPCOM ;
0168     }
0169 
0170 
0171     const NP* icdf = ssim->get(snam::ICDF);
0172     if( icdf == nullptr )
0173     {
0174         LOG(error) << " icdf null, snam::ICDF " << snam::ICDF ;
0175     }
0176     else
0177     {
0178         unsigned hd_factor = 20u ;  // 0,10,20
0179         QScint* scint = new QScint( icdf, hd_factor); // custom high-definition inverse CDF for scintillation generation
0180         LOG(LEVEL) << scint->desc();
0181     }
0182 
0183 
0184     // TODO: make this more like the others : acting on the available inputs rather than the mode
0185     bool is_simtrace = SEventConfig::IsRGModeSimtrace() ;
0186     if(is_simtrace == false )
0187     {
0188         QCerenkov* cerenkov = new QCerenkov  ;
0189         LOG(LEVEL) << cerenkov->desc();
0190     }
0191     else
0192     {
0193         LOG(LEVEL) << " skip QCerenkov for simtrace running " ;
0194     }
0195 
0196 
0197 
0198 
0199     const NPFold* spmt_f = ssim->get_spmt_f() ;
0200     QPMT<float>* qpmt = spmt_f ? new QPMT<float>(spmt_f) : nullptr ;
0201 
0202     bool has_PMT = spmt_f != nullptr && qpmt != nullptr ;
0203     bool MISSING_PMT = REQUIRE_PMT == true && has_PMT == false ;
0204 
0205     LOG_IF(fatal, MISSING_PMT )
0206         << " MISSING_PMT "
0207         << " has_PMT " << ( has_PMT ? "YES" : "NO " )
0208         << " REQUIRE_PMT " << ( REQUIRE_PMT ? "YES" : "NO " )
0209         << " MISSING_PMT " << ( MISSING_PMT ? "YES" : "NO " )
0210         << " spmt_f " << ( spmt_f ? "YES" : "NO " )
0211         << " qpmt " << ( qpmt ? "YES" : "NO " )
0212         ;
0213 
0214     assert(MISSING_PMT == false) ;
0215     if(MISSING_PMT)  std::raise(SIGINT);
0216 
0217 
0218 
0219     LOG(LEVEL)
0220         << QPMT<float>::Desc()
0221         << std::endl
0222         << " spmt_f " << ( spmt_f ? "YES" : "NO " )
0223         << " qpmt " << ( qpmt ? "YES" : "NO " )
0224         ;
0225 
0226 
0227 
0228     const NP* multifilm = ssim->get_extra(snam::MULTIFILM);
0229     if(multifilm == nullptr)
0230     {
0231         LOG(LEVEL) << " multifilm null, snam::MULTIFILM " << snam::MULTIFILM ;
0232     }
0233     else
0234     {
0235         QMultiFilm* mul = new QMultiFilm( multifilm );
0236         LOG(LEVEL) << mul->desc();
0237     }
0238     LOG(LEVEL) << "] ssim " << ssim ;
0239 
0240 
0241 
0242 }
0243 
0244 
0245 
0246 
0247 
0248 
0249 /**
0250 QSim:::QSim
0251 -------------
0252 
0253 Canonical instance is instanciated with CSGOptiX::CSGOptiX in sim mode.
0254 Notice how the instanciation pulls together device pointers from
0255 the constituents into the CPU side *sim* and then uploads that to *d_sim*
0256 which is available as *sim* GPU side.
0257 
0258 Prior to instanciating QSim invoke QSim::Init to prepare the
0259 singleton components.
0260 
0261 **/
0262 
0263 QSim::QSim()
0264     :
0265     base(QBase::Get()),
0266     qev(new QEvt),
0267     sev(qev->sev),
0268     rng(QRng::Get()),
0269     scint(QScint::Get()),
0270     cerenkov(QCerenkov::Get()),
0271     bnd(QBnd::Get()),
0272     debug_(QDebug::Get()),
0273     prop(QProp<float>::Get()),
0274     pmt(QPMT<float>::Get()),
0275     multifilm(QMultiFilm::Get()),
0276     sim(nullptr),
0277     d_sim(nullptr),
0278     dbg(debug_ ? debug_->dbg : nullptr),
0279     d_dbg(debug_ ? debug_->d_dbg : nullptr),
0280     cx(nullptr)
0281 {
0282     LOG(LEVEL) << desc() ;
0283     init();
0284 }
0285 
0286 
0287 /**
0288 QSim::init
0289 ------------
0290 
0291 *sim* (qsim.h) is a host side instance that is populated
0292 with device side pointers and handles and then uploaded
0293 to the device *d_sim*.
0294 
0295 Many device pointers and handles are then accessible from
0296 the qsim.h instance at the cost of only a single launch
0297 parameter argument: the d_sim pointer
0298 or with optix launches a single Params member.
0299 
0300 The advantage of this approach is it avoids kernel
0301 launches having very long argument lists and provides a natural
0302 place (qsim.h) to add GPU side functionality.
0303 
0304 **/
0305 
0306 
0307 void QSim::init()
0308 {
0309     sim = new qsim ;
0310     sim->base = base ? base->d_base : nullptr ;
0311     sim->evt = qev ? qev->getDevicePtr() : nullptr ;
0312     //sim->rng_state = rng ? rng->qr->uploaded_states : nullptr ;
0313     sim->rng = rng ? rng->d_qr : nullptr ;
0314 
0315     sim->bnd = bnd ? bnd->d_qb : nullptr ;
0316     sim->multifilm = multifilm ? multifilm->d_multifilm : nullptr ;
0317     sim->cerenkov = cerenkov ? cerenkov->d_cerenkov : nullptr ;
0318     sim->scint = scint ? scint->d_scint : nullptr ;
0319     sim->pmt = pmt ? pmt->d_pmt : nullptr ;
0320 
0321 
0322     bool has_PMT = pmt != nullptr && sim->pmt != nullptr ;
0323     bool REQUIRE_PMT = ssys::getenvbool(_QSim__REQUIRE_PMT);
0324     bool MISSING_PMT = REQUIRE_PMT == true && has_PMT == false ;
0325 
0326     LOG(LEVEL)
0327         << " MISSING_PMT " << ( MISSING_PMT ? "YES" : "NO " )
0328         << " has_PMT " << ( has_PMT ? "YES" : "NO " )
0329         << " QSim::pmt " << ( pmt ? "YES" : "NO " )
0330         << " QSim::pmt->d_pmt " << ( sim->pmt ? "YES" : "NO " )
0331         << " [" << _QSim__REQUIRE_PMT << "] " << ( REQUIRE_PMT ? "YES" : "NO " )
0332         ;
0333 
0334     LOG_IF(fatal, MISSING_PMT )
0335         << " MISSING_PMT ABORT "
0336         << " MISSING_PMT " << ( MISSING_PMT ? "YES" : "NO " )
0337         << " has_PMT " << ( has_PMT ? "YES" : "NO " )
0338         << " QSim::pmt " << ( pmt ? "YES" : "NO " )
0339         << " QSim::pmt->d_pmt " << ( sim->pmt ? "YES" : "NO " )
0340         << " [" << _QSim__REQUIRE_PMT << "] " << ( REQUIRE_PMT ? "YES" : "NO " )
0341         ;
0342 
0343     assert(MISSING_PMT == false) ;
0344     if(MISSING_PMT)  std::raise(SIGINT);
0345 
0346     d_sim = QU::UploadArray<qsim>(sim, 1, "QSim::init.sim" );
0347 
0348     INSTANCE = this ;
0349     LOG(LEVEL) << desc() ;
0350     LOG(LEVEL) << descComponents() ;
0351 }
0352 
0353 /**
0354 QSim::setLauncher
0355 ------------------
0356 
0357 Formerly used SCSGOptiX
0358 
0359 **/
0360 void QSim::setLauncher(SSimulator* cx_ )
0361 {
0362     cx = cx_ ;
0363 }
0364 
0365 
0366 /**
0367 QSim::post_launch
0368 --------------------
0369 
0370 launch is async, so need to sync before can gather ?
0371 HOW ELSE TO DO THIS : LIKELY BIG PERFORMANCE EFFECT
0372 
0373 cudaDeviceSynchronize already done by CSGOptiX::launch via CUDA_SYNC_CHECK see CSG/CUDA_CHECK.h
0374 
0375 void QSim::post_launch()
0376 {
0377     cudaDeviceSynchronize();
0378 }
0379 **/
0380 
0381 
0382 /**
0383 QSim::simulate
0384 ---------------
0385 
0386 This expects gensteps to have been collected into SEvt vectors of quad6
0387 prior to being called.
0388 
0389 Now only one canonical invokations, the higher level G4CXOpticks no longer
0390 directly uses QSim:
0391 
0392 1. CSGOptiX::simulate for pure CSGOptiX level testing such as by ~/o/cxs_min.sh
0393 
0394 
0395 Collected genstep are uploaded and the CSGOptiX kernel is launched to generate and propagate.
0396 
0397 NB the surprising fact that this calls CSGOptiX::simulate_launch (using a protocol),
0398 that seems funny dependency-wise but its needed for genstep preparation prior to
0399 the launch.
0400 
0401 ::
0402 
0403     QSim::simulate
0404 
0405        EGPU.SEvt::beginOfEvent
0406 
0407        determine slices over gensteps depending on VRAM
0408 
0409        for each slice
0410 
0411            QEvt::setGenstepUpload_NP
0412 
0413            SSimulator::simulate_launch
0414 
0415            SEvt::gather into *fold* below *topfold*
0416 
0417        end for each slice
0418 
0419        (NPFold)topfold->concat joining all arrays from the per-launch *fold* into the one *topfold*
0420 
0421        EGPU.SEvt::endOfEvent [only when reset:true, not so in OJ running as need to defer until after hits collected]
0422 
0423 
0424 **/
0425 
0426 bool QSim::KEEP_SUBFOLD = ssys::getenvbool(QSim__simulate_KEEP_SUBFOLD);
0427 
0428 double QSim::simulate(int eventID, bool reset_)
0429 {
0430     SProf::SetTag(eventID, "A%0.3d_" ) ;
0431 
0432     assert( SEventConfig::IsRGModeSimulate() );
0433 
0434     //cudaStream_t stream ;  cudaStreamCreate(&stream);
0435     cudaStream_t stream = 0 ;
0436 
0437 
0438     int64_t tot_ph = 0 ;
0439 
0440     double tot_dt = 0. ;
0441 
0442     int64_t tot_idt = 0 ;
0443     int64_t tot_gdt = 0 ;
0444 
0445     int64_t t_HEAD = SProf::Add("QSim__simulate_HEAD");
0446 
0447     LOG_IF(info, SEvt::LIFECYCLE) << "[ eventID " << eventID ;
0448     if( qev == nullptr ) return -1. ;
0449 
0450 
0451     sev->beginOfEvent(eventID);  // set SEvt index and tees up frame gensteps for simtrace and input photon simulate running
0452 
0453     NP* igs = sev->makeGenstepArrayFromVector();
0454 
0455     MaybeSaveIGS(eventID, igs);
0456 
0457     std::vector<sslice> igs_slice ;
0458     int64_t tot_ph_0 = SGenstep::GetGenstepSlices( igs_slice, igs, SEventConfig::MaxSlot() );
0459 
0460     //bool xxl = tot_ph_0 > SGenstep::MAX_SLOT_PER_SLICE ;
0461     bool xxl = tot_ph_0 > 100*M ;
0462 
0463     int num_slice = igs_slice.size();
0464 
0465     LOG(xxl ? info : LEVEL)
0466         << " eventID " << std::setw(6) << eventID
0467         << " igs " << ( igs ? igs->sstr() : "-" )
0468         << " tot_ph_0 " << tot_ph_0
0469         << " tot_ph_0/M " << tot_ph_0/M
0470         << " xxl " << ( xxl ? "YES" : "NO " )
0471         << " MaxSlot " << SEventConfig::MaxSlot()
0472         << " MaxSlot/M " << SEventConfig::MaxSlot()/M
0473         << " sslice::Desc(igs_slice)\n"
0474         << sslice::Desc(igs_slice)
0475         << " num_slice " << num_slice
0476         ;
0477 
0478 
0479     int64_t t_LBEG = SProf::Add("QSim__simulate_LBEG");
0480 
0481     for(int i=0 ; i < num_slice ; i++)
0482     {
0483         SProf::Add("QSim__simulate_PRUP");
0484 
0485         const sslice& sl = igs_slice[i] ;
0486 
0487         LOG(LEVEL) << sl.idx_desc(i) ;
0488 
0489         int rc = qev->setGenstepUpload_NP(igs, &sl ) ;
0490         LOG_IF(error, rc != 0) << " QEvt::setGenstep ERROR : have qev but no gensteps collected : will skip cx.simulate " ;
0491 
0492         LOG_IF(info, ALLOC)
0493             << " [" << _QSim__ALLOC << "] "
0494             << " i " << std::setw(5) << i
0495             << " SEventConfig::ALLOC " << ( SEventConfig::ALLOC  ? "YES" : "NO " )
0496             << ( SEventConfig::ALLOC ? SEventConfig::ALLOC->desc() : "-" )
0497             ;
0498 
0499 
0500         SProf::Add("QSim__simulate_PREL");
0501 
0502         sev->t_PreLaunch = sstamp::Now() ;
0503 
0504         double dt = rc == 0 && cx != nullptr ? cx->simulate_launch() : -1. ;  //SSimulator protocol
0505 
0506         sev->t_PostLaunch = sstamp::Now() ;
0507         sev->t_Launch = dt ;
0508 
0509         tot_idt += ( sev->t_PostLaunch - sev->t_PreLaunch ) ;
0510         tot_dt += dt ;
0511         tot_ph += sl.ph_count ;
0512 
0513         LOG( xxl ? info : LEVEL )
0514             << " eventID " << eventID
0515             << " xxl " << ( xxl ? "YES" : "NO " )
0516             << " i " << std::setw(4) << i
0517             << " dt " << std::setw(11) << std::fixed << std::setprecision(6) << dt
0518             << " slice " << sl.idx_desc(i)
0519             ;
0520 
0521         int64_t t_POST = SProf::Add("QSim__simulate_POST");
0522 
0523         sev->gather();  // gather into *fold* just added to *topfold*
0524 
0525         int64_t t_DOWN = SProf::Add("QSim__simulate_DOWN");
0526 
0527         tot_gdt += ( t_DOWN - t_POST ) ;
0528     }
0529 
0530 
0531     size_t max_slot_M = SEventConfig::MaxSlot()/M;
0532     std::string anno = SProf::Annotation("slice",num_slice, "max_slot_M", max_slot_M);
0533     int64_t t_LEND = SProf::Add("QSim__simulate_LEND", anno.c_str());
0534 
0535     std::stringstream ss ;
0536     std::ostream* out = CONCAT ? &ss : nullptr ;
0537     int concat_rc = sev->topfold->concat(out);
0538 
0539     LOG_IF(info, CONCAT) << ss.str() ;
0540     LOG_IF(fatal, concat_rc != 0) << " sev->topfold->concat FAILED " ;
0541     assert(concat_rc == 0);
0542 
0543     bool has_hlm = sev->topfold->has_key(SComp::HITLITEMERGED_);
0544     bool has_hm  = sev->topfold->has_key(SComp::HITMERGED_);
0545     bool do_final_merge = num_slice > 1 && ( has_hlm || has_hm ) ;
0546     LOG(LEVEL)
0547          << " num_slice " << num_slice
0548          << " has_hm " << ( has_hm ? "YES" : "NO " )
0549          << " has_hlm " << ( has_hlm ? "YES" : "NO " )
0550          << " do_final_merge " << ( do_final_merge ? "YES" : "NO " )
0551          ;
0552     if(do_final_merge) simulate_final_merge(tot_ph, stream);
0553 
0554 
0555     if(!KEEP_SUBFOLD) sev->topfold->clear_subfold();
0556 
0557     int64_t t_PCAT = SProf::Add("QSim__simulate_PCAT");
0558 
0559     int tot_ht = sev->getNumHit() ;  // NB from fold, so requires hits array gathering to be configured to get non-zero
0560     std::string counts = sev->getCounts();  // collect counts before reset
0561 
0562     LOG_IF(info, SEvt::MINIMAL)
0563         << " eventID " << eventID
0564         << " tot_dt " << std::setw(11) << std::fixed << std::setprecision(6) << tot_dt
0565         << " tot_ph " << std::setw(10) << tot_ph
0566         << " tot_ph/M " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_ph)/float(M)
0567         << " tot_ht " << std::setw(10) << tot_ht
0568         << " tot_ht/M " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_ht)/float(M)
0569         << " tot_ht/tot_ph " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_ht)/float(tot_ph)
0570         << " reset_ " << ( reset_ ? "YES" : "NO " )
0571         ;
0572 
0573 
0574     assert( tot_ph == tot_ph_0 );
0575 
0576     int64_t t_BRES  = SProf::Add("QSim__simulate_BRES", counts.c_str() );
0577     if(reset_) reset(eventID) ;
0578 
0579     int64_t t_TAIL  = SProf::Add("QSim__simulate_TAIL");
0580 
0581     SProf::Write(); // per-event write, so have something in case of crash
0582 
0583     LOG_IF(info, SEvt::MINTIME) << "\n"
0584         << SEvt::SEvt__MINTIME
0585         << "\n"
0586         << " (TAIL - HEAD)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float( t_TAIL - t_HEAD )/M
0587         << " (head to tail of QSim::simulate method) "
0588         << "\n"
0589         << " (LEND - LBEG)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float( t_LEND - t_LBEG )/M
0590         << " (multilaunch loop begin to end) "
0591         << "\n"
0592         << " (PCAT - LEND)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float( t_PCAT - t_LEND )/M
0593         << " (topfold concat and clear subfold) "
0594         << "\n"
0595         << " (TAIL - BRES)/M " << std::setw(10) << std::fixed << std::setprecision(6) << float( t_TAIL - t_BRES )/M
0596         << " (QSim::reset which saves hits) "
0597         << "\n"
0598         << " tot_idt/M       " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_idt)/M
0599         << " (sum of kernel execution int64_t stamp differences in microseconds)"
0600         << "\n"
0601         << " tot_dt          " << std::setw(10) << std::fixed << std::setprecision(6) << tot_dt
0602         << " int(tot_dt*M)   " << std::setw(10) << int64_t(tot_dt*M)
0603         << " (sum of kernel execution double chrono stamp differences in seconds, and scaled to ms) "
0604         << "\n"
0605         << " tot_gdt/M       " << std::setw(10) << std::fixed << std::setprecision(6) << float(tot_gdt)/M
0606         << " (sum of SEvt::gather int64_t stamp differences in microseconds)"
0607         << "\n"
0608         ;
0609 
0610     return tot_dt ;
0611 }
0612 
0613 
0614 /**
0615 QSim::simulate_final_merge
0616 ---------------------------
0617 
0618 * NB this normally only runs for large events with more
0619   photons than fit in VRAM that cause multiple launches
0620   to be done
0621 
0622 * requires topfold to have "hitlitemerged" array,
0623   expected to be the result of simple concatenation
0624   of individual launch "hitlitemerged" arrays
0625 
0626 * does CUDA thrust implemented hitlitemerged final merging
0627 
0628 
0629 TODO: use QEvt::FinalMerge_async once that makes sense
0630 
0631 **/
0632 
0633 void QSim::simulate_final_merge(int64_t tot_ph, cudaStream_t stream)
0634 {
0635     bool has_hlm = sev->topfold->has_key(SComp::HITLITEMERGED_);
0636     bool has_hm  = sev->topfold->has_key(SComp::HITMERGED_);
0637 
0638     if( has_hlm )
0639     {
0640         const NP* hlm = sev->topfold->get(SComp::HITLITEMERGED_);
0641         NP*       fin = QEvt::FinalMerge<sphotonlite>(hlm, stream);
0642 
0643         float     hlm_frac = float(hlm->num_items())/float(tot_ph) ;
0644         float     fin_frac = float(fin->num_items())/float(hlm->num_items()) ;
0645 
0646         std::stringstream ss ;
0647         ss
0648             << " tot_ph " << tot_ph
0649             << " hlm " << ( hlm ? hlm->sstr() : "-" )
0650             << " fin " << ( fin ? fin->sstr() : "-" )
0651             << " hlm/tot " << std::setw(7) << std::fixed << std::setprecision(4) << hlm_frac
0652             << " fin/hlm " << std::setw(7) << std::fixed << std::setprecision(4) << fin_frac
0653             ;
0654 
0655         std::string note = ss.str();
0656         fin->set_meta<std::string>("QSim__simulate_final_merge", note );
0657 
0658         sev->topfold->set(SComp::HITLITEMERGED_, fin );
0659 
0660         LOG(info) << note ;
0661     }
0662     if( has_hm )
0663     {
0664         const NP* hm = sev->topfold->get(SComp::HITMERGED_);
0665         NP*       fi = QEvt::FinalMerge<sphoton>(hm, stream);
0666 
0667         float     hm_frac = float(hm->num_items())/float(tot_ph) ;
0668         float     fi_frac = float(fi->num_items())/float(hm->num_items()) ;
0669 
0670         std::stringstream ss ;
0671         ss
0672             << " tot_ph " << tot_ph
0673             << " hm " << ( hm ? hm->sstr() : "-" )
0674             << " fi " << ( fi ? fi->sstr() : "-" )
0675             << " hm/tot " << std::setw(7) << std::fixed << std::setprecision(4) << hm_frac
0676             << " fi/hm "  << std::setw(7) << std::fixed << std::setprecision(4) << fi_frac
0677             ;
0678 
0679         std::string note = ss.str();
0680         fi->set_meta<std::string>("QSim__simulate_final_merge", note );
0681 
0682         sev->topfold->set(SComp::HITMERGED_, fi );
0683         LOG(info) << note ;
0684     }
0685 }
0686 
0687 
0688 /**
0689 QSim::simulate
0690 ----------------
0691 
0692 High level API intending to be used from CSGOptiXService
0693 
0694 1. access eventID from genstep metadata
0695 2. add the genstep to the EGPU SEvt
0696 3. invoke simulate doing the kernel launch and pullback
0697 4. copy global hits array (current thinking that localization
0698    should happen within the client to avoid ~doubling hits transfer size)
0699 5. reset the SEvt doing dealloc
0700 
0701 Thus is used from language crossing stack::
0702 
0703     async def simulate(request: Request)  [CSGOptiX/tests/CSGOptiXService_FastAPI_test/main.py]
0704     inline nb::ndarray<nb::numpy> _CSGOptiXService::simulate( nb::ndarray<nb::numpy> _gs, int eventID )
0705     inline NP* CSGOptiXService::simulate( NP* gs, int eventID )
0706     NP* CSGOptiX::simulate(const NP* gs, int eventID)
0707 
0708 **/
0709 
0710 
0711 NP* QSim::simulate(const NP* gs, int eventID )
0712 {
0713     bool eventID_expected = eventID > -1;
0714     if(!eventID_expected) std::cerr << "QSim::simulate gs lacks needed eventID metadata [" << eventID << "]\n" ;
0715     assert(eventID_expected);
0716 
0717     assert( sev == SEvt::Get_EGPU() );
0718     sev->addGenstep(gs);
0719 
0720     bool reset_ = false ;
0721     double tot_dt = simulate(eventID, reset_);
0722 
0723     const NP* _ht = sev->getHit();
0724     NP* ht = _ht ? _ht->copy() : nullptr ;  // copy global hits from SEvt before reset
0725     ht->set_meta<double>("QSim__simulate_tot_dt", tot_dt);
0726 
0727     LOG(info)
0728         << " eventID " << std::setw(6) << eventID
0729         << " gs " << ( gs ? gs->sstr() : "-" )
0730         << " ht " << ( ht ? ht->sstr() : "-" )
0731         << " tot_dt " << std::fixed << std::setw(10) << std::setprecision(6) << tot_dt
0732         ;
0733     reset(eventID);
0734 
0735     return ht ;
0736 }
0737 
0738 
0739 
0740 /**
0741 QSim::MaybeSaveIGS
0742 --------------------
0743 
0744 Invoked from QSim::simulate just prior to the launch loop
0745 over genstep slices. To configure saving of the gensteps
0746 use the below envvars::
0747 
0748     export QSim__SAVE_IGS_EVENTID=6                   ## default -1 does not save
0749     export QSim__SAVE_IGS_PATH=$TMP/.opticks/igs.npy  ## default path below $TMP
0750 
0751 Saving gensteps prior to launch is useful for eventID that cause crashes,
0752 as the saved gensteps can then be used with input genstep running::
0753 
0754     export OPTICKS_RUNNING_MODE=SRM_INPUT_GENSTEP
0755 
0756 That enables fast cycle rerunning without Geant4 initalization, see eg::
0757 
0758     TEST=input_genstep_muon cxs_min.sh
0759 
0760 A potential cause of crashes is the heuristic OPTICKS_MAX_SLOT
0761 determined based on available VRAM is too optimistically large.
0762 Try manually reducing slots to see if memory limits are the cause::
0763 
0764     export OPTICKS_MAX_SLOT=M100
0765 
0766 **/
0767 
0768 void QSim::MaybeSaveIGS(int eventID, NP* igs) // static
0769 {
0770     bool igs_null = igs == nullptr ;
0771     const char* igs_path = SAVE_IGS_PATH ? spath::Resolve(SAVE_IGS_PATH) : nullptr ;
0772     bool save_igs = igs && SAVE_IGS_EVENTID == eventID && igs_path ;
0773     LOG(LEVEL)
0774         << " eventID " << eventID
0775         << " igs " << ( igs ? igs->sstr() : "-" )
0776         << " igs_null " << ( igs_null ? "YES" : "NO " )
0777         << " [" << _QSim__SAVE_IGS_EVENTID << "] " <<  SAVE_IGS_EVENTID
0778         << " [" << _QSim__SAVE_IGS_PATH    << "] " << ( SAVE_IGS_PATH ? SAVE_IGS_PATH : "-" )
0779         << " igs_path [" << ( igs_path ? igs_path : "-" ) << "]"
0780         << " save_igs " << ( save_igs ? "YES" : "NO " )
0781         ;
0782 
0783     if(!save_igs) return ;
0784     igs->save(igs_path);
0785 }
0786 
0787 
0788 
0789 /**
0790 QSim::getPhotonSlotOffset
0791 ---------------------------
0792 
0793 The photon slot offset is available
0794 from QEvt after setGenstepUpload_NP
0795 
0796 The first genstep slice yields an offset of zero.
0797 So this only yields non-zero offsets for multi-launch
0798 running,
0799 
0800 
0801 The photon_slot_offset is uploaded to Params on device prior
0802 to simulate launches by CSGOptiX::prepareParamSimulate
0803 
0804 The offset is used at head of CSGOptiX7.cu:simulate  to offset the
0805 launch index.
0806 
0807 Hence the sum of the offset and the
0808 number of photons in the launch must be less than
0809 or equal the number of states uploaded.
0810 
0811 **/
0812 
0813 
0814 
0815 unsigned long long QSim::get_photon_slot_offset() const
0816 {
0817     return qev->get_photon_slot_offset() ;
0818 }
0819 
0820 
0821 /**
0822 QSim::reset
0823 ------------
0824 
0825 When *QSim::simulate* is called with argument *reset:true* the
0826 *QSim::reset* method is called which invokes SEvt::endOfEvent in order
0827 to clean up the SEvt. Normally that would be done after saving
0828 any Opticks configured arrays.
0829 
0830 When *QSim::simulate* is called with argument *reset:false*
0831 the *QSim::reset* method must be called separately in order to avoid a memory leak.
0832 Using *reset:false* is typically done in order to keep arrays alive longer
0833 to enable copying from the gathered arrays into non-Opticks collections.
0834 
0835 **/
0836 void QSim::reset(int eventID)
0837 {
0838     SProf::Add("QSim__reset_HEAD");
0839     qev->clear();
0840     sev->endOfEvent(eventID);
0841     LOG_IF(info, SEvt::LIFECYCLE) << "] eventID " << eventID ;
0842     SProf::Add("QSim__reset_TAIL");
0843 }
0844 
0845 
0846 
0847 /**
0848 QSim::simtrace
0849 ---------------
0850 
0851 Canonically invoked from G4CXOpticks::simtrace
0852 Collected genstep are uploaded and the CSGOptiX kernel is launched to generate and propagate.
0853 
0854 **/
0855 
0856 
0857 double QSim::simtrace(int eventID)
0858 {
0859     assert( SEventConfig::IsRGModeSimtrace() );
0860 
0861 
0862     sev->beginOfEvent(eventID);
0863 
0864     NP* igs = sev->makeGenstepArrayFromVector();
0865 
0866     LOG_IF(fatal, igs==nullptr)
0867          << " igs NULL "
0868          << " sev.descGenstepArrayFromVector " << sev->descGenstepArrayFromVector()
0869          ;
0870 
0871     assert(igs);
0872     int rc = qev->setGenstepUpload_NP(igs) ;
0873 
0874     LOG_IF(error, rc != 0) << " QEvt::setGenstep ERROR : no gensteps collected : will skip cx.simtrace " ;
0875 
0876     sev->t_PreLaunch = sstamp::Now() ;
0877     double dt = rc == 0 && cx != nullptr ? cx->simtrace_launch() : -1. ;
0878     sev->t_PostLaunch = sstamp::Now() ;
0879     sev->t_Launch = dt ;
0880 
0881     // see ~/o/notes/issues/cxt_min_simtrace_revival.rst
0882     sev->gather();
0883 
0884     sev->topfold->concat();
0885     sev->topfold->clear_subfold();
0886 
0887     sev->endOfEvent(eventID);
0888 
0889     return dt ;
0890 }
0891 
0892 
0893 qsim* QSim::getDevicePtr() const
0894 {
0895     return d_sim ;
0896 }
0897 
0898 
0899 char QSim::getScintTexFilterMode() const
0900 {
0901     return scint->tex->getFilterMode() ;
0902 }
0903 
0904 std::string QSim::desc() const
0905 {
0906     std::stringstream ss ;
0907     ss << "QSim::desc"
0908        << std::endl
0909        << " this 0x"            << std::hex << std::uint64_t(this)     << std::dec
0910        << " INSTANCE 0x"        << std::hex << std::uint64_t(INSTANCE) << std::dec
0911        << " QEvt.hh:qev 0x" << std::hex << std::uint64_t(qev)    << std::dec
0912        << " qsim.h:sim 0x"      << std::hex << std::uint64_t(sim)      << std::dec
0913        ;
0914     std::string s = ss.str();
0915     return s ;
0916 }
0917 
0918 std::string QSim::descFull() const
0919 {
0920     std::stringstream ss ;
0921     ss
0922        << std::endl
0923        << "QSim::descFull"
0924        << std::endl
0925        << " this 0x"            << std::hex << std::uint64_t(this)     << std::dec
0926        << " INSTANCE 0x"        << std::hex << std::uint64_t(INSTANCE) << std::dec
0927        << " QEvt.hh:qev 0x" << std::hex << std::uint64_t(qev)    << std::dec
0928        << " qsim.h:sim 0x"      << std::hex << std::uint64_t(sim)      << std::dec
0929        << " qsim.h:d_sim 0x"    << std::hex << std::uint64_t(d_sim)    << std::dec
0930        //<< " sim->rng_state 0x"   << std::hex << std::uint64_t(sim->rng_state) << std::dec  // tending to SEGV on some systems
0931        << " sim->base 0x"       << std::hex << std::uint64_t(sim->base)  << std::dec
0932        << " sim->bnd 0x"        << std::hex << std::uint64_t(sim->bnd)   << std::dec
0933        << " sim->scint 0x"      << std::hex << std::uint64_t(sim->scint) << std::dec
0934        << " sim->cerenkov 0x"   << std::hex << std::uint64_t(sim->cerenkov) << std::dec
0935        ;
0936     std::string s = ss.str();
0937     return s ;
0938 }
0939 
0940 std::string QSim::descComponents() const
0941 {
0942     std::stringstream ss ;
0943     ss << std::endl
0944        << "QSim::descComponents"
0945        << std::endl
0946        << " (QBase)base             " << ( base      ? "YES" : "NO " )  << std::endl
0947        << " (QEvt)qev           " << ( qev     ? "YES" : "NO " )  << std::endl
0948        << " (SEvt)sev               " << ( sev       ? "YES" : "NO " )  << std::endl
0949        << " (QRng)rng               " << ( rng       ? "YES" : "NO " )  << std::endl
0950        << " (QScint)scint           " << ( scint     ? "YES" : "NO " )  << std::endl
0951        << " (QCerenkov)cerenkov     " << ( cerenkov  ? "YES" : "NO " )  << std::endl
0952        << " (QBnd)bnd               " << ( bnd       ? "YES" : "NO " )  << std::endl
0953        << " (QOptical)optical       " << ( optical   ? "YES" : "NO " )  << std::endl
0954        << " (QDebug)debug_          " << ( debug_    ? "YES" : "NO " )  << std::endl
0955        << " (QProp)prop             " << ( prop      ? "YES" : "NO " )  << std::endl
0956        << " (QPMT)pmt               " << ( pmt       ? "YES" : "NO " )  << std::endl
0957        << " (QMultiFilm)multifilm   " << ( multifilm ? "YES" : "NO " )  << std::endl
0958        << " (qsim)sim               " << ( sim       ? "YES" : "NO " )  << std::endl
0959        << " (qsim)d_sim             " << ( d_sim     ? "YES" : "NO " )  << std::endl
0960        << " (qdebug)dbg             " << ( dbg       ? "YES" : "NO " )  << std::endl
0961        << " (qdebug)d_dbg           " << ( d_dbg     ? "YES" : "NO " )  << std::endl
0962        ;
0963     std::string s = ss.str();
0964     return s ;
0965 }
0966 
0967 
0968 
0969 
0970 
0971 void QSim::configureLaunch(unsigned width, unsigned height )
0972 {
0973     QU::ConfigureLaunch(numBlocks, threadsPerBlock, width, height);
0974 }
0975 
0976 void QSim::configureLaunch2D(unsigned width, unsigned height )
0977 {
0978     QU::ConfigureLaunch2D(numBlocks, threadsPerBlock, width, height);
0979 }
0980 
0981 void QSim::configureLaunch16()
0982 {
0983     QU::ConfigureLaunch16(numBlocks, threadsPerBlock);
0984 }
0985 
0986 void QSim::configureLaunch1D(unsigned num, unsigned threads_per_block)
0987 {
0988     QU::ConfigureLaunch1D(numBlocks, threadsPerBlock, num, threads_per_block);
0989 }
0990 
0991 
0992 std::string QSim::descLaunch() const
0993 {
0994     return QU::DescLaunch(numBlocks, threadsPerBlock);
0995 }
0996 
0997 
0998 
0999 
1000 
1001 
1002 
1003 
1004 
1005 /**
1006 QSim::rng_sequence mass production with multiple launches...
1007 --------------------------------------------------------------
1008 
1009 The output files are split too::
1010 
1011     epsilon:opticks blyth$ np.py *.npy
1012     a :                                            TRngBufTest_0.npy :      (10000, 16, 16) : 8f9b27c9416a0121574730baa742b5c9 : 20210715-1227
1013     epsilon:opticks blyth$ du -h TRngBufTest_0.npy
1014      20M    TRngBufTest_0.npy
1015 
1016     In [6]: (16*16*4*2*10000)/1e6
1017     Out[6]: 20.48
1018 
1019 Upping to 1M would be 100x 20M = 2000M  2GB
1020 
1021 * using floats would half storage to 1GB, just promote to double in cks/OpticksRandom::flat
1022   THIS MAKES SENSE AS curand_uniform IS GENERATING float anyhow
1023 * 100k blocks would mean 10*100Mb files for 1M : can do in 10 launches to work on any GPU
1024 
1025 **/
1026 
1027 
1028 template <typename T>
1029 extern void QSim_rng_sequence(  dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, T* seq, unsigned ni, unsigned nj, unsigned id_offset );
1030 
1031 
1032 /**
1033 QSim::rng_sequence generate randoms in single CUDA launch
1034 -------------------------------------------------------------
1035 
1036 This is invoked for each tranche by the below rng_sequence method.
1037 Each tranche launch generates ni_tranche*nv randoms writing them into seq
1038 
1039 ni_tranche : item tranche size
1040 
1041 nv : number randoms to generate for each item
1042 
1043 id_offset : acts on the rng_state array
1044 
1045 skipahead : used curand skipahead offsets depending on sim->evt->index and OPTICKS_EVENT_SKIPAHEAD
1046 
1047 
1048 **/
1049 
1050 template <typename T>
1051 void QSim::rng_sequence( T* seq, unsigned ni_tranche, unsigned nv, unsigned id_offset )
1052 {
1053     configureLaunch(ni_tranche, 1 );
1054 
1055     unsigned num_rng = ni_tranche*nv ;
1056 
1057     const char* label = "QSim::rng_sequence:num_rng" ;
1058 
1059     T* d_seq = QU::device_alloc<T>(num_rng, label );
1060 
1061     QSim_rng_sequence<T>( numBlocks, threadsPerBlock, d_sim, d_seq, ni_tranche, nv, id_offset );
1062 
1063     QU::copy_device_to_host_and_free<T>( seq, d_seq, num_rng, label );
1064 }
1065 
1066 
1067 
1068 const char* QSim::PREFIX = "rng_sequence" ;
1069 
1070 /**
1071 QSim::rng_sequence
1072 ---------------------
1073 
1074 *ni* is the total number of items across all launches that is
1075 split into tranches of *ni_tranche_size* each.
1076 
1077 As separate CPU and GPU memory allocations, launches and output files are made
1078 for each tranche the total generated size may exceed the total available memory of the GPU.
1079 Splitting the output files also eases management by avoiding huge files.
1080 
1081 Default *dir* is $TMP/QSimTest/rng_sequence leading to npy paths like::
1082 
1083     /tmp/blyth/opticks/QSimTest/rng_sequence/rng_sequence_f_ni1000000_nj16_nk16_tranche100000/rng_sequence_f_ni100000_nj16_nk16_ioffset000000.npy
1084 
1085 **/
1086 
1087 template <typename T>
1088 void QSim::rng_sequence( const char* dir, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size )
1089 {
1090     assert( ni >= ni_tranche_size && ni % ni_tranche_size == 0 );   // total size *ni* must be integral multiple of *ni_tranche_size*
1091     unsigned num_tranche = ni/ni_tranche_size ;
1092     unsigned nv = nj*nk ;
1093 
1094     unsigned size = ni_tranche_size*nv ;   // number of randoms to be generated in each launch
1095     std::string reldir = QU::rng_sequence_reldir<T>(PREFIX, ni, nj, nk, ni_tranche_size  ) ;
1096 
1097     LOG(info)
1098         << " ni " << ni
1099         << " ni_tranche_size " << ni_tranche_size
1100         << " num_tranche " << num_tranche
1101         << " reldir " << reldir.c_str()
1102         << " nj " << nj
1103         << " nk " << nk
1104         << " nv(nj*nk) " << nv
1105         << " size(ni_tranche_size*nv) " << size
1106         << " typecode " << QU::typecode<T>()
1107         ;
1108 
1109 
1110     // NB seq array memory gets reused for each launch and saved to different paths
1111     NP* seq = NP::Make<T>(ni_tranche_size, nj, nk) ;
1112     T* seq_values = seq->values<T>();
1113     NP::INT seq_nv = seq->num_values();
1114 
1115 
1116     LOG(info)
1117         << " seq " << ( seq ? seq->sstr() : "-" )
1118         << " seq_values " << seq_values
1119         << " seq_nv " << seq_nv
1120         << " seq_values[0] " << seq_values[0]
1121         << " seq_values[seq_nv-1] " << seq_values[seq_nv-1]
1122         ;
1123 
1124 
1125 
1126     for(unsigned t=0 ; t < num_tranche ; t++)
1127     {
1128         // *id_offset* controls which rng_state/RNG to use
1129         unsigned id_offset = ni_tranche_size*t ;
1130         std::string name = QU::rng_sequence_name<T>(PREFIX, ni_tranche_size, nj, nk, id_offset ) ;
1131 
1132         std::cout
1133             << std::setw(3) << t
1134             << std::setw(10) << id_offset
1135             << std::setw(100) << name.c_str()
1136             << std::endl
1137             ;
1138 
1139         rng_sequence( seq_values, ni_tranche_size, nv, id_offset );
1140 
1141         const char* path = spath::Resolve(dir, reldir.c_str(), name.c_str() );
1142         seq->save(path);
1143     }
1144 }
1145 
1146 
1147 
1148 template void QSim::rng_sequence<float>(  const char* dir, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size );
1149 template void QSim::rng_sequence<double>( const char* dir, unsigned ni, unsigned nj, unsigned nk, unsigned ni_tranche_size );
1150 
1151 
1152 
1153 
1154 
1155 
1156 
1157 
1158 
1159 
1160 /**
1161 QSim::scint_wavelength
1162 ----------------------------------
1163 
1164 Setting envvar QSIM_DISABLE_HD disables multiresolution handling
1165 and causes the returned hd_factor to be zero rather then
1166 the typical values of 10 or 20 which depend on the buffer creation.
1167 
1168 **/
1169 
1170 extern void QSim_scint_wavelength(   dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, float* wavelength, unsigned num_wavelength );
1171 
1172 NP* QSim::scint_wavelength(unsigned num_wavelength, unsigned& hd_factor )
1173 {
1174 
1175     bool qsim_disable_hd = ssys::getenvbool("QSIM_DISABLE_HD");
1176     hd_factor = qsim_disable_hd ? 0u : scint->tex->getHDFactor() ;
1177     // HMM: perhaps get this from sim rather than occupying an argument slot
1178     LOG(LEVEL) << "[" << " qsim_disable_hd " << qsim_disable_hd << " hd_factor " << hd_factor ;
1179 
1180     configureLaunch(num_wavelength, 1 );
1181 
1182     float* d_wavelength = QU::device_alloc<float>(num_wavelength, "QSim::scint_wavelength/num_wavelength");
1183 
1184     QSim_scint_wavelength(numBlocks, threadsPerBlock, d_sim, d_wavelength, num_wavelength );
1185 
1186     NP* w = NP::Make<float>(num_wavelength) ;
1187 
1188     QU::copy_device_to_host_and_free<float>( (float*)w->bytes(), d_wavelength, num_wavelength, "QSim::scint_wavelength" );
1189 
1190     LOG(LEVEL) << "]" ;
1191 
1192     return w ;
1193 }
1194 
1195 
1196 extern void QSim_RandGaussQ_shoot(  dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, float* v, unsigned num_v );
1197 
1198 NP* QSim::RandGaussQ_shoot(unsigned num_v )
1199 {
1200     const char* label = "QSim::RandGaussQ_shoot/num" ;
1201     configureLaunch(num_v, 1 );
1202     std::cout << label << " " << num_v << std::endl;
1203 
1204     float* d_v = QU::device_alloc<float>(num_v, label );
1205 
1206     QSim_RandGaussQ_shoot(numBlocks, threadsPerBlock, d_sim, d_v, num_v );
1207 
1208     cudaDeviceSynchronize();
1209 
1210     NP* v = NP::Make<float>(num_v) ;
1211     QU::copy_device_to_host_and_free<float>( (float*)v->bytes(), d_v, num_v, label );
1212 
1213     return v ;
1214 }
1215 
1216 
1217 
1218 
1219 void QSim::dump_wavelength( float* wavelength, unsigned num_wavelength, unsigned edgeitems )
1220 {
1221     LOG(LEVEL);
1222     for(unsigned i=0 ; i < num_wavelength ; i++)
1223     {
1224         if( i < edgeitems || i > num_wavelength - edgeitems)
1225         {
1226             std::cout
1227                 << std::setw(10) << i
1228                 << std::setw(10) << std::fixed << std::setprecision(3) << wavelength[i]
1229                 << std::endl
1230                 ;
1231         }
1232     }
1233 }
1234 
1235 
1236 extern void QSim_dbg_gs_generate(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, qdebug* dbg, sphoton* photon, unsigned num_photon, unsigned type ) ;
1237 
1238 
1239 NP* QSim::dbg_gs_generate(unsigned num_photon, unsigned type )
1240 {
1241     assert( type == SCINT_GENERATE || type == CERENKOV_GENERATE );
1242 
1243     configureLaunch( num_photon, 1 );
1244     sphoton* d_photon = QU::device_alloc<sphoton>(num_photon, "QSim::dbg_gs_generate:num_photon") ;
1245     QU::device_memset<sphoton>(d_photon, 0, num_photon);
1246 
1247     QSim_dbg_gs_generate(numBlocks, threadsPerBlock, d_sim, d_dbg, d_photon, num_photon, type );
1248 
1249     NP* p = NP::Make<float>(num_photon, 4, 4);
1250     const char* label = "QSim::dbg_gs_generate" ;
1251 
1252     QU::copy_device_to_host_and_free<sphoton>( (sphoton*)p->bytes(), d_photon, num_photon, label );
1253     return p ;
1254 }
1255 
1256 
1257 
1258 extern void QSim_generate_photon(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim )  ;
1259 
1260 /**
1261 QSim::generate_photon
1262 -----------------------
1263 
1264 **/
1265 
1266 
1267 void QSim::generate_photon()
1268 {
1269     LOG(LEVEL) << "[" ;
1270 
1271     unsigned num_photon = qev->getNumPhoton() ;
1272     LOG(info) << " num_photon " << num_photon ;
1273 
1274     LOG_IF(fatal, num_photon == 0 )
1275         << " num_photon zero : MUST QEvt::setGenstep before QSim::generate_photon "
1276         ;
1277 
1278     assert( num_photon > 0 );
1279     assert( d_sim );
1280 
1281     configureLaunch( num_photon, 1 );
1282 
1283     LOG(info) << "QSim_generate_photon... " ;
1284 
1285     QSim_generate_photon(numBlocks, threadsPerBlock, d_sim );
1286 
1287     LOG(LEVEL) << "]" ;
1288 }
1289 
1290 
1291 
1292 
1293 
1294 
1295 extern void QSim_fill_state_0(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, quad6* state, unsigned num_state, qdebug* dbg );
1296 
1297 void QSim::fill_state_0(quad6* state, unsigned num_state)
1298 {
1299     assert( d_sim );
1300     assert( d_dbg );
1301 
1302     quad6* d_state = QU::device_alloc<quad6>(num_state, "QSim::fill_state_0:num_state") ;
1303 
1304 
1305     unsigned threads_per_block = 32 ;
1306     configureLaunch1D( num_state, threads_per_block );
1307 
1308     LOG(info)
1309          << " num_state " << num_state
1310          << " threads_per_block  " << threads_per_block
1311          << " descLaunch " << descLaunch()
1312          ;
1313 
1314     QSim_fill_state_0(numBlocks, threadsPerBlock, d_sim, d_state, num_state, d_dbg  );
1315 
1316     const char* label = "QSim::fill_state_0" ;
1317     QU::copy_device_to_host_and_free<quad6>( state, d_state, num_state, label );
1318 }
1319 
1320 
1321 extern void QSim_fill_state_1(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, sstate* state, unsigned num_state, qdebug* dbg );
1322 
1323 void QSim::fill_state_1(sstate* state, unsigned num_state)
1324 {
1325     assert( d_sim );
1326     assert( d_dbg );
1327 
1328     sstate* d_state = QU::device_alloc<sstate>(num_state, "QSim::fill_state_1:num_state") ;
1329 
1330     unsigned threads_per_block = 64 ;
1331     configureLaunch1D( num_state, threads_per_block );
1332 
1333     LOG(info)
1334          << " num_state " << num_state
1335          << " threads_per_block  " << threads_per_block
1336          << " descLaunch " << descLaunch()
1337          ;
1338 
1339     QSim_fill_state_1(numBlocks, threadsPerBlock, d_sim, d_state, num_state, d_dbg );
1340 
1341     const char* label = "QSim::fill_state_1" ;
1342     QU::copy_device_to_host_and_free<sstate>( state, d_state, num_state, label );
1343 }
1344 
1345 
1346 
1347 
1348 
1349 
1350 
1351 
1352 /**
1353 extern QSim_quad_launch
1354 --------------------------
1355 
1356 This function is implemented in QSim.cu and it used by *quad_launch_generate*
1357 
1358 **/
1359 
1360 extern void QSim_quad_launch(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, quad* q, unsigned num_quad, qdebug* dbg, unsigned type  );
1361 
1362 
1363 
1364 NP* QSim::quad_launch_generate(unsigned num_quad, unsigned type )
1365 {
1366     assert( d_sim );
1367     assert( d_dbg );
1368 
1369     const char* label = "QSim::quad_launch_generate:num_quad" ;
1370 
1371     quad* d_q = QU::device_alloc<quad>(num_quad, label ) ;
1372 
1373     unsigned threads_per_block = 512 ;
1374     configureLaunch1D( num_quad, threads_per_block );
1375 
1376     QSim_quad_launch(numBlocks, threadsPerBlock, d_sim, d_q, num_quad, d_dbg, type );
1377 
1378     NP* q = NP::Make<float>( num_quad, 4 );
1379     quad* qq = (quad*)q->bytes();
1380 
1381     QU::copy_device_to_host_and_free<quad>( qq, d_q, num_quad, label );
1382 
1383     if( type == QGEN_SMEAR_NORMAL_SIGMA_ALPHA || type == QGEN_SMEAR_NORMAL_POLISH )
1384     {
1385         q->set_meta<std::string>("normal", scuda::serialize(dbg->normal) );
1386         q->set_meta<std::string>("direction", scuda::serialize(dbg->direction) );
1387         q->set_meta<float>("value", dbg->value );
1388         q->set_meta<std::string>("valuename", type == QGEN_SMEAR_NORMAL_SIGMA_ALPHA ? "sigma_alpha" : "polish" );
1389     }
1390 
1391     return q ;
1392 }
1393 
1394 
1395 
1396 
1397 /**
1398 extern QSim_photon_launch
1399 --------------------------
1400 
1401 This function is implemented in QSim.cu and it used by BOTH *photon_launch_generate* and *photon_launch_mutate*
1402 
1403 **/
1404 
1405 extern void QSim_photon_launch(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, sphoton* photon, unsigned num_photon, qdebug* dbg, unsigned type  );
1406 
1407 
1408 /**
1409 QSim::photon_launch_generate
1410 ------------------------------
1411 
1412 This allocates a photon array on the device, generates photons into it on device and
1413 then downloads the generated photons into the host array. Contrast with *photon_launch_mutate*.
1414 
1415 **/
1416 
1417 NP* QSim::photon_launch_generate(unsigned num_photon, unsigned type )
1418 {
1419     assert( d_sim );
1420     assert( d_dbg );
1421 
1422     const char* label = "QSim::photon_launch_generate:num_photon" ;
1423 
1424     sphoton* d_photon = QU::device_alloc<sphoton>(num_photon, label ) ;
1425     QU::device_memset<sphoton>(d_photon, 0, num_photon);
1426 
1427     unsigned threads_per_block = 512 ;
1428     configureLaunch1D( num_photon, threads_per_block );
1429 
1430     QSim_photon_launch(numBlocks, threadsPerBlock, d_sim, d_photon, num_photon, d_dbg, type );
1431 
1432     NP* p = NP::Make<float>(num_photon, 4, 4);
1433     sphoton* photon = (sphoton*)p->bytes() ;
1434 
1435     QU::copy_device_to_host_and_free<sphoton>( photon, d_photon, num_photon, label );
1436 
1437     return p ;
1438 }
1439 
1440 
1441 
1442 
1443 /**
1444 QSim::photon_launch_mutate
1445 ---------------------------
1446 
1447 This uploads the photon array provided, mutates it and then downloads the changed array.
1448 
1449 **/
1450 
1451 void QSim::photon_launch_mutate(sphoton* photon, unsigned num_photon, unsigned type )
1452 {
1453     assert( d_sim );
1454     assert( d_dbg );
1455 
1456     const char* label_0 = "QSim::photon_launch_mutate/d_photon" ;
1457     sphoton* d_photon = QU::UploadArray<sphoton>(photon, num_photon, label_0 );
1458 
1459     unsigned DEBUG_NUM_PHOTON = ssys::getenvunsigned(_QSim__photon_launch_mutate_DEBUG_NUM_PHOTON, 0 );
1460     bool DEBUG_NUM_PHOTON_valid = DEBUG_NUM_PHOTON > 0 && DEBUG_NUM_PHOTON <= num_photon ;
1461     unsigned u_num_photon = DEBUG_NUM_PHOTON_valid ? DEBUG_NUM_PHOTON  : num_photon ;
1462     bool SKIP_LAUNCH = ssys::getenvbool(_QSim__photon_launch_mutate_SKIP_LAUNCH) ;
1463 
1464     LOG_IF( error, DEBUG_NUM_PHOTON_valid || true )
1465         << _QSim__photon_launch_mutate_DEBUG_NUM_PHOTON
1466         << " DEBUG_NUM_PHOTON " << DEBUG_NUM_PHOTON
1467         << " num_photon " << num_photon
1468         << " u_num_photon " << u_num_photon
1469         << _QSim__photon_launch_mutate_SKIP_LAUNCH
1470         << " " << ( SKIP_LAUNCH ? "YES" : "NO " )
1471         ;
1472 
1473 
1474     if( SKIP_LAUNCH == false )
1475     {
1476         unsigned threads_per_block = 512 ;
1477         configureLaunch1D( u_num_photon, threads_per_block );
1478         QSim_photon_launch(numBlocks, threadsPerBlock, d_sim, d_photon, u_num_photon, d_dbg, type );
1479     }
1480 
1481 
1482     const char* label_1 = "QSim::photon_launch_mutate" ;
1483     QU::copy_device_to_host_and_free<sphoton>( photon, d_photon, u_num_photon, label_1 );
1484 }
1485 
1486 
1487 
1488 
1489 
1490 /**
1491 QSim::UploadFakePRD (formerly "UploadMockPRD" )
1492 ----------------------------------------------------
1493 
1494 Caution this returns a device pointer.
1495 **/
1496 
1497 quad2* QSim::UploadFakePRD(const NP* ip, const NP* prd) // static
1498 {
1499     assert(ip);
1500     int num_ip = ip->shape[0] ;
1501     assert( num_ip > 0 );
1502 
1503     assert( prd->has_shape( num_ip, -1, 2, 4 ) );    // TODO: evt->max_record checking
1504     assert( prd->shape.size() == 4 && prd->shape[2] == 2 && prd->shape[3] == 4 );
1505     int num_prd = prd->shape[0]*prd->shape[1] ;
1506 
1507     LOG(LEVEL)
1508          << "["
1509          << " num_ip " << num_ip
1510          << " num_prd " << num_prd
1511          << " prd " << prd->sstr()
1512          ;
1513 
1514     const char* label = "QSim::UploadFakePRD/d_prd" ;
1515     quad2* d_prd = QU::UploadArray<quad2>( (quad2*)prd->bytes(), num_prd, label );
1516 
1517     // prd is non-standard so it is appropriate to adhoc upload here
1518 
1519     return d_prd ;
1520 }
1521 
1522 #if !defined(PRODUCTION)
1523 extern void QSim_fake_propagate_launch(dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, quad2* prd );
1524 #endif
1525 
1526 /**
1527 
1528 QSim::fake_propagate (formerly mock_propagate)
1529 -----------------------------------------------
1530 
1531 This is only invoked from QSimTest::fake_propagate
1532 
1533 
1534 Was renamed from "mock" to "fake" as is within Opticks "mock" is
1535 used to mean without using CUDA.
1536 
1537 * number of prd must be a multiple of the number of photon, ratio giving bounce_max
1538 * number of record must be a multiple of the number of photon, ratio giving record_max
1539 * HMM: this is an obtuse way to get bounce_max and record_max
1540 
1541 Aiming to replace the above with a simpler and less duplicitous version by
1542 using common QEvt functionality
1543 
1544 **/
1545 
1546 void QSim::fake_propagate( const NP* prd, unsigned type )
1547 {
1548 #if defined(PRODUCTION)
1549     (void)prd;
1550     (void)type;
1551     LOG(fatal) << "QSim::fake_propagate is disabled in PRODUCTION builds";
1552     std::raise(SIGINT);
1553 #else
1554     const NP* ip = sev->getInputPhoton();
1555     int num_ip = ip ? ip->shape[0] : 0 ;
1556     assert( num_ip > 0 );
1557 
1558     quad2* d_prd = UploadFakePRD(ip, prd) ;
1559 
1560     NP* igs = sev->makeGenstepArrayFromVector();
1561 
1562     int rc = qev->setGenstepUpload_NP(igs);
1563     assert( rc == 0 );
1564     if(rc!=0) std::raise(SIGINT);
1565 
1566     sev->add_array("prd0", prd );
1567     // NB SEvt::beginOfEvent calls SEvt/clear so this addition
1568     // must be after that to succeed in being added to SEvt saved arrays
1569 
1570     int num_photon = qev->getNumPhoton();
1571     bool consistent_num_photon = num_photon == num_ip ;
1572 
1573     LOG_IF(fatal, !consistent_num_photon)
1574          << "["
1575          << " num_ip " << num_ip
1576          << " QEvt::getNumPhoton " << num_photon
1577          << " consistent_num_photon " << ( consistent_num_photon ? "YES" : "NO " )
1578          << " prd " << prd->sstr()
1579          ;
1580     assert(consistent_num_photon);
1581 
1582     assert( qev->upload_count > 0 );
1583 
1584     unsigned threads_per_block = 512 ;
1585     configureLaunch1D( num_photon, threads_per_block );
1586 
1587     QSim_fake_propagate_launch(numBlocks, threadsPerBlock, d_sim, d_prd );
1588 
1589     cudaDeviceSynchronize();
1590 
1591 
1592     LOG(LEVEL) << "]" ;
1593 #endif
1594 }
1595 
1596 
1597 
1598 extern void QSim_boundary_lookup_all(    dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, quad* lookup, unsigned width, unsigned height );
1599 
1600 NP* QSim::boundary_lookup_all(unsigned width, unsigned height )
1601 {
1602     LOG(LEVEL) << "[" ;
1603     assert( bnd );
1604     assert( width <= getBoundaryTexWidth()  );
1605     assert( height <= getBoundaryTexHeight()  );
1606 
1607     unsigned num_lookup = width*height ;
1608     LOG(LEVEL)
1609         << " width " << width
1610         << " height " << height
1611         << " num_lookup " << num_lookup
1612         ;
1613 
1614 
1615     configureLaunch(width, height );
1616 
1617     const char* label = "QSim::boundary_lookup_all:num_lookup" ;
1618 
1619     quad* d_lookup = QU::device_alloc<quad>(num_lookup, label ) ;
1620     QSim_boundary_lookup_all(numBlocks, threadsPerBlock, d_sim, d_lookup, width, height );
1621 
1622     assert( height % 8 == 0 );
1623     unsigned num_bnd = height/8 ;
1624 
1625     NP* l = NP::Make<float>( num_bnd, 4, 2, width, 4 );
1626     QU::copy_device_to_host_and_free<quad>( (quad*)l->bytes(), d_lookup, num_lookup, label );
1627 
1628     LOG(LEVEL) << "]" ;
1629 
1630     return l ;
1631 
1632 }
1633 
1634 extern void QSim_boundary_lookup_line(    dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, quad* lookup, float* domain, unsigned num_lookup, unsigned line, unsigned k );
1635 
1636 
1637 NP* QSim::boundary_lookup_line( float* domain, unsigned num_lookup, unsigned line, unsigned k )
1638 {
1639     LOG(LEVEL)
1640         << "["
1641         << " num_lookup " << num_lookup
1642         << " line " << line
1643         << " k " << k
1644         ;
1645 
1646     configureLaunch(num_lookup, 1  );
1647 
1648     float* d_domain = QU::device_alloc<float>(num_lookup, "QSim::boundary_lookup_line:num_lookup") ;
1649 
1650     QU::copy_host_to_device<float>( d_domain, domain, num_lookup );
1651 
1652     const char* label = "QSim::boundary_lookup_line:num_lookup" ;
1653 
1654     quad* d_lookup = QU::device_alloc<quad>(num_lookup, label ) ;
1655 
1656     QSim_boundary_lookup_line(numBlocks, threadsPerBlock, d_sim, d_lookup, d_domain, num_lookup, line, k );
1657 
1658 
1659     NP* l = NP::Make<float>( num_lookup, 4 );
1660 
1661     QU::copy_device_to_host_and_free<quad>( (quad*)l->bytes(), d_lookup, num_lookup, label  );
1662 
1663     QU::device_free<float>( d_domain );
1664 
1665     LOG(LEVEL) << "]" ;
1666 
1667     return l ;
1668 }
1669 
1670 
1671 
1672 
1673 
1674 /**
1675 QSim::prop_lookup
1676 --------------------
1677 
1678 suspect problem when have fine domain and many pids due to 2d launch config,
1679 BUT when have 1d launch there is no problem to launch millions of threads : hence the
1680 below *prop_lookup_onebyone*
1681 
1682 **/
1683 
1684 
1685 template <typename T>
1686 extern void QSim_prop_lookup( dim3 numBlocks, dim3 threadsPerBlock, qsim* d_sim, T* lookup, const T* domain, unsigned domain_width, unsigned* pids, unsigned num_pids );
1687 
1688 template <typename T>
1689 void QSim::prop_lookup( T* lookup, const T* domain, unsigned domain_width, const std::vector<unsigned>& pids )
1690 {
1691     unsigned num_pids = pids.size() ;
1692     unsigned num_lookup = num_pids*domain_width ;
1693     LOG(LEVEL)
1694         << "["
1695         << " num_pids " << num_pids
1696         << " domain_width " << domain_width
1697         << " num_lookup " << num_lookup
1698         ;
1699 
1700     configureLaunch(domain_width, num_pids  );
1701 
1702     unsigned* d_pids = QU::device_alloc<unsigned>(num_pids, "QSim::prop_lookup:num_pids") ;
1703     T* d_domain = QU::device_alloc<T>(domain_width, "QSim::prop_lookup:domain_width") ;
1704     T* d_lookup = QU::device_alloc<T>(num_lookup  , "QSim::prop_lookup:num_lookup") ;
1705 
1706     QU::copy_host_to_device<T>( d_domain, domain, domain_width );
1707     QU::copy_host_to_device<unsigned>( d_pids, pids.data(), num_pids );
1708 
1709     QSim_prop_lookup(numBlocks, threadsPerBlock, d_sim, d_lookup, d_domain, domain_width, d_pids, num_pids );
1710 
1711     QU::copy_device_to_host_and_free<T>( lookup, d_lookup, num_lookup );
1712     QU::device_free<T>( d_domain );
1713     QU::device_free<unsigned>( d_pids );
1714 
1715     LOG(LEVEL) << "]" ;
1716 }
1717 
1718 
1719 
1720 /**
1721 Hmm doing lookups like this is a very common pattern, could do with
1722 a sub context to carry the pieces to simplify doing that.
1723 **/
1724 
1725 template <typename T>
1726 extern void QSim_prop_lookup_one(
1727     dim3 numBlocks,
1728     dim3 threadsPerBlock,
1729     qsim* sim,
1730     T* lookup,
1731     const T* domain,
1732     unsigned domain_width,
1733     unsigned num_pids,
1734     unsigned pid,
1735     unsigned ipid
1736 );
1737 
1738 /**
1739 QSim::prop_lookup_onebyone
1740 ---------------------------
1741 
1742 The *lookup* output array holds across domain lookups for multiple property ids.
1743 Separate launches are made for each property id, all writing to the same buffer.
1744 
1745 On device uses::
1746 
1747     sim->prop->interpolate
1748 
1749 **/
1750 
1751 template <typename T>
1752 void QSim::prop_lookup_onebyone( T* lookup, const T* domain, unsigned domain_width, const std::vector<unsigned>& pids )
1753 {
1754     unsigned num_pids = pids.size() ;
1755     unsigned num_lookup = num_pids*domain_width ;
1756     LOG(LEVEL)
1757         << "["
1758         << " num_pids " << num_pids
1759         << " domain_width " << domain_width
1760         << " num_lookup " << num_lookup
1761         ;
1762 
1763     configureLaunch(domain_width, 1  );
1764 
1765     T* d_domain = QU::device_alloc<T>(domain_width, "QSim::prop_lookup_onebyone:domain_width") ;
1766     QU::copy_host_to_device<T>( d_domain, domain, domain_width );
1767 
1768     const char* label = "QSim::prop_lookup_onebyone:num_lookup" ;
1769 
1770     T* d_lookup = QU::device_alloc<T>(num_lookup, label ) ;
1771 
1772     // separate launches for each pid
1773     for(unsigned ipid=0 ; ipid < num_pids ; ipid++)
1774     {
1775         unsigned pid = pids[ipid] ;
1776         QSim_prop_lookup_one<T>(numBlocks, threadsPerBlock, d_sim, d_lookup, d_domain, domain_width, num_pids, pid, ipid );
1777     }
1778 
1779     QU::copy_device_to_host_and_free<T>( lookup, d_lookup, num_lookup, label  );
1780 
1781     QU::device_free<T>( d_domain );
1782 
1783     LOG(LEVEL) << "]" ;
1784 }
1785 
1786 
1787 template void QSim::prop_lookup_onebyone( float*, const float* ,   unsigned, const std::vector<unsigned>& );
1788 template void QSim::prop_lookup_onebyone( double*, const double* , unsigned, const std::vector<unsigned>& );
1789 
1790 
1791 
1792 
1793 
1794 
1795 extern void QSim_multifilm_lookup_all(    dim3 numBlocks, dim3 threadsPerBlock, qsim* sim, quad2* sample, quad2* result,  unsigned width, unsigned height );
1796 
1797 void QSim::multifilm_lookup_all( quad2 * sample , quad2 * result ,  unsigned width, unsigned height )
1798 {
1799     LOG(LEVEL) << "[" ;
1800     unsigned num_lookup = width*height ;
1801     unsigned size = num_lookup ;
1802 
1803     LOG(LEVEL)
1804         << " width " << width
1805         << " height " << height
1806         << " num_lookup " << num_lookup
1807         << " size "<<size
1808         ;
1809 
1810     configureLaunch2D(width, height );
1811 
1812     //const float * c_sample = sample;
1813     quad2* d_sample = QU::device_alloc<quad2>(size, "QSim::multifilm_lookup_all:size" ) ;
1814 
1815     const char* label = "QSim::multifilm_lookup_all:size" ;
1816 
1817     quad2* d_result = QU::device_alloc<quad2>(size, label ) ;
1818     LOG(LEVEL)
1819        <<" copy_host_to_device<quad2>( d_sample, sample , size) before";
1820     QU::copy_host_to_device<quad2>( d_sample, sample , size);
1821     LOG(LEVEL)
1822        <<" copy_host_to_device<quad2>( d_sample, sample , size) after";
1823 
1824     QSim_multifilm_lookup_all(numBlocks, threadsPerBlock, d_sim, d_sample, d_result, width, height );
1825     QU::copy_device_to_host_and_free<quad2>( result , d_result , size, label );
1826     QU::device_free<quad2>(d_sample);
1827 
1828     cudaDeviceSynchronize();
1829     LOG(LEVEL) << "]" ;
1830 }
1831 
1832 
1833 
1834 
1835 unsigned QSim::getBoundaryTexWidth() const
1836 {
1837     return bnd->tex->width ;
1838 }
1839 unsigned QSim::getBoundaryTexHeight() const
1840 {
1841     return bnd->tex->height ;
1842 }
1843 const NP* QSim::getBoundaryTexSrc() const
1844 {
1845     return bnd->src ;
1846 }
1847 
1848 void QSim::dump_photon( quad4* photon, unsigned num_photon, const char* opt_, unsigned edgeitems )
1849 {
1850     LOG(LEVEL);
1851 
1852     std::string opt = opt_ ;
1853 
1854     bool f0 = opt.find("f0") != std::string::npos ;
1855     bool f1 = opt.find("f1") != std::string::npos ;
1856     bool f2 = opt.find("f2") != std::string::npos ;
1857     bool f3 = opt.find("f3") != std::string::npos ;
1858 
1859     bool i0 = opt.find("i0") != std::string::npos ;
1860     bool i1 = opt.find("i1") != std::string::npos ;
1861     bool i2 = opt.find("i2") != std::string::npos ;
1862     bool i3 = opt.find("i3") != std::string::npos ;
1863 
1864     int wi = 7 ;
1865     int pr = 2 ;
1866 
1867     for(unsigned i=0 ; i < num_photon ; i++)
1868     {
1869         if( i < edgeitems || i > num_photon - edgeitems)
1870         {
1871             const quad4& p = photon[i] ;
1872 
1873             std::cout
1874                 << std::setw(wi) << i
1875                 ;
1876 
1877             if(f0) std::cout
1878                 << " f0 "
1879                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q0.f.x
1880                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q0.f.y
1881                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q0.f.z
1882                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q0.f.w
1883                 ;
1884 
1885             if(f1) std::cout
1886                 << " f1 "
1887                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q1.f.x
1888                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q1.f.y
1889                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q1.f.z
1890                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q1.f.w
1891                 ;
1892 
1893             if(f2) std::cout
1894                 << " f2 "
1895                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q2.f.x
1896                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q2.f.y
1897                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q2.f.z
1898                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q2.f.w
1899                 ;
1900 
1901             if(f3) std::cout
1902                 << " f3 "
1903                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q3.f.x
1904                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q3.f.y
1905                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q3.f.z
1906                 << std::setw(wi) << std::fixed << std::setprecision(pr) << p.q3.f.w
1907                 ;
1908 
1909             if(i0) std::cout
1910                 << " i0 "
1911                 << std::setw(wi) << p.q0.i.x
1912                 << std::setw(wi) << p.q0.i.y
1913                 << std::setw(wi) << p.q0.i.z
1914                 << std::setw(wi) << p.q0.i.w
1915                 ;
1916 
1917             if(i1) std::cout
1918                 << " i1 "
1919                 << std::setw(wi) << p.q1.i.x
1920                 << std::setw(wi) << p.q1.i.y
1921                 << std::setw(wi) << p.q1.i.z
1922                 << std::setw(wi) << p.q1.i.w
1923                 ;
1924 
1925             if(i2) std::cout
1926                 << " i2 "
1927                 << std::setw(wi) << p.q2.i.x
1928                 << std::setw(wi) << p.q2.i.y
1929                 << std::setw(wi) << p.q2.i.z
1930                 << std::setw(wi) << p.q2.i.w
1931                 ;
1932 
1933             if(i3) std::cout
1934                 << " i3 "
1935                 << std::setw(wi) << p.q3.i.x
1936                 << std::setw(wi) << p.q3.i.y
1937                 << std::setw(wi) << p.q3.i.z
1938                 << std::setw(wi) << p.q3.i.w
1939                 ;
1940 
1941             std::cout
1942                 << std::endl
1943                 ;
1944         }
1945     }
1946 }
1947 
1948 
1949 /**
1950 QSim::Desc
1951 ------------
1952 
1953 Generated with::
1954 
1955    ~/opticks/qudarap/QSim__Desc.sh
1956 
1957 Dump flags with::
1958 
1959    QSimDescTest
1960    ssys_test
1961 
1962 **/
1963 std::string QSim::Desc(char delim)  // static
1964 {
1965     std::stringstream ss ;
1966     ss << ( delim == ',' ? "" : "QSim::Desc\n" )
1967 #ifdef CONFIG_Debug
1968        << "CONFIG_Debug"
1969 #else
1970        << "NOT-CONFIG_Debug"
1971 #endif
1972        << delim
1973 #ifdef CONFIG_RelWithDebInfo
1974        << "CONFIG_RelWithDebInfo"
1975 #else
1976        << "NOT-CONFIG_RelWithDebInfo"
1977 #endif
1978        << delim
1979 #ifdef CONFIG_Release
1980        << "CONFIG_Release"
1981 #else
1982        << "NOT-CONFIG_Release"
1983 #endif
1984        << delim
1985 #ifdef CONFIG_MinSizeRel
1986        << "CONFIG_MinSizeRel"
1987 #else
1988        << "NOT-CONFIG_MinSizeRel"
1989 #endif
1990        << delim
1991 #ifdef PRODUCTION
1992        << "PRODUCTION"
1993 #else
1994        << "NOT-PRODUCTION"
1995 #endif
1996        << delim
1997 #ifdef WITH_CHILD
1998        << "WITH_CHILD"
1999 #else
2000        << "NOT-WITH_CHILD"
2001 #endif
2002        << delim
2003 #ifdef WITH_CUSTOM4
2004        << "WITH_CUSTOM4"
2005 #else
2006        << "NOT-WITH_CUSTOM4"
2007 #endif
2008        << delim
2009 #ifdef PLOG_LOCAL
2010        << "PLOG_LOCAL"
2011 #else
2012        << "NOT-PLOG_LOCAL"
2013 #endif
2014        << delim
2015 #ifdef DEBUG_PIDX
2016        << "DEBUG_PIDX"
2017 #else
2018        << "NOT-DEBUG_PIDX"
2019 #endif
2020        << delim
2021 #ifdef DEBUG_TAG
2022        << "DEBUG_TAG"
2023 #else
2024        << "NOT-DEBUG_TAG"
2025 #endif
2026        << delim
2027 #ifdef RNG_XORWOW
2028        << "RNG_XORWOW"
2029 #else
2030        << "NOT-RNG_XORWOW"
2031 #endif
2032        << delim
2033 #ifdef RNG_PHILOX
2034        << "RNG_PHILOX"
2035 #else
2036        << "NOT-RNG_PHILOX"
2037 #endif
2038        << delim
2039 #ifdef RNG_PHILITEOX
2040        << "RNG_PHILITEOX"
2041 #else
2042        << "NOT-RNG_PHILITEOX"
2043 #endif
2044        << delim
2045        ;
2046     std::string str = ss.str() ;
2047     return str ;
2048 }
2049 
2050 
2051 
2052 std::string QSim::Switches()  // static
2053 {
2054     return Desc(',');
2055 }
2056 
2057