Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-05-15 07:41:50

0001 #pragma once
0002 //
0003 // async_gpu_std.h — Async CPU+GPU optical photon simulation, std-only.
0004 //
0005 // Same double-buffering pattern as examples/async_gpu_launch but without
0006 // G4TaskGroup / G4Mutex.  The GPU worker is a plain std::thread driven by
0007 // std::mutex + std::condition_variable + std::queue, with explicit
0008 // backpressure controlled by GPU_MAX_QUEUE_SIZE.
0009 //
0010 // Modes:
0011 //   --async   : double-buffered async GPU processing (default)
0012 //   --sync    : end-of-run GPU simulation (one batch)
0013 //
0014 
0015 #include <atomic>
0016 #include <chrono>
0017 #include <condition_variable>
0018 #include <cstdlib>
0019 #include <cstring>
0020 #include <filesystem>
0021 #include <fstream>
0022 #include <iostream>
0023 #include <memory>
0024 #include <mutex>
0025 #include <queue>
0026 #include <sstream>
0027 #include <thread>
0028 #include <vector>
0029 
0030 #include "G4AutoLock.hh"
0031 #include "G4BooleanSolid.hh"
0032 #include "G4Cerenkov.hh"
0033 #include "G4Electron.hh"
0034 #include "G4Event.hh"
0035 #include "G4EventManager.hh"
0036 #include "G4GDMLParser.hh"
0037 #include "G4LogicalVolumeStore.hh"
0038 #include "G4OpBoundaryProcess.hh"
0039 #include "G4OpticalPhoton.hh"
0040 #include "G4PhysicalConstants.hh"
0041 #include "G4PrimaryParticle.hh"
0042 #include "G4PrimaryVertex.hh"
0043 #include "G4RunManager.hh"
0044 #include "G4RunManagerFactory.hh"
0045 #include "G4SDManager.hh"
0046 #include "G4Scintillation.hh"
0047 #include "G4SubtractionSolid.hh"
0048 #include "G4SystemOfUnits.hh"
0049 #include "G4ThreeVector.hh"
0050 #include "G4Track.hh"
0051 #include "G4TrackStatus.hh"
0052 #include "G4UserEventAction.hh"
0053 #include "G4UserRunAction.hh"
0054 #include "G4UserSteppingAction.hh"
0055 #include "G4UserTrackingAction.hh"
0056 #include "G4VPhysicalVolume.hh"
0057 #include "G4VProcess.hh"
0058 #include "G4VUserDetectorConstruction.hh"
0059 #include "G4VUserPrimaryGeneratorAction.hh"
0060 
0061 #include "g4cx/G4CXOpticks.hh"
0062 #include "sysrap/NP.hh"
0063 #include "sysrap/OpticksGenstep.h"
0064 #include "sysrap/SEvt.hh"
0065 #include "sysrap/scerenkov.h"
0066 #include "sysrap/sgs.h"
0067 #include "sysrap/spho.h"
0068 #include "sysrap/sphoton.h"
0069 #include "sysrap/sscint.h"
0070 #include "u4/U4.hh"
0071 #include "u4/U4Random.hh"
0072 #include "u4/U4StepPoint.hh"
0073 #include "u4/U4Touchable.h"
0074 #include "u4/U4Track.h"
0075 
0076 namespace
0077 {
0078 G4Mutex                            genstep_mutex = G4MUTEX_INITIALIZER; // sync-mode SEvt collection
0079 std::mutex                         g4hits_mutex;                        // accumulator across worker threads
0080 std::vector<std::array<float, 16>> g4_accumulated_hits;
0081 } // namespace
0082 
0083 // ============================================================================
0084 // Geometry helpers
0085 // ============================================================================
0086 
0087 bool IsSubtractionSolid(G4VSolid* solid)
0088 {
0089     if (!solid)
0090         return false;
0091     if (dynamic_cast<G4SubtractionSolid*>(solid))
0092         return true;
0093     if (G4BooleanSolid* bs = dynamic_cast<G4BooleanSolid*>(solid))
0094     {
0095         if (IsSubtractionSolid(bs->GetConstituentSolid(0)) ||
0096             IsSubtractionSolid(bs->GetConstituentSolid(1)))
0097             return true;
0098     }
0099     return false;
0100 }
0101 
0102 std::string str_tolower(std::string s)
0103 {
0104     std::transform(s.begin(), s.end(), s.begin(),
0105                    [](unsigned char c) { return std::tolower(c); });
0106     return s;
0107 }
0108 
0109 // ============================================================================
0110 // GenstepBuffer — accumulates gensteps for one GPU batch
0111 // ============================================================================
0112 
0113 struct GenstepBuffer
0114 {
0115     std::vector<quad6> gensteps;
0116     std::vector<sgs>   labels;
0117     int64_t            photon_count = 0;
0118     int64_t            genstep_count = 0;
0119     int                event_id = 0;
0120 
0121     void addGenstep(const quad6& gs, int64_t numphotons)
0122     {
0123         sgs label;
0124         label.index = gensteps.size();
0125         label.photons = numphotons;
0126         label.offset = photon_count;
0127         label.gentype = gs.gentype();
0128 
0129         gensteps.push_back(gs);
0130         labels.push_back(label);
0131         photon_count += numphotons;
0132         genstep_count++;
0133     }
0134 
0135     bool empty() const
0136     {
0137         return gensteps.empty();
0138     }
0139     size_t size() const
0140     {
0141         return gensteps.size();
0142     }
0143 };
0144 
0145 struct GPUTask
0146 {
0147     int                            batch_id;
0148     std::shared_ptr<GenstepBuffer> buffer;
0149 
0150     GPUTask() :
0151         batch_id(-1),
0152         buffer(nullptr)
0153     {
0154     }
0155     GPUTask(int bid, std::shared_ptr<GenstepBuffer> buf) :
0156         batch_id(bid),
0157         buffer(std::move(buf))
0158     {
0159     }
0160 };
0161 
0162 // ============================================================================
0163 // GPUTaskManager — std::thread + std::queue worker, no G4Task
0164 //
0165 // CPU thread(s) push completed buffers into task_queue_ via submitBuffer()
0166 // when the photon threshold is reached.  A single worker std::thread pops
0167 // tasks and runs G4CXOpticks::simulate().  The queue has a fixed depth
0168 // (GPU_MAX_QUEUE_SIZE) so producers block when the GPU falls behind.
0169 // ============================================================================
0170 
0171 class GPUTaskManager
0172 {
0173   public:
0174     static constexpr int64_t DEFAULT_PHOTON_THRESHOLD = 10000000; // 10M photons
0175     static constexpr size_t  DEFAULT_MAX_QUEUE_SIZE = 3;
0176 
0177   private:
0178     int64_t photon_threshold_;
0179     size_t  max_queue_size_;
0180 
0181     std::shared_ptr<GenstepBuffer> active_buffer_;
0182     std::mutex                     buffer_mutex_;
0183 
0184     std::queue<GPUTask>     task_queue_;
0185     std::mutex              queue_mutex_;
0186     std::condition_variable queue_not_full_;
0187     std::condition_variable queue_not_empty_;
0188     std::condition_variable queue_drained_;
0189 
0190     std::thread       worker_;
0191     std::atomic<bool> shutdown_{false};
0192     std::atomic<bool> running_{false};
0193 
0194     std::atomic<int>      batch_counter_{0};
0195     std::atomic<int>      completed_batches_{0};
0196     std::atomic<uint64_t> total_hits_{0};
0197     std::atomic<uint64_t> total_photons_{0};
0198     std::atomic<uint64_t> total_gpu_time_us_{0};
0199 
0200   public:
0201     GPUTaskManager(int64_t threshold = DEFAULT_PHOTON_THRESHOLD,
0202                    size_t  max_queue = DEFAULT_MAX_QUEUE_SIZE) :
0203         photon_threshold_(threshold),
0204         max_queue_size_(max_queue),
0205         active_buffer_(std::make_shared<GenstepBuffer>())
0206     {
0207         if (const char* e = std::getenv("GPU_PHOTON_FLUSH_THRESHOLD"))
0208             photon_threshold_ = std::atoll(e);
0209         if (const char* e = std::getenv("GPU_MAX_QUEUE_SIZE"))
0210             max_queue_size_ = std::max(1, std::atoi(e));
0211     }
0212 
0213     ~GPUTaskManager()
0214     {
0215         shutdown();
0216     }
0217 
0218     void start()
0219     {
0220         if (running_.exchange(true))
0221             return;
0222         shutdown_ = false;
0223         worker_ = std::thread(&GPUTaskManager::workerLoop, this);
0224 
0225         G4cout << "GPUTaskManager [std]: started"
0226                << " threshold=" << photon_threshold_
0227                << " max_queue=" << max_queue_size_ << G4endl;
0228     }
0229 
0230     void shutdown()
0231     {
0232         if (!running_.exchange(false))
0233             return;
0234 
0235         // Flush any remaining gensteps in the active buffer
0236         std::shared_ptr<GenstepBuffer> remainder;
0237         {
0238             std::lock_guard<std::mutex> lock(buffer_mutex_);
0239             if (active_buffer_ && !active_buffer_->empty())
0240             {
0241                 remainder = active_buffer_;
0242                 active_buffer_ = std::make_shared<GenstepBuffer>();
0243             }
0244         }
0245         if (remainder)
0246             submitBuffer(remainder);
0247 
0248         {
0249             std::lock_guard<std::mutex> lock(queue_mutex_);
0250             shutdown_ = true;
0251         }
0252         queue_not_empty_.notify_all();
0253         queue_not_full_.notify_all();
0254 
0255         if (worker_.joinable())
0256             worker_.join();
0257 
0258         G4cout << "GPUTaskManager [std]: shutdown"
0259                << " batches=" << completed_batches_.load()
0260                << " photons=" << total_photons_.load()
0261                << " hits=" << total_hits_.load()
0262                << " gpu_time=" << (total_gpu_time_us_.load() / 1e6) << "s"
0263                << G4endl;
0264     }
0265 
0266     // Hot path — invoked from SteppingAction
0267     void addGenstep(const quad6& gs, int64_t numphotons, int eventID)
0268     {
0269         std::shared_ptr<GenstepBuffer> to_submit;
0270         {
0271             std::lock_guard<std::mutex> lock(buffer_mutex_);
0272             active_buffer_->event_id = eventID;
0273             active_buffer_->addGenstep(gs, numphotons);
0274 
0275             if (active_buffer_->photon_count >= photon_threshold_)
0276             {
0277                 to_submit = active_buffer_;
0278                 active_buffer_ = std::make_shared<GenstepBuffer>();
0279             }
0280         }
0281         if (to_submit)
0282             submitBuffer(to_submit);
0283     }
0284 
0285     void flushRemaining(int eventID)
0286     {
0287         std::shared_ptr<GenstepBuffer> to_submit;
0288         {
0289             std::lock_guard<std::mutex> lock(buffer_mutex_);
0290             if (active_buffer_ && !active_buffer_->empty())
0291             {
0292                 active_buffer_->event_id = eventID;
0293                 to_submit = active_buffer_;
0294                 active_buffer_ = std::make_shared<GenstepBuffer>();
0295             }
0296         }
0297         if (to_submit)
0298         {
0299             G4cout << "GPUTaskManager [std]: final flush (" << to_submit->photon_count
0300                    << " photons)" << G4endl;
0301             submitBuffer(to_submit);
0302         }
0303         waitForDrain();
0304     }
0305 
0306     void waitForDrain()
0307     {
0308         std::unique_lock<std::mutex> lock(queue_mutex_);
0309         queue_drained_.wait(lock, [this]() { return task_queue_.empty(); });
0310     }
0311 
0312     int getCompletedBatches() const
0313     {
0314         return completed_batches_.load();
0315     }
0316     uint64_t getTotalHits() const
0317     {
0318         return total_hits_.load();
0319     }
0320     uint64_t getTotalPhotons() const
0321     {
0322         return total_photons_.load();
0323     }
0324     int64_t getThreshold() const
0325     {
0326         return photon_threshold_;
0327     }
0328     double getTotalGPUTime() const
0329     {
0330         return total_gpu_time_us_.load() / 1e6;
0331     }
0332 
0333   private:
0334     void submitBuffer(std::shared_ptr<GenstepBuffer> buffer)
0335     {
0336         if (!buffer || buffer->empty())
0337             return;
0338 
0339         int     batch_id = batch_counter_.fetch_add(1);
0340         GPUTask task(batch_id, buffer);
0341 
0342         {
0343             std::unique_lock<std::mutex> lock(queue_mutex_);
0344             queue_not_full_.wait(lock, [this]() {
0345                 return task_queue_.size() < max_queue_size_ || shutdown_;
0346             });
0347             if (shutdown_)
0348                 return;
0349 
0350             task_queue_.push(std::move(task));
0351             G4cout << "GPUTaskManager [std]: queued batch " << batch_id << " ("
0352                    << buffer->photon_count << " photons, " << buffer->genstep_count
0353                    << " gensteps)"
0354                    << " queue_size=" << task_queue_.size() << G4endl;
0355         }
0356         queue_not_empty_.notify_one();
0357     }
0358 
0359     void workerLoop()
0360     {
0361         G4cout << "GPUTaskManager [std]: worker thread started (tid="
0362                << std::this_thread::get_id() << ")" << G4endl;
0363 
0364         while (true)
0365         {
0366             GPUTask task;
0367             {
0368                 std::unique_lock<std::mutex> lock(queue_mutex_);
0369                 queue_not_empty_.wait(
0370                     lock, [this]() { return !task_queue_.empty() || shutdown_; });
0371                 if (task_queue_.empty() && shutdown_)
0372                     break;
0373 
0374                 task = std::move(task_queue_.front());
0375                 task_queue_.pop();
0376             }
0377             queue_not_full_.notify_one();
0378 
0379             if (task.buffer)
0380                 runBatch(task.batch_id, task.buffer);
0381 
0382             {
0383                 std::lock_guard<std::mutex> lock(queue_mutex_);
0384                 if (task_queue_.empty())
0385                     queue_drained_.notify_all();
0386             }
0387         }
0388         G4cout << "GPUTaskManager [std]: worker thread exiting" << G4endl;
0389     }
0390 
0391     void runBatch(int batch_id, std::shared_ptr<GenstepBuffer> buffer)
0392     {
0393         G4cout << "=== GPU Batch " << batch_id << " ==="
0394                << " photons=" << buffer->photon_count
0395                << " gensteps=" << buffer->genstep_count << G4endl;
0396 
0397         G4CXOpticks* gx = G4CXOpticks::Get();
0398         SEvt*        sev = SEvt::Get_EGPU();
0399         if (!gx || !sev)
0400         {
0401             G4cerr << "GPUTaskManager [std]: G4CXOpticks/SEvt not available"
0402                    << G4endl;
0403             return;
0404         }
0405 
0406         sev->clear_genstep();
0407         NP* gs_array = NP::Make<float>(buffer->gensteps.size(), 6, 4);
0408         std::memcpy(gs_array->values<float>(), buffer->gensteps.data(),
0409                     buffer->gensteps.size() * sizeof(quad6));
0410         sev->addGenstep(gs_array);
0411 
0412         auto t0 = std::chrono::high_resolution_clock::now();
0413         gx->simulate(buffer->event_id, false);
0414         cudaDeviceSynchronize();
0415         auto t1 = std::chrono::high_resolution_clock::now();
0416         auto us =
0417             std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
0418 
0419         unsigned num_hits = sev->GetNumHit(0);
0420         total_gpu_time_us_ += us;
0421         total_hits_ += num_hits;
0422         total_photons_ += buffer->photon_count;
0423 
0424         G4cout << "  gpu_time=" << (us / 1e6) << "s hits=" << num_hits << G4endl;
0425         if (num_hits > 0)
0426             saveHits(batch_id, sev, num_hits);
0427 
0428         gx->reset(buffer->event_id);
0429         completed_batches_++;
0430         G4cout << "=== GPU Batch " << batch_id << " complete ===" << G4endl;
0431     }
0432 
0433     void saveHits(int batch_id, SEvt* sev, unsigned num_hits)
0434     {
0435         std::ostringstream fname;
0436         fname << "gpu_hits_batch_" << batch_id << ".npy";
0437 
0438         NP* arr = NP::Make<float>(num_hits, 4, 4);
0439         for (unsigned idx = 0; idx < num_hits; idx++)
0440         {
0441             sphoton hit;
0442             sev->getHit(hit, idx);
0443             std::memcpy(arr->bytes() + idx * sizeof(sphoton), &hit, sizeof(sphoton));
0444         }
0445         arr->save(fname.str().c_str());
0446         G4cout << "  saved " << num_hits << " hits to " << fname.str() << G4endl;
0447     }
0448 };
0449 
0450 // ============================================================================
0451 // Genstep construction (bypass U4 / SEvt for async path)
0452 // ============================================================================
0453 
0454 static quad6 MakeGenstep_Cerenkov(const G4Track* aTrack, const G4Step* aStep,
0455                                   G4int numPhotons, G4double betaInverse,
0456                                   G4double pmin, G4double pmax, G4double maxCos,
0457                                   G4double maxSin2,
0458                                   G4double meanNumberOfPhotons1,
0459                                   G4double meanNumberOfPhotons2)
0460 {
0461     G4StepPoint*             pre = aStep->GetPreStepPoint();
0462     G4StepPoint*             post = aStep->GetPostStepPoint();
0463     G4ThreeVector            x0 = pre->GetPosition();
0464     G4double                 t0 = pre->GetGlobalTime();
0465     G4ThreeVector            dx = aStep->GetDeltaPosition();
0466     const G4DynamicParticle* dp = aTrack->GetDynamicParticle();
0467     const G4Material*        mat = aTrack->GetMaterial();
0468 
0469     G4double Wmin_nm = h_Planck * c_light / pmax / nm;
0470     G4double Wmax_nm = h_Planck * c_light / pmin / nm;
0471 
0472     quad6 gs;
0473     gs.zero();
0474     scerenkov* ck = (scerenkov*)(&gs);
0475 
0476     ck->gentype = OpticksGenstep_G4Cerenkov_modified;
0477     ck->trackid = aTrack->GetTrackID();
0478     ck->matline = mat->GetIndex() + SEvt::G4_INDEX_OFFSET;
0479     ck->numphoton = numPhotons;
0480 
0481     ck->pos.x = x0.x();
0482     ck->pos.y = x0.y();
0483     ck->pos.z = x0.z();
0484     ck->time = t0;
0485 
0486     ck->DeltaPosition.x = dx.x();
0487     ck->DeltaPosition.y = dx.y();
0488     ck->DeltaPosition.z = dx.z();
0489     ck->step_length = aStep->GetStepLength();
0490 
0491     ck->code = dp->GetDefinition()->GetPDGEncoding();
0492     ck->charge = dp->GetDefinition()->GetPDGCharge();
0493     ck->weight = aTrack->GetWeight();
0494     ck->preVelocity = pre->GetVelocity();
0495 
0496     ck->BetaInverse = betaInverse;
0497     ck->Wmin = Wmin_nm;
0498     ck->Wmax = Wmax_nm;
0499     ck->maxCos = maxCos;
0500 
0501     ck->maxSin2 = maxSin2;
0502     ck->MeanNumberOfPhotons1 = meanNumberOfPhotons1;
0503     ck->MeanNumberOfPhotons2 = meanNumberOfPhotons2;
0504     ck->postVelocity = post->GetVelocity();
0505     return gs;
0506 }
0507 
0508 static quad6 MakeGenstep_Scintillation(const G4Track* aTrack,
0509                                        const G4Step* aStep, G4int numPhotons,
0510                                        G4int scnt, G4double ScintillationTime)
0511 {
0512     G4StepPoint*             pre = aStep->GetPreStepPoint();
0513     G4StepPoint*             post = aStep->GetPostStepPoint();
0514     G4ThreeVector            x0 = pre->GetPosition();
0515     G4double                 t0 = pre->GetGlobalTime();
0516     G4ThreeVector            dx = aStep->GetDeltaPosition();
0517     G4double                 meanV = (pre->GetVelocity() + post->GetVelocity()) * 0.5;
0518     const G4DynamicParticle* dp = aTrack->GetDynamicParticle();
0519     const G4Material*        mat = aTrack->GetMaterial();
0520 
0521     quad6 gs;
0522     gs.zero();
0523     sscint* sc = (sscint*)(&gs);
0524 
0525     sc->gentype = OpticksGenstep_DsG4Scintillation_r4695;
0526     sc->trackid = aTrack->GetTrackID();
0527     sc->matline = mat->GetIndex() + SEvt::G4_INDEX_OFFSET;
0528     sc->numphoton = numPhotons;
0529 
0530     sc->pos.x = x0.x();
0531     sc->pos.y = x0.y();
0532     sc->pos.z = x0.z();
0533     sc->time = t0;
0534 
0535     sc->DeltaPosition.x = dx.x();
0536     sc->DeltaPosition.y = dx.y();
0537     sc->DeltaPosition.z = dx.z();
0538     sc->step_length = aStep->GetStepLength();
0539 
0540     sc->code = dp->GetDefinition()->GetPDGEncoding();
0541     sc->charge = dp->GetDefinition()->GetPDGCharge();
0542     sc->weight = aTrack->GetWeight();
0543     sc->meanVelocity = meanV;
0544 
0545     sc->scnt = scnt;
0546     sc->ScintillationTime = ScintillationTime;
0547     return gs;
0548 }
0549 
0550 // ============================================================================
0551 // Sensitive detector and hits
0552 // ============================================================================
0553 
0554 struct PhotonHit : public G4VHit
0555 {
0556     PhotonHit() = default;
0557     PhotonHit(unsigned id, G4double energy, G4double time, G4ThreeVector position,
0558               G4ThreeVector direction, G4ThreeVector polarization) :
0559         fid(id),
0560         fenergy(energy),
0561         ftime(time),
0562         fposition(position),
0563         fdirection(direction),
0564         fpolarization(polarization)
0565     {
0566     }
0567 
0568     unsigned      fid{0};
0569     G4double      fenergy{0};
0570     G4double      ftime{0};
0571     G4ThreeVector fposition;
0572     G4ThreeVector fdirection;
0573     G4ThreeVector fpolarization;
0574 };
0575 
0576 using PhotonHitsCollection = G4THitsCollection<PhotonHit>;
0577 
0578 struct PhotonSD : public G4VSensitiveDetector
0579 {
0580     PhotonHitsCollection* fPhotonHitsCollection{nullptr};
0581     G4int                 fHCID{-1};
0582     G4int                 fTotalG4Hits{0};
0583 
0584     PhotonSD(const G4String& name) :
0585         G4VSensitiveDetector(name)
0586     {
0587         collectionName.insert("photon_hits");
0588     }
0589 
0590     void Initialize(G4HCofThisEvent* hce) override
0591     {
0592         fPhotonHitsCollection =
0593             new PhotonHitsCollection(SensitiveDetectorName, collectionName[0]);
0594         if (fHCID < 0)
0595             fHCID = G4SDManager::GetSDMpointer()->GetCollectionID(collectionName[0]);
0596         hce->AddHitsCollection(fHCID, fPhotonHitsCollection);
0597     }
0598 
0599     G4bool ProcessHits(G4Step* aStep, G4TouchableHistory*) override
0600     {
0601         G4Track* track = aStep->GetTrack();
0602         if (track->GetDefinition() != G4OpticalPhoton::OpticalPhotonDefinition())
0603             return false;
0604 
0605         G4double   energy_eV = track->GetTotalEnergy() / CLHEP::eV;
0606         PhotonHit* hit =
0607             new PhotonHit(0, energy_eV, track->GetGlobalTime(),
0608                           aStep->GetPostStepPoint()->GetPosition(),
0609                           aStep->GetPostStepPoint()->GetMomentumDirection(),
0610                           aStep->GetPostStepPoint()->GetPolarization());
0611         fPhotonHitsCollection->insert(hit);
0612         track->SetTrackStatus(fStopAndKill);
0613         return true;
0614     }
0615 
0616     void EndOfEvent(G4HCofThisEvent*) override
0617     {
0618         G4int n = fPhotonHitsCollection->entries();
0619         if (n <= 0)
0620             return;
0621 
0622         std::lock_guard<std::mutex> lock(g4hits_mutex);
0623         for (G4int i = 0; i < n; i++)
0624         {
0625             PhotonHit* h = (*fPhotonHitsCollection)[i];
0626             float      wl =
0627                 (h->fenergy > 0) ? static_cast<float>(1239.84198 / h->fenergy) : 0.f;
0628             g4_accumulated_hits.push_back(
0629                 {float(h->fposition.x()), float(h->fposition.y()),
0630                  float(h->fposition.z()), float(h->ftime), float(h->fdirection.x()),
0631                  float(h->fdirection.y()), float(h->fdirection.z()), 0.f,
0632                  float(h->fpolarization.x()), float(h->fpolarization.y()),
0633                  float(h->fpolarization.z()), wl, 0.f, 0.f, 0.f, float(h->fid)});
0634         }
0635         fTotalG4Hits += n;
0636     }
0637 
0638     G4int GetTotalG4Hits() const
0639     {
0640         return fTotalG4Hits;
0641     }
0642 };
0643 
0644 // ============================================================================
0645 // Detector construction
0646 // ============================================================================
0647 
0648 struct DetectorConstruction : G4VUserDetectorConstruction
0649 {
0650     std::filesystem::path gdml_file_;
0651     G4GDMLParser          parser_;
0652 
0653     DetectorConstruction(std::filesystem::path gdml_file) :
0654         gdml_file_(gdml_file)
0655     {
0656     }
0657 
0658     G4VPhysicalVolume* Construct() override
0659     {
0660         parser_.Read(gdml_file_.string(), false);
0661         G4VPhysicalVolume* world = parser_.GetWorldVolume();
0662         G4CXOpticks::SetGeometry(world);
0663 
0664         G4LogicalVolumeStore*  lvStore = G4LogicalVolumeStore::GetInstance();
0665         static G4VisAttributes invisibleVisAttr(false);
0666         if (lvStore && !lvStore->empty())
0667         {
0668             for (auto lv : *lvStore)
0669             {
0670                 G4String lname = str_tolower(lv->GetName());
0671                 if (lname.find("detect") != std::string::npos ||
0672                     lname.find("sipm") != std::string::npos ||
0673                     lname.find("sensor") != std::string::npos ||
0674                     lname.find("pmt") != std::string::npos ||
0675                     lname.find("arapuca") != std::string::npos)
0676                 {
0677                     G4String sdName = "PhotonDetector_" + lv->GetName();
0678                     if (!G4SDManager::GetSDMpointer()->FindSensitiveDetector(sdName,
0679                                                                              false))
0680                     {
0681                         PhotonSD* sd = new PhotonSD(sdName);
0682                         G4SDManager::GetSDMpointer()->AddNewDetector(sd);
0683                         lv->SetSensitiveDetector(sd);
0684                     }
0685                 }
0686                 G4VSolid* solid = lv->GetSolid();
0687                 if (solid && IsSubtractionSolid(solid))
0688                     lv->SetVisAttributes(&invisibleVisAttr);
0689             }
0690         }
0691         return world;
0692     }
0693 };
0694 
0695 // ============================================================================
0696 // Primary generator
0697 // ============================================================================
0698 
0699 struct PrimaryGenerator : G4VUserPrimaryGeneratorAction
0700 {
0701     SEvt* sev;
0702     PrimaryGenerator(SEvt* sev) :
0703         sev(sev)
0704     {
0705     }
0706 
0707     void GeneratePrimaries(G4Event* event) override
0708     {
0709         G4ThreeVector pos(0.0 * m, 0.0 * m, 0.0 * m);
0710         G4double      time_ns = 0;
0711         G4ThreeVector dir(0, 0.2, 0.8);
0712 
0713         G4PrimaryParticle* p = new G4PrimaryParticle(G4Electron::Definition());
0714         p->SetKineticEnergy(10 * MeV);
0715         p->SetMomentumDirection(dir);
0716 
0717         G4PrimaryVertex* v = new G4PrimaryVertex(pos, time_ns);
0718         v->SetPrimary(p);
0719         event->AddPrimaryVertex(v);
0720     }
0721 };
0722 
0723 // ============================================================================
0724 // Event action — count G4 hits across all collections
0725 // ============================================================================
0726 
0727 struct EventAction : G4UserEventAction
0728 {
0729     SEvt* sev;
0730     G4int fTotalG4Hits{0};
0731 
0732     EventAction(SEvt* sev) :
0733         sev(sev)
0734     {
0735     }
0736 
0737     void EndOfEventAction(const G4Event* event) override
0738     {
0739         G4HCofThisEvent* hce = event->GetHCofThisEvent();
0740         if (!hce)
0741             return;
0742         G4int n = hce->GetNumberOfCollections();
0743         for (G4int i = 0; i < n; i++)
0744         {
0745             G4VHitsCollection* hc = hce->GetHC(i);
0746             if (!hc)
0747                 continue;
0748             if (auto* phc = dynamic_cast<PhotonHitsCollection*>(hc))
0749                 fTotalG4Hits += phc->entries();
0750             else
0751                 fTotalG4Hits += hc->GetSize();
0752         }
0753     }
0754 
0755     G4int GetTotalG4Hits() const
0756     {
0757         return fTotalG4Hits;
0758     }
0759 };
0760 
0761 // ============================================================================
0762 // Run action — drives the GPU lifecycle (sync or async)
0763 // ============================================================================
0764 
0765 struct RunAction : G4UserRunAction
0766 {
0767     EventAction*    fEventAction;
0768     GPUTaskManager* fGPUTaskMgr{nullptr};
0769 
0770     RunAction(EventAction* ea, GPUTaskManager* mgr = nullptr) :
0771         fEventAction(ea),
0772         fGPUTaskMgr(mgr)
0773     {
0774     }
0775 
0776     void BeginOfRunAction(const G4Run*) override
0777     {
0778         if (G4Threading::IsMasterThread() && fGPUTaskMgr)
0779             fGPUTaskMgr->start();
0780     }
0781 
0782     void EndOfRunAction(const G4Run*) override
0783     {
0784         if (!G4Threading::IsMasterThread())
0785             return;
0786 
0787         if (fGPUTaskMgr)
0788         {
0789             fGPUTaskMgr->flushRemaining(0);
0790 
0791             G4cout << "\n=== Async GPU Summary (std) ===" << G4endl;
0792             G4cout << "Batches processed: " << fGPUTaskMgr->getCompletedBatches()
0793                    << G4endl;
0794             G4cout << "Total GPU photons: " << fGPUTaskMgr->getTotalPhotons()
0795                    << G4endl;
0796             G4cout << "Total GPU hits:    " << fGPUTaskMgr->getTotalHits() << G4endl;
0797             G4cout << "Total GPU time:    " << fGPUTaskMgr->getTotalGPUTime() << " s"
0798                    << G4endl;
0799             G4cout << "G4 hits:           " << fEventAction->GetTotalG4Hits()
0800                    << G4endl;
0801         }
0802         else
0803         {
0804             G4CXOpticks* gx = G4CXOpticks::Get();
0805             auto         t0 = std::chrono::high_resolution_clock::now();
0806             gx->simulate(0, false);
0807             cudaDeviceSynchronize();
0808             auto                          t1 = std::chrono::high_resolution_clock::now();
0809             std::chrono::duration<double> elapsed = t1 - t0;
0810 
0811             SEvt*    sev = SEvt::Get_EGPU();
0812             unsigned num_hits = sev->GetNumHit(0);
0813 
0814             G4cout << "\n=== Sync GPU Summary ===" << G4endl;
0815             G4cout << "GPU sim time:      " << elapsed.count() << " s" << G4endl;
0816             G4cout << "Gensteps:          " << sev->GetNumGenstepFromGenstep(0)
0817                    << G4endl;
0818             G4cout << "Photons collected: " << sev->GetNumPhotonCollected(0)
0819                    << G4endl;
0820             G4cout << "GPU hits:          " << num_hits << G4endl;
0821             G4cout << "G4 hits:           " << fEventAction->GetTotalG4Hits()
0822                    << G4endl;
0823 
0824             if (num_hits > 0)
0825             {
0826                 NP* gpu_h = NP::Make<float>(num_hits, 4, 4);
0827                 for (unsigned idx = 0; idx < num_hits; idx++)
0828                 {
0829                     sphoton hit;
0830                     sev->getHit(hit, idx);
0831                     std::memcpy(gpu_h->bytes() + idx * sizeof(sphoton), &hit,
0832                                 sizeof(sphoton));
0833                 }
0834                 gpu_h->save("gpu_hits.npy");
0835                 G4cout << "Saved GPU hits to gpu_hits.npy" << G4endl;
0836             }
0837         }
0838 
0839         std::lock_guard<std::mutex> lock(g4hits_mutex);
0840         size_t                      ng4 = g4_accumulated_hits.size();
0841         if (ng4 > 0)
0842         {
0843             NP* g4h = NP::Make<float>(ng4, 4, 4);
0844             std::memcpy(g4h->bytes(), g4_accumulated_hits.data(),
0845                         ng4 * 16 * sizeof(float));
0846             g4h->save("g4_hits.npy");
0847             G4cout << "Saved G4 hits (" << ng4 << ") to g4_hits.npy" << G4endl;
0848         }
0849     }
0850 };
0851 
0852 // ============================================================================
0853 // Stepping action — collect Cerenkov / scintillation gensteps
0854 // ============================================================================
0855 
0856 struct SteppingAction : G4UserSteppingAction
0857 {
0858     SEvt*           sev;
0859     GPUTaskManager* fGPUTaskMgr{nullptr};
0860 
0861     SteppingAction(SEvt* sev, GPUTaskManager* mgr = nullptr) :
0862         sev(sev),
0863         fGPUTaskMgr(mgr)
0864     {
0865     }
0866 
0867     void UserSteppingAction(const G4Step* aStep) override
0868     {
0869         // Cap optical photon step count so reflection loops can't run forever
0870         if (aStep->GetTrack()->GetDefinition() ==
0871                 G4OpticalPhoton::OpticalPhotonDefinition() &&
0872             aStep->GetTrack()->GetCurrentStepNumber() > 10000)
0873             aStep->GetTrack()->SetTrackStatus(fStopAndKill);
0874 
0875         G4SteppingManager* sm = G4EventManager::GetEventManager()
0876                                     ->GetTrackingManager()
0877                                     ->GetSteppingManager();
0878         if (sm->GetfStepStatus() == fAtRestDoItProc)
0879             return;
0880 
0881         G4ProcessVector* procPost = sm->GetfPostStepDoItVector();
0882         size_t           MAXofPostStepLoops = sm->GetMAXofPostStepLoops();
0883 
0884         for (size_t i = 0; i < MAXofPostStepLoops; i++)
0885         {
0886             const G4String& pname = (*procPost)[i]->GetProcessName();
0887 
0888             if (pname == "Cerenkov")
0889             {
0890                 G4Track*                   track = aStep->GetTrack();
0891                 const G4DynamicParticle*   dp = track->GetDynamicParticle();
0892                 G4double                   charge = dp->GetDefinition()->GetPDGCharge();
0893                 const G4Material*          mat = track->GetMaterial();
0894                 G4MaterialPropertiesTable* MPT = mat->GetMaterialPropertiesTable();
0895                 G4MaterialPropertyVector*  Rindex =
0896                     MPT ? MPT->GetProperty(kRINDEX) : nullptr;
0897                 if (!Rindex || Rindex->GetVectorLength() == 0)
0898                     return;
0899 
0900                 G4Cerenkov* proc = (G4Cerenkov*)(*procPost)[i];
0901                 G4int       numPhotons = proc->GetNumPhotons();
0902                 if (numPhotons <= 0)
0903                     continue;
0904 
0905                 G4double Pmin = Rindex->Energy(0);
0906                 G4double Pmax = Rindex->GetMaxEnergy();
0907                 G4double nMax = Rindex->GetMaxValue();
0908                 G4double beta1 = aStep->GetPreStepPoint()->GetBeta();
0909                 G4double beta2 = aStep->GetPostStepPoint()->GetBeta();
0910                 G4double beta = (beta1 + beta2) * 0.5;
0911                 G4double BetaInverse = 1. / beta;
0912                 G4double maxCos = BetaInverse / nMax;
0913                 G4double maxSin2 = (1.0 - maxCos) * (1.0 + maxCos);
0914                 G4double mean1 =
0915                     proc->GetAverageNumberOfPhotons(charge, beta1, mat, Rindex);
0916                 G4double mean2 =
0917                     proc->GetAverageNumberOfPhotons(charge, beta2, mat, Rindex);
0918 
0919                 if (fGPUTaskMgr)
0920                 {
0921                     const G4Event* ev =
0922                         G4EventManager::GetEventManager()->GetConstCurrentEvent();
0923                     if (!ev)
0924                         return;
0925                     quad6 gs =
0926                         MakeGenstep_Cerenkov(track, aStep, numPhotons, BetaInverse, Pmin,
0927                                              Pmax, maxCos, maxSin2, mean1, mean2);
0928                     fGPUTaskMgr->addGenstep(gs, numPhotons, ev->GetEventID());
0929                 }
0930                 else
0931                 {
0932                     G4AutoLock lock(&genstep_mutex);
0933                     U4::CollectGenstep_G4Cerenkov_modified(track, aStep, numPhotons,
0934                                                            BetaInverse, Pmin, Pmax,
0935                                                            maxCos, maxSin2, mean1, mean2);
0936                 }
0937             }
0938             else if (pname == "Scintillation")
0939             {
0940                 G4Scintillation* proc = (G4Scintillation*)(*procPost)[i];
0941                 G4int            numPhotons = proc->GetNumPhotons();
0942                 if (numPhotons <= 0)
0943                     continue;
0944 
0945                 G4Track*                   track = aStep->GetTrack();
0946                 const G4Material*          mat = track->GetMaterial();
0947                 G4MaterialPropertiesTable* MPT = mat->GetMaterialPropertiesTable();
0948                 if (!MPT || !MPT->ConstPropertyExists(kSCINTILLATIONTIMECONSTANT1))
0949                     return;
0950 
0951                 const G4int tcKeys[3] = {kSCINTILLATIONTIMECONSTANT1,
0952                                          kSCINTILLATIONTIMECONSTANT2,
0953                                          kSCINTILLATIONTIMECONSTANT3};
0954                 const G4int yieldKeys[3] = {kSCINTILLATIONYIELD1, kSCINTILLATIONYIELD2,
0955                                             kSCINTILLATIONYIELD3};
0956 
0957                 G4double tc[3] = {0, 0, 0};
0958                 G4double yield[3] = {0, 0, 0};
0959                 G4double yieldSum = 0;
0960                 G4int    nComp = 0;
0961                 for (G4int c = 0; c < 3; c++)
0962                 {
0963                     if (MPT->ConstPropertyExists(tcKeys[c]))
0964                     {
0965                         tc[c] = MPT->GetConstProperty(tcKeys[c]);
0966                         yield[c] = MPT->ConstPropertyExists(yieldKeys[c])
0967                                        ? MPT->GetConstProperty(yieldKeys[c])
0968                                        : (c == 0 ? 1.0 : 0.0);
0969                         yieldSum += yield[c];
0970                         nComp = c + 1;
0971                     }
0972                 }
0973 
0974                 if (fGPUTaskMgr)
0975                 {
0976                     const G4Event* ev =
0977                         G4EventManager::GetEventManager()->GetConstCurrentEvent();
0978                     if (!ev)
0979                         return;
0980                     int   eventid = ev->GetEventID();
0981                     G4int remaining = numPhotons;
0982                     for (G4int c = 0; c < nComp; c++)
0983                     {
0984                         G4int n =
0985                             (c == nComp - 1)
0986                                 ? remaining
0987                                 : static_cast<G4int>(numPhotons * yield[c] / yieldSum);
0988                         remaining -= n;
0989                         if (n > 0)
0990                         {
0991                             quad6 gs =
0992                                 MakeGenstep_Scintillation(track, aStep, n, c + 1, tc[c]);
0993                             fGPUTaskMgr->addGenstep(gs, n, eventid);
0994                         }
0995                     }
0996                 }
0997                 else
0998                 {
0999                     G4AutoLock lock(&genstep_mutex);
1000                     G4int      remaining = numPhotons;
1001                     for (G4int c = 0; c < nComp; c++)
1002                     {
1003                         G4int n =
1004                             (c == nComp - 1)
1005                                 ? remaining
1006                                 : static_cast<G4int>(numPhotons * yield[c] / yieldSum);
1007                         remaining -= n;
1008                         if (n > 0)
1009                             U4::CollectGenstep_DsG4Scintillation_r4695(track, aStep, n, c + 1,
1010                                                                        tc[c]);
1011                     }
1012                 }
1013             }
1014         }
1015     }
1016 };
1017 
1018 // ============================================================================
1019 // Tracking action
1020 // ============================================================================
1021 
1022 struct TrackingAction : G4UserTrackingAction
1023 {
1024     SEvt* sev;
1025     TrackingAction(SEvt* sev) :
1026         sev(sev)
1027     {
1028     }
1029 
1030     void PreUserTrackingAction(const G4Track*) override
1031     {
1032     }
1033 
1034     void PostUserTrackingAction(const G4Track*) override
1035     {
1036     }
1037 };
1038 
1039 // ============================================================================
1040 // G4App — wires everything together
1041 // ============================================================================
1042 
1043 struct G4App
1044 {
1045     SEvt*           sev;
1046     GPUTaskManager* gpu_task_mgr_;
1047 
1048     G4VUserDetectorConstruction*   det_cons_;
1049     G4VUserPrimaryGeneratorAction* prim_gen_;
1050     EventAction*                   event_act_;
1051     RunAction*                     run_act_;
1052     SteppingAction*                stepping_;
1053     TrackingAction*                tracking_;
1054 
1055     G4App(std::filesystem::path gdml_file, bool enable_async = true) :
1056         sev(SEvt::CreateOrReuse_EGPU()),
1057         gpu_task_mgr_(enable_async ? new GPUTaskManager() : nullptr),
1058         det_cons_(new DetectorConstruction(gdml_file)),
1059         prim_gen_(new PrimaryGenerator(sev)),
1060         event_act_(new EventAction(sev)),
1061         run_act_(new RunAction(event_act_, gpu_task_mgr_)),
1062         stepping_(new SteppingAction(sev, gpu_task_mgr_)),
1063         tracking_(new TrackingAction(sev))
1064     {
1065         if (gpu_task_mgr_)
1066             G4cout << "G4App [std]: async GPU mode (threshold="
1067                    << gpu_task_mgr_->getThreshold() << " photons)" << G4endl;
1068         else
1069             G4cout << "G4App [std]: sync GPU mode (end-of-run)" << G4endl;
1070     }
1071 
1072     ~G4App()
1073     {
1074         delete gpu_task_mgr_;
1075     }
1076 };