Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-05-15 07:41:49

0001 #pragma once
0002 //
0003 // async_gpu_launch.h — Async CPU+GPU optical photon simulation
0004 //
0005 // Based on GPURaytrace.h with asynchronous GPU processing added.
0006 // Follows the double-buffering pattern from esi-g4ox:
0007 //   https://github.com/BNLNPPS/esi-g4ox/commit/c9f39f59
0008 //
0009 // Architecture:
0010 //   CPU event loop collects gensteps into a buffer.
0011 //   When the buffer reaches a photon threshold, it is submitted
0012 //   to a GPU worker via G4TaskGroup. The buffers are swapped so
0013 //   the CPU can continue filling the next buffer while the GPU
0014 //   processes the previous one.
0015 //
0016 // Modes:
0017 //   --async   : Async double-buffered GPU processing (default)
0018 //   --sync    : Original end-of-run GPU simulation
0019 //
0020 
0021 #include <array>
0022 #include <atomic>
0023 #include <chrono>
0024 #include <cstdlib>
0025 #include <cstring>
0026 #include <filesystem>
0027 #include <fstream>
0028 #include <iostream>
0029 #include <memory>
0030 #include <mutex>
0031 #include <sstream>
0032 #include <vector>
0033 
0034 #include "G4AutoLock.hh"
0035 #include "G4BooleanSolid.hh"
0036 #include "G4Cerenkov.hh"
0037 #include "G4Electron.hh"
0038 #include "G4Event.hh"
0039 #include "G4EventManager.hh"
0040 #include "G4GDMLParser.hh"
0041 #include "G4LogicalVolumeStore.hh"
0042 #include "G4OpBoundaryProcess.hh"
0043 #include "G4OpticalPhoton.hh"
0044 #include "G4PhysicalConstants.hh"
0045 #include "G4PrimaryParticle.hh"
0046 #include "G4PrimaryVertex.hh"
0047 #include "G4RunManager.hh"
0048 #include "G4RunManagerFactory.hh"
0049 #include "G4SDManager.hh"
0050 #include "G4Scintillation.hh"
0051 #include "G4SubtractionSolid.hh"
0052 #include "G4SystemOfUnits.hh"
0053 #include "G4TaskGroup.hh"
0054 #include "G4ThreeVector.hh"
0055 #include "G4Track.hh"
0056 #include "G4TrackStatus.hh"
0057 #include "G4UserEventAction.hh"
0058 #include "G4UserRunAction.hh"
0059 #include "G4UserSteppingAction.hh"
0060 #include "G4UserTrackingAction.hh"
0061 #include "G4VPhysicalVolume.hh"
0062 #include "G4VProcess.hh"
0063 #include "G4VUserDetectorConstruction.hh"
0064 #include "G4VUserPrimaryGeneratorAction.hh"
0065 
0066 #include "g4cx/G4CXOpticks.hh"
0067 #include "sysrap/NP.hh"
0068 #include "sysrap/OpticksGenstep.h"
0069 #include "sysrap/SEvt.hh"
0070 #include "sysrap/scerenkov.h"
0071 #include "sysrap/sgs.h"
0072 #include "sysrap/spho.h"
0073 #include "sysrap/sphoton.h"
0074 #include "sysrap/sscint.h"
0075 #include "u4/U4.hh"
0076 #include "u4/U4Random.hh"
0077 #include "u4/U4StepPoint.hh"
0078 #include "u4/U4Touchable.h"
0079 #include "u4/U4Track.h"
0080 
0081 namespace
0082 {
0083 G4Mutex genstep_mutex = G4MUTEX_INITIALIZER;
0084 G4Mutex g4hits_mutex = G4MUTEX_INITIALIZER;
0085 
0086 std::vector<std::array<float, 16>> g4_accumulated_hits;
0087 } // namespace
0088 
0089 // ============================================================================
0090 // Geometry helpers
0091 // ============================================================================
0092 
0093 bool IsSubtractionSolid(G4VSolid* solid)
0094 {
0095     if (!solid)
0096         return false;
0097     if (dynamic_cast<G4SubtractionSolid*>(solid))
0098         return true;
0099     G4BooleanSolid* booleanSolid = dynamic_cast<G4BooleanSolid*>(solid);
0100     if (booleanSolid)
0101     {
0102         G4VSolid* solidA = booleanSolid->GetConstituentSolid(0);
0103         G4VSolid* solidB = booleanSolid->GetConstituentSolid(1);
0104         if (IsSubtractionSolid(solidA) || IsSubtractionSolid(solidB))
0105             return true;
0106     }
0107     return false;
0108 }
0109 
0110 std::string str_tolower(std::string s)
0111 {
0112     std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return std::tolower(c); });
0113     return s;
0114 }
0115 
0116 // ============================================================================
0117 // GenstepBuffer — Thread-safe buffer for double-buffering
0118 // ============================================================================
0119 
0120 struct GenstepBuffer
0121 {
0122     std::vector<quad6> gensteps;
0123     std::vector<sgs>   labels;
0124     int64_t            photon_count = 0;
0125     int64_t            genstep_count = 0;
0126     int                event_id = 0;
0127 
0128     void clear()
0129     {
0130         gensteps.clear();
0131         labels.clear();
0132         photon_count = 0;
0133         genstep_count = 0;
0134     }
0135 
0136     void addGenstep(const quad6& gs, int64_t numphotons)
0137     {
0138         sgs label;
0139         label.index = gensteps.size();
0140         label.photons = numphotons;
0141         label.offset = photon_count;
0142         label.gentype = gs.gentype();
0143 
0144         gensteps.push_back(gs);
0145         labels.push_back(label);
0146         photon_count += numphotons;
0147         genstep_count++;
0148     }
0149 
0150     bool empty() const
0151     {
0152         return gensteps.empty();
0153     }
0154     size_t size() const
0155     {
0156         return gensteps.size();
0157     }
0158 };
0159 
0160 // ============================================================================
0161 // GPUTaskManager — Async GPU processing with G4TaskGroup
0162 //
0163 // Uses Geant4's built-in tasking (G4TaskGroup) for the worker thread.
0164 // A single GPU mutex ensures only one kernel runs at a time.
0165 // ============================================================================
0166 
0167 class GPUTaskManager
0168 {
0169   public:
0170     static constexpr int64_t DEFAULT_PHOTON_THRESHOLD = 10000000; // 10M photons
0171 
0172   private:
0173     int64_t photon_threshold_;
0174 
0175     std::shared_ptr<GenstepBuffer> active_buffer_;
0176     G4Mutex                        buffer_mutex_;
0177 
0178     std::unique_ptr<G4TaskGroup<void, void>> task_group_;
0179     G4Mutex                                  gpu_mutex_;
0180 
0181     // Statistics
0182     std::atomic<int>      batch_counter_{0};
0183     std::atomic<int>      completed_batches_{0};
0184     std::atomic<uint64_t> total_hits_{0};
0185     std::atomic<uint64_t> total_photons_{0};
0186     std::atomic<uint64_t> total_gpu_time_us_{0};
0187 
0188   public:
0189     GPUTaskManager(int64_t threshold = DEFAULT_PHOTON_THRESHOLD) :
0190         photon_threshold_(threshold),
0191         active_buffer_(std::make_shared<GenstepBuffer>()),
0192         task_group_(nullptr)
0193     {
0194         const char* env_thresh = std::getenv("GPU_PHOTON_FLUSH_THRESHOLD");
0195         if (env_thresh)
0196             photon_threshold_ = std::atoll(env_thresh);
0197     }
0198 
0199     ~GPUTaskManager()
0200     {
0201         shutdown();
0202     }
0203 
0204     void start()
0205     {
0206         task_group_ = std::make_unique<G4TaskGroup<void, void>>();
0207         G4cout << "GPUTaskManager: Started (threshold=" << photon_threshold_ << " photons)" << G4endl;
0208     }
0209 
0210     void shutdown()
0211     {
0212         {
0213             G4AutoLock lock(&buffer_mutex_);
0214             if (active_buffer_ && !active_buffer_->empty())
0215             {
0216                 submitBuffer(active_buffer_);
0217                 active_buffer_ = std::make_shared<GenstepBuffer>();
0218             }
0219         }
0220         waitForCompletion();
0221         task_group_.reset();
0222 
0223         G4cout << "GPUTaskManager: Shutdown" << G4endl;
0224         G4cout << "  Batches:  " << completed_batches_.load() << G4endl;
0225         G4cout << "  Photons:  " << total_photons_.load() << G4endl;
0226         G4cout << "  Hits:     " << total_hits_.load() << G4endl;
0227         G4cout << "  GPU time: " << (total_gpu_time_us_.load() / 1e6) << " s" << G4endl;
0228     }
0229 
0230     // Hot path — called from SteppingAction
0231     void addGenstep(const quad6& gs, int64_t numphotons, int eventID)
0232     {
0233         std::shared_ptr<GenstepBuffer> buffer_to_submit;
0234 
0235         {
0236             G4AutoLock lock(&buffer_mutex_);
0237             active_buffer_->event_id = eventID;
0238             active_buffer_->addGenstep(gs, numphotons);
0239 
0240             if (active_buffer_->photon_count >= photon_threshold_)
0241             {
0242                 buffer_to_submit = active_buffer_;
0243                 active_buffer_ = std::make_shared<GenstepBuffer>();
0244             }
0245         }
0246 
0247         // Submit outside the lock
0248         if (buffer_to_submit)
0249             submitBuffer(buffer_to_submit);
0250     }
0251 
0252     void flushRemaining(int eventID)
0253     {
0254         std::shared_ptr<GenstepBuffer> buffer_to_submit;
0255         {
0256             G4AutoLock lock(&buffer_mutex_);
0257             if (active_buffer_ && !active_buffer_->empty())
0258             {
0259                 active_buffer_->event_id = eventID;
0260                 buffer_to_submit = active_buffer_;
0261                 active_buffer_ = std::make_shared<GenstepBuffer>();
0262             }
0263         }
0264         if (buffer_to_submit)
0265         {
0266             G4cout << "GPUTaskManager: Final flush (" << buffer_to_submit->photon_count << " photons)" << G4endl;
0267             submitBuffer(buffer_to_submit);
0268         }
0269         waitForCompletion();
0270     }
0271 
0272     void waitForCompletion()
0273     {
0274         if (task_group_)
0275             task_group_->join();
0276     }
0277 
0278     int getCompletedBatches() const
0279     {
0280         return completed_batches_.load();
0281     }
0282     uint64_t getTotalHits() const
0283     {
0284         return total_hits_.load();
0285     }
0286     uint64_t getTotalPhotons() const
0287     {
0288         return total_photons_.load();
0289     }
0290     int64_t getThreshold() const
0291     {
0292         return photon_threshold_;
0293     }
0294     double getTotalGPUTime() const
0295     {
0296         return total_gpu_time_us_.load() / 1e6;
0297     }
0298 
0299   private:
0300     void submitBuffer(std::shared_ptr<GenstepBuffer> buffer)
0301     {
0302         if (!buffer || buffer->empty() || !task_group_)
0303             return;
0304 
0305         int batch_id = batch_counter_.fetch_add(1);
0306 
0307         G4cout << "GPUTaskManager: Queued batch " << batch_id << " (" << buffer->photon_count << " photons, "
0308                << buffer->genstep_count << " gensteps)" << G4endl;
0309 
0310         task_group_->exec([this, batch_id, buffer]() { processGPUTask(batch_id, buffer); });
0311     }
0312 
0313     void processGPUTask(int batch_id, std::shared_ptr<GenstepBuffer> buffer)
0314     {
0315         // Only one GPU task at a time — G4CXOpticks/SEvt are not thread-safe
0316         G4AutoLock gpu_lock(&gpu_mutex_);
0317 
0318         G4cout << "=== GPU Batch " << batch_id << " ===" << G4endl;
0319         G4cout << "  Photons:  " << buffer->photon_count << G4endl;
0320         G4cout << "  Gensteps: " << buffer->genstep_count << G4endl;
0321 
0322         G4CXOpticks* gx = G4CXOpticks::Get();
0323         SEvt*        sev = SEvt::Get_EGPU();
0324 
0325         if (!gx || !sev)
0326         {
0327             G4cerr << "GPUTaskManager: G4CXOpticks or SEvt not available" << G4endl;
0328             return;
0329         }
0330 
0331         // Load buffered gensteps into SEvt
0332         sev->clear_genstep();
0333         NP* gs_array = NP::Make<float>(buffer->gensteps.size(), 6, 4);
0334         memcpy(gs_array->values<float>(), buffer->gensteps.data(), buffer->gensteps.size() * sizeof(quad6));
0335         sev->addGenstep(gs_array);
0336 
0337         // Run GPU simulation
0338         auto start = std::chrono::high_resolution_clock::now();
0339         gx->simulate(buffer->event_id, false);
0340         cudaDeviceSynchronize();
0341         auto end = std::chrono::high_resolution_clock::now();
0342         auto elapsed_us = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
0343 
0344         unsigned int num_hits = sev->GetNumHit(0);
0345 
0346         total_gpu_time_us_ += elapsed_us;
0347         total_hits_ += num_hits;
0348         total_photons_ += buffer->photon_count;
0349 
0350         G4cout << "  GPU time: " << (elapsed_us / 1e6) << " s" << G4endl;
0351         G4cout << "  Hits:     " << num_hits << G4endl;
0352 
0353         if (num_hits > 0)
0354             saveHits(batch_id, sev, num_hits);
0355 
0356         gx->reset(buffer->event_id);
0357         completed_batches_++;
0358 
0359         G4cout << "=== GPU Batch " << batch_id << " Complete ===" << G4endl;
0360     }
0361 
0362     void saveHits(int batch_id, SEvt* sev, unsigned int num_hits)
0363     {
0364         // Save as .npy (sphoton layout: N x 4 x 4 float32)
0365         std::ostringstream fname;
0366         fname << "gpu_hits_batch_" << batch_id << ".npy";
0367 
0368         NP* arr = NP::Make<float>(num_hits, 4, 4);
0369         for (unsigned idx = 0; idx < num_hits; idx++)
0370         {
0371             sphoton hit;
0372             sev->getHit(hit, idx);
0373             memcpy(arr->bytes() + idx * sizeof(sphoton), &hit, sizeof(sphoton));
0374         }
0375         arr->save(fname.str().c_str());
0376         G4cout << "  Saved " << num_hits << " hits to " << fname.str() << G4endl;
0377     }
0378 };
0379 
0380 // ============================================================================
0381 // Genstep construction helpers (bypass U4/SEvt for async mode)
0382 // ============================================================================
0383 
0384 static quad6 MakeGenstep_Cerenkov(const G4Track* aTrack, const G4Step* aStep, G4int numPhotons, G4double betaInverse,
0385                                   G4double pmin, G4double pmax, G4double maxCos, G4double maxSin2,
0386                                   G4double meanNumberOfPhotons1, G4double meanNumberOfPhotons2)
0387 {
0388     G4StepPoint*             pPreStepPoint = aStep->GetPreStepPoint();
0389     G4StepPoint*             pPostStepPoint = aStep->GetPostStepPoint();
0390     G4ThreeVector            x0 = pPreStepPoint->GetPosition();
0391     G4double                 t0 = pPreStepPoint->GetGlobalTime();
0392     G4ThreeVector            deltaPosition = aStep->GetDeltaPosition();
0393     const G4DynamicParticle* aParticle = aTrack->GetDynamicParticle();
0394     const G4Material*        aMaterial = aTrack->GetMaterial();
0395 
0396     G4double Wmin_nm = h_Planck * c_light / pmax / nm;
0397     G4double Wmax_nm = h_Planck * c_light / pmin / nm;
0398 
0399     quad6 gs;
0400     gs.zero();
0401     scerenkov* ck = (scerenkov*)(&gs);
0402 
0403     ck->gentype = OpticksGenstep_G4Cerenkov_modified;
0404     ck->trackid = aTrack->GetTrackID();
0405     ck->matline = aMaterial->GetIndex() + SEvt::G4_INDEX_OFFSET;
0406     ck->numphoton = numPhotons;
0407 
0408     ck->pos.x = x0.x();
0409     ck->pos.y = x0.y();
0410     ck->pos.z = x0.z();
0411     ck->time = t0;
0412 
0413     ck->DeltaPosition.x = deltaPosition.x();
0414     ck->DeltaPosition.y = deltaPosition.y();
0415     ck->DeltaPosition.z = deltaPosition.z();
0416     ck->step_length = aStep->GetStepLength();
0417 
0418     ck->code = aParticle->GetDefinition()->GetPDGEncoding();
0419     ck->charge = aParticle->GetDefinition()->GetPDGCharge();
0420     ck->weight = aTrack->GetWeight();
0421     ck->preVelocity = pPreStepPoint->GetVelocity();
0422 
0423     ck->BetaInverse = betaInverse;
0424     ck->Wmin = Wmin_nm;
0425     ck->Wmax = Wmax_nm;
0426     ck->maxCos = maxCos;
0427 
0428     ck->maxSin2 = maxSin2;
0429     ck->MeanNumberOfPhotons1 = meanNumberOfPhotons1;
0430     ck->MeanNumberOfPhotons2 = meanNumberOfPhotons2;
0431     ck->postVelocity = pPostStepPoint->GetVelocity();
0432 
0433     return gs;
0434 }
0435 
0436 static quad6 MakeGenstep_Scintillation(const G4Track* aTrack, const G4Step* aStep, G4int numPhotons, G4int scnt,
0437                                        G4double ScintillationTime)
0438 {
0439     G4StepPoint*             pPreStepPoint = aStep->GetPreStepPoint();
0440     G4StepPoint*             pPostStepPoint = aStep->GetPostStepPoint();
0441     G4ThreeVector            x0 = pPreStepPoint->GetPosition();
0442     G4double                 t0 = pPreStepPoint->GetGlobalTime();
0443     G4ThreeVector            deltaPosition = aStep->GetDeltaPosition();
0444     G4double                 meanVelocity = (pPreStepPoint->GetVelocity() + pPostStepPoint->GetVelocity()) / 2.;
0445     const G4DynamicParticle* aParticle = aTrack->GetDynamicParticle();
0446     const G4Material*        aMaterial = aTrack->GetMaterial();
0447 
0448     quad6 gs;
0449     gs.zero();
0450     sscint* sc = (sscint*)(&gs);
0451 
0452     sc->gentype = OpticksGenstep_DsG4Scintillation_r4695;
0453     sc->trackid = aTrack->GetTrackID();
0454     sc->matline = aMaterial->GetIndex() + SEvt::G4_INDEX_OFFSET;
0455     sc->numphoton = numPhotons;
0456 
0457     sc->pos.x = x0.x();
0458     sc->pos.y = x0.y();
0459     sc->pos.z = x0.z();
0460     sc->time = t0;
0461 
0462     sc->DeltaPosition.x = deltaPosition.x();
0463     sc->DeltaPosition.y = deltaPosition.y();
0464     sc->DeltaPosition.z = deltaPosition.z();
0465     sc->step_length = aStep->GetStepLength();
0466 
0467     sc->code = aParticle->GetDefinition()->GetPDGEncoding();
0468     sc->charge = aParticle->GetDefinition()->GetPDGCharge();
0469     sc->weight = aTrack->GetWeight();
0470     sc->meanVelocity = meanVelocity;
0471 
0472     sc->scnt = scnt;
0473     sc->ScintillationTime = ScintillationTime;
0474 
0475     return gs;
0476 }
0477 
0478 // ============================================================================
0479 // Sensitive detector and hits
0480 // ============================================================================
0481 
0482 struct PhotonHit : public G4VHit
0483 {
0484     PhotonHit() = default;
0485     PhotonHit(unsigned id, G4double energy, G4double time, G4ThreeVector position, G4ThreeVector direction,
0486               G4ThreeVector polarization) :
0487         fid(id),
0488         fenergy(energy),
0489         ftime(time),
0490         fposition(position),
0491         fdirection(direction),
0492         fpolarization(polarization)
0493     {
0494     }
0495     PhotonHit(const PhotonHit& right) :
0496         G4VHit(right),
0497         fid(right.fid),
0498         fenergy(right.fenergy),
0499         ftime(right.ftime),
0500         fposition(right.fposition),
0501         fdirection(right.fdirection),
0502         fpolarization(right.fpolarization)
0503     {
0504     }
0505 
0506     unsigned      fid{0};
0507     G4double      fenergy{0};
0508     G4double      ftime{0};
0509     G4ThreeVector fposition;
0510     G4ThreeVector fdirection;
0511     G4ThreeVector fpolarization;
0512 };
0513 
0514 using PhotonHitsCollection = G4THitsCollection<PhotonHit>;
0515 
0516 struct PhotonSD : public G4VSensitiveDetector
0517 {
0518     PhotonHitsCollection* fPhotonHitsCollection{nullptr};
0519     G4int                 fHCID{-1};
0520     G4int                 fTotalG4Hits{0};
0521 
0522     PhotonSD(const G4String& name) :
0523         G4VSensitiveDetector(name)
0524     {
0525         collectionName.insert("photon_hits");
0526     }
0527 
0528     void Initialize(G4HCofThisEvent* hce) override
0529     {
0530         fPhotonHitsCollection = new PhotonHitsCollection(SensitiveDetectorName, collectionName[0]);
0531         if (fHCID < 0)
0532             fHCID = G4SDManager::GetSDMpointer()->GetCollectionID(collectionName[0]);
0533         hce->AddHitsCollection(fHCID, fPhotonHitsCollection);
0534     }
0535 
0536     G4bool ProcessHits(G4Step* aStep, G4TouchableHistory*) override
0537     {
0538         G4Track* theTrack = aStep->GetTrack();
0539         if (theTrack->GetDefinition() != G4OpticalPhoton::OpticalPhotonDefinition())
0540             return false;
0541 
0542         G4double   theEnergy = theTrack->GetTotalEnergy() / CLHEP::eV;
0543         PhotonHit* newHit = new PhotonHit(
0544             0, theEnergy, theTrack->GetGlobalTime(), aStep->GetPostStepPoint()->GetPosition(),
0545             aStep->GetPostStepPoint()->GetMomentumDirection(), aStep->GetPostStepPoint()->GetPolarization());
0546         fPhotonHitsCollection->insert(newHit);
0547         theTrack->SetTrackStatus(fStopAndKill);
0548         return true;
0549     }
0550 
0551     void EndOfEvent(G4HCofThisEvent*) override
0552     {
0553         G4int NbHits = fPhotonHitsCollection->entries();
0554         if (NbHits > 0)
0555         {
0556             G4AutoLock lock(&g4hits_mutex);
0557             for (G4int i = 0; i < NbHits; i++)
0558             {
0559                 PhotonHit* hit = (*fPhotonHitsCollection)[i];
0560                 float      wl = (hit->fenergy > 0) ? static_cast<float>(1239.84198 / hit->fenergy) : 0.f;
0561                 g4_accumulated_hits.push_back({float(hit->fposition.x()), float(hit->fposition.y()),
0562                                                float(hit->fposition.z()), float(hit->ftime), float(hit->fdirection.x()),
0563                                                float(hit->fdirection.y()), float(hit->fdirection.z()), 0.f,
0564                                                float(hit->fpolarization.x()), float(hit->fpolarization.y()),
0565                                                float(hit->fpolarization.z()), wl, 0.f, 0.f, 0.f, float(hit->fid)});
0566             }
0567             fTotalG4Hits += NbHits;
0568         }
0569     }
0570 
0571     G4int GetTotalG4Hits() const
0572     {
0573         return fTotalG4Hits;
0574     }
0575 };
0576 
0577 // ============================================================================
0578 // Detector construction
0579 // ============================================================================
0580 
0581 struct DetectorConstruction : G4VUserDetectorConstruction
0582 {
0583     std::filesystem::path gdml_file_;
0584     G4GDMLParser          parser_;
0585 
0586     DetectorConstruction(std::filesystem::path gdml_file) :
0587         gdml_file_(gdml_file)
0588     {
0589     }
0590 
0591     G4VPhysicalVolume* Construct() override
0592     {
0593         parser_.Read(gdml_file_.string(), false);
0594         G4VPhysicalVolume* world = parser_.GetWorldVolume();
0595         G4CXOpticks::SetGeometry(world);
0596 
0597         G4LogicalVolumeStore*  lvStore = G4LogicalVolumeStore::GetInstance();
0598         static G4VisAttributes invisibleVisAttr(false);
0599         if (lvStore && !lvStore->empty())
0600         {
0601             for (auto lv : *lvStore)
0602             {
0603                 G4String name = str_tolower(lv->GetName());
0604                 if (name.find("detect") != std::string::npos || name.find("sipm") != std::string::npos ||
0605                     name.find("sensor") != std::string::npos || name.find("pmt") != std::string::npos ||
0606                     name.find("arapuca") != std::string::npos)
0607                 {
0608                     G4String sdName = "PhotonDetector_" + lv->GetName();
0609                     if (!G4SDManager::GetSDMpointer()->FindSensitiveDetector(sdName, false))
0610                     {
0611                         PhotonSD* photonSD = new PhotonSD(sdName);
0612                         G4SDManager::GetSDMpointer()->AddNewDetector(photonSD);
0613                         lv->SetSensitiveDetector(photonSD);
0614                     }
0615                 }
0616                 G4VSolid* solid = lv->GetSolid();
0617                 if (solid && IsSubtractionSolid(solid))
0618                     lv->SetVisAttributes(&invisibleVisAttr);
0619             }
0620         }
0621         return world;
0622     }
0623 };
0624 
0625 // ============================================================================
0626 // Primary generator
0627 // ============================================================================
0628 
0629 struct PrimaryGenerator : G4VUserPrimaryGeneratorAction
0630 {
0631     SEvt* sev;
0632     PrimaryGenerator(SEvt* sev) :
0633         sev(sev)
0634     {
0635     }
0636 
0637     void GeneratePrimaries(G4Event* event) override
0638     {
0639         G4ThreeVector position_mm(0.0 * m, 0.0 * m, 0.0 * m);
0640         G4double      time_ns = 0;
0641         G4ThreeVector direction(0, 0.2, 0.8);
0642 
0643         G4PrimaryParticle* particle = new G4PrimaryParticle(G4Electron::Definition());
0644         particle->SetKineticEnergy(10 * MeV);
0645         particle->SetMomentumDirection(direction);
0646 
0647         G4PrimaryVertex* vertex = new G4PrimaryVertex(position_mm, time_ns);
0648         vertex->SetPrimary(particle);
0649         event->AddPrimaryVertex(vertex);
0650     }
0651 };
0652 
0653 // ============================================================================
0654 // Event action
0655 // ============================================================================
0656 
0657 struct EventAction : G4UserEventAction
0658 {
0659     SEvt* sev;
0660     G4int fTotalG4Hits{0};
0661 
0662     EventAction(SEvt* sev) :
0663         sev(sev)
0664     {
0665     }
0666 
0667     void EndOfEventAction(const G4Event* event) override
0668     {
0669         G4HCofThisEvent* hce = event->GetHCofThisEvent();
0670         if (!hce)
0671             return;
0672         G4int nCollections = hce->GetNumberOfCollections();
0673         for (G4int i = 0; i < nCollections; i++)
0674         {
0675             G4VHitsCollection* hc = hce->GetHC(i);
0676             if (!hc)
0677                 continue;
0678             PhotonHitsCollection* phc = dynamic_cast<PhotonHitsCollection*>(hc);
0679             if (phc)
0680                 fTotalG4Hits += phc->entries();
0681             else
0682                 fTotalG4Hits += hc->GetSize();
0683         }
0684     }
0685 
0686     G4int GetTotalG4Hits() const
0687     {
0688         return fTotalG4Hits;
0689     }
0690 };
0691 
0692 // ============================================================================
0693 // Forward declarations
0694 // ============================================================================
0695 
0696 struct SteppingAction;
0697 struct TrackingAction;
0698 
0699 // ============================================================================
0700 // RunAction — manages GPU lifecycle (sync or async)
0701 // ============================================================================
0702 
0703 struct RunAction : G4UserRunAction
0704 {
0705     EventAction*    fEventAction;
0706     GPUTaskManager* fGPUTaskMgr{nullptr}; // null = sync mode
0707 
0708     RunAction(EventAction* eventAction, GPUTaskManager* gpuMgr = nullptr) :
0709         fEventAction(eventAction),
0710         fGPUTaskMgr(gpuMgr)
0711     {
0712     }
0713 
0714     void BeginOfRunAction(const G4Run*) override
0715     {
0716         if (G4Threading::IsMasterThread() && fGPUTaskMgr)
0717             fGPUTaskMgr->start();
0718     }
0719 
0720     void EndOfRunAction(const G4Run*) override
0721     {
0722         if (!G4Threading::IsMasterThread())
0723             return;
0724 
0725         if (fGPUTaskMgr)
0726         {
0727             // Async mode: flush remaining gensteps and wait
0728             fGPUTaskMgr->flushRemaining(0);
0729 
0730             G4cout << "\n=== Async GPU Summary ===" << G4endl;
0731             G4cout << "Batches processed: " << fGPUTaskMgr->getCompletedBatches() << G4endl;
0732             G4cout << "Total GPU photons: " << fGPUTaskMgr->getTotalPhotons() << G4endl;
0733             G4cout << "Total GPU hits:    " << fGPUTaskMgr->getTotalHits() << G4endl;
0734             G4cout << "Total GPU time:    " << fGPUTaskMgr->getTotalGPUTime() << " s" << G4endl;
0735             G4cout << "G4 hits:           " << fEventAction->GetTotalG4Hits() << G4endl;
0736         }
0737         else
0738         {
0739             // Sync mode: run all gensteps at once
0740             G4CXOpticks* gx = G4CXOpticks::Get();
0741 
0742             auto start = std::chrono::high_resolution_clock::now();
0743             gx->simulate(0, false);
0744             cudaDeviceSynchronize();
0745             auto                          end = std::chrono::high_resolution_clock::now();
0746             std::chrono::duration<double> elapsed = end - start;
0747 
0748             SEvt*        sev = SEvt::Get_EGPU();
0749             unsigned int num_hits = sev->GetNumHit(0);
0750 
0751             G4cout << "\n=== Sync GPU Summary ===" << G4endl;
0752             G4cout << "GPU sim time:      " << elapsed.count() << " s" << G4endl;
0753             G4cout << "Gensteps:          " << sev->GetNumGenstepFromGenstep(0) << G4endl;
0754             G4cout << "Photons collected: " << sev->GetNumPhotonCollected(0) << G4endl;
0755             G4cout << "GPU hits:          " << num_hits << G4endl;
0756             G4cout << "G4 hits:           " << fEventAction->GetTotalG4Hits() << G4endl;
0757 
0758             // Save GPU hits
0759             if (num_hits > 0)
0760             {
0761                 NP* gpu_h = NP::Make<float>(num_hits, 4, 4);
0762                 for (unsigned idx = 0; idx < num_hits; idx++)
0763                 {
0764                     sphoton hit;
0765                     sev->getHit(hit, idx);
0766                     memcpy(gpu_h->bytes() + idx * sizeof(sphoton), &hit, sizeof(sphoton));
0767                 }
0768                 gpu_h->save("gpu_hits.npy");
0769                 G4cout << "Saved GPU hits to gpu_hits.npy" << G4endl;
0770             }
0771         }
0772 
0773         // Save G4 hits
0774         {
0775             G4AutoLock lock(&g4hits_mutex);
0776             size_t     ng4 = g4_accumulated_hits.size();
0777             if (ng4 > 0)
0778             {
0779                 NP* g4h = NP::Make<float>(ng4, 4, 4);
0780                 memcpy(g4h->bytes(), g4_accumulated_hits.data(), ng4 * 16 * sizeof(float));
0781                 g4h->save("g4_hits.npy");
0782                 G4cout << "Saved G4 hits (" << ng4 << ") to g4_hits.npy" << G4endl;
0783             }
0784         }
0785     }
0786 };
0787 
0788 // ============================================================================
0789 // SteppingAction — routes gensteps to SEvt (sync) or GPUTaskManager (async)
0790 // ============================================================================
0791 
0792 struct SteppingAction : G4UserSteppingAction
0793 {
0794     SEvt*           sev;
0795     GPUTaskManager* fGPUTaskMgr{nullptr};
0796 
0797     SteppingAction(SEvt* sev, GPUTaskManager* gpuMgr = nullptr) :
0798         sev(sev),
0799         fGPUTaskMgr(gpuMgr)
0800     {
0801     }
0802 
0803     void UserSteppingAction(const G4Step* aStep) override
0804     {
0805         G4Track* aTrack;
0806         G4int    fNumPhotons = 0;
0807 
0808         G4StepPoint*       postStep = aStep->GetPostStepPoint();
0809         G4VPhysicalVolume* volume = postStep->GetPhysicalVolume();
0810 
0811         // Kill optical photons stuck in reflection loops
0812         if (aStep->GetTrack()->GetDefinition() == G4OpticalPhoton::OpticalPhotonDefinition())
0813         {
0814             if (aStep->GetTrack()->GetCurrentStepNumber() > 10000)
0815                 aStep->GetTrack()->SetTrackStatus(fStopAndKill);
0816         }
0817 
0818         // Collect gensteps from Cerenkov and Scintillation processes
0819         G4SteppingManager* fpSteppingManager =
0820             G4EventManager::GetEventManager()->GetTrackingManager()->GetSteppingManager();
0821         G4StepStatus stepStatus = fpSteppingManager->GetfStepStatus();
0822 
0823         if (stepStatus == fAtRestDoItProc)
0824             return;
0825 
0826         G4ProcessVector* procPost = fpSteppingManager->GetfPostStepDoItVector();
0827         size_t           MAXofPostStepLoops = fpSteppingManager->GetMAXofPostStepLoops();
0828 
0829         for (size_t i3 = 0; i3 < MAXofPostStepLoops; i3++)
0830         {
0831             // --- Cerenkov ---
0832             if ((*procPost)[i3]->GetProcessName() == "Cerenkov")
0833             {
0834                 aTrack = aStep->GetTrack();
0835                 const G4DynamicParticle*   aParticle = aTrack->GetDynamicParticle();
0836                 G4double                   charge = aParticle->GetDefinition()->GetPDGCharge();
0837                 const G4Material*          aMaterial = aTrack->GetMaterial();
0838                 G4MaterialPropertiesTable* MPT = aMaterial->GetMaterialPropertiesTable();
0839                 G4MaterialPropertyVector*  Rindex = MPT->GetProperty(kRINDEX);
0840 
0841                 if (!Rindex || Rindex->GetVectorLength() == 0)
0842                     return;
0843 
0844                 G4Cerenkov* proc = (G4Cerenkov*)(*procPost)[i3];
0845                 fNumPhotons = proc->GetNumPhotons();
0846 
0847                 if (fNumPhotons > 0)
0848                 {
0849                     G4double Pmin = Rindex->Energy(0);
0850                     G4double Pmax = Rindex->GetMaxEnergy();
0851                     G4double nMax = Rindex->GetMaxValue();
0852                     G4double beta1 = aStep->GetPreStepPoint()->GetBeta();
0853                     G4double beta2 = aStep->GetPostStepPoint()->GetBeta();
0854                     G4double beta = (beta1 + beta2) * 0.5;
0855                     G4double BetaInverse = 1. / beta;
0856                     G4double maxCos = BetaInverse / nMax;
0857                     G4double maxSin2 = (1.0 - maxCos) * (1.0 + maxCos);
0858                     G4double MeanNumberOfPhotons1 = proc->GetAverageNumberOfPhotons(charge, beta1, aMaterial, Rindex);
0859                     G4double MeanNumberOfPhotons2 = proc->GetAverageNumberOfPhotons(charge, beta2, aMaterial, Rindex);
0860 
0861                     if (fGPUTaskMgr)
0862                     {
0863                         // ASYNC: construct quad6 directly into buffer
0864                         const G4Event* event = G4EventManager::GetEventManager()->GetConstCurrentEvent();
0865                         if (!event)
0866                             return;
0867                         quad6 gs = MakeGenstep_Cerenkov(aTrack, aStep, fNumPhotons, BetaInverse, Pmin, Pmax, maxCos,
0868                                                         maxSin2, MeanNumberOfPhotons1, MeanNumberOfPhotons2);
0869                         fGPUTaskMgr->addGenstep(gs, fNumPhotons, event->GetEventID());
0870                     }
0871                     else
0872                     {
0873                         // SYNC: standard SEvt path
0874                         G4AutoLock lock(&genstep_mutex);
0875                         U4::CollectGenstep_G4Cerenkov_modified(aTrack, aStep, fNumPhotons, BetaInverse, Pmin, Pmax,
0876                                                                maxCos, maxSin2, MeanNumberOfPhotons1,
0877                                                                MeanNumberOfPhotons2);
0878                     }
0879                 }
0880             }
0881 
0882             // --- Scintillation ---
0883             if ((*procPost)[i3]->GetProcessName() == "Scintillation")
0884             {
0885                 G4Scintillation* proc1 = (G4Scintillation*)(*procPost)[i3];
0886                 fNumPhotons = proc1->GetNumPhotons();
0887                 if (fNumPhotons > 0)
0888                 {
0889                     aTrack = aStep->GetTrack();
0890                     const G4Material*          aMaterial = aTrack->GetMaterial();
0891                     G4MaterialPropertiesTable* MPT = aMaterial->GetMaterialPropertiesTable();
0892                     if (!MPT || !MPT->ConstPropertyExists(kSCINTILLATIONTIMECONSTANT1))
0893                         return;
0894 
0895                     const G4int tcKeys[3] = {kSCINTILLATIONTIMECONSTANT1, kSCINTILLATIONTIMECONSTANT2,
0896                                              kSCINTILLATIONTIMECONSTANT3};
0897                     const G4int yieldKeys[3] = {kSCINTILLATIONYIELD1, kSCINTILLATIONYIELD2, kSCINTILLATIONYIELD3};
0898 
0899                     G4double tc[3] = {0, 0, 0};
0900                     G4double yield[3] = {0, 0, 0};
0901                     G4double yieldSum = 0;
0902                     G4int    nComp = 0;
0903 
0904                     for (G4int c = 0; c < 3; c++)
0905                     {
0906                         if (MPT->ConstPropertyExists(tcKeys[c]))
0907                         {
0908                             tc[c] = MPT->GetConstProperty(tcKeys[c]);
0909                             yield[c] = MPT->ConstPropertyExists(yieldKeys[c]) ? MPT->GetConstProperty(yieldKeys[c])
0910                                                                               : (c == 0 ? 1.0 : 0.0);
0911                             yieldSum += yield[c];
0912                             nComp = c + 1;
0913                         }
0914                     }
0915 
0916                     if (fGPUTaskMgr)
0917                     {
0918                         // ASYNC: construct quad6 directly into buffer
0919                         const G4Event* event = G4EventManager::GetEventManager()->GetConstCurrentEvent();
0920                         if (!event)
0921                             return;
0922                         int eventid = event->GetEventID();
0923 
0924                         G4int nRemaining = fNumPhotons;
0925                         for (G4int c = 0; c < nComp; c++)
0926                         {
0927                             G4int nPhotComp;
0928                             if (c == nComp - 1)
0929                                 nPhotComp = nRemaining;
0930                             else
0931                                 nPhotComp = static_cast<G4int>(fNumPhotons * yield[c] / yieldSum);
0932                             nRemaining -= nPhotComp;
0933 
0934                             if (nPhotComp > 0)
0935                             {
0936                                 quad6 gs = MakeGenstep_Scintillation(aTrack, aStep, nPhotComp, c + 1, tc[c]);
0937                                 fGPUTaskMgr->addGenstep(gs, nPhotComp, eventid);
0938                             }
0939                         }
0940                     }
0941                     else
0942                     {
0943                         // SYNC: standard SEvt path
0944                         G4AutoLock lock(&genstep_mutex);
0945                         G4int      nRemaining = fNumPhotons;
0946                         for (G4int c = 0; c < nComp; c++)
0947                         {
0948                             G4int nPhotComp;
0949                             if (c == nComp - 1)
0950                                 nPhotComp = nRemaining;
0951                             else
0952                                 nPhotComp = static_cast<G4int>(fNumPhotons * yield[c] / yieldSum);
0953                             nRemaining -= nPhotComp;
0954 
0955                             if (nPhotComp > 0)
0956                                 U4::CollectGenstep_DsG4Scintillation_r4695(aTrack, aStep, nPhotComp, c + 1, tc[c]);
0957                         }
0958                     }
0959                 }
0960             }
0961         }
0962     }
0963 };
0964 
0965 // ============================================================================
0966 // TrackingAction
0967 // ============================================================================
0968 
0969 struct TrackingAction : G4UserTrackingAction
0970 {
0971     const G4Track* transient_fSuspend_track = nullptr;
0972     SEvt*          sev;
0973 
0974     TrackingAction(SEvt* sev) :
0975         sev(sev)
0976     {
0977     }
0978 
0979     void PreUserTrackingAction(const G4Track*) override
0980     {
0981     }
0982 
0983     void PostUserTrackingAction(const G4Track* track) override
0984     {
0985         if (track->GetTrackStatus() == fSuspend)
0986             transient_fSuspend_track = track;
0987     }
0988 };
0989 
0990 // ============================================================================
0991 // G4App — wires everything together
0992 // ============================================================================
0993 
0994 struct G4App
0995 {
0996     SEvt*           sev;
0997     GPUTaskManager* gpu_task_mgr_;
0998 
0999     G4VUserDetectorConstruction*   det_cons_;
1000     G4VUserPrimaryGeneratorAction* prim_gen_;
1001     EventAction*                   event_act_;
1002     RunAction*                     run_act_;
1003     SteppingAction*                stepping_;
1004     TrackingAction*                tracking_;
1005 
1006     G4App(std::filesystem::path gdml_file, bool enable_async = true) :
1007         sev(SEvt::CreateOrReuse_EGPU()),
1008         gpu_task_mgr_(enable_async ? new GPUTaskManager() : nullptr),
1009         det_cons_(new DetectorConstruction(gdml_file)),
1010         prim_gen_(new PrimaryGenerator(sev)),
1011         event_act_(new EventAction(sev)),
1012         run_act_(new RunAction(event_act_, gpu_task_mgr_)),
1013         stepping_(new SteppingAction(sev, gpu_task_mgr_)),
1014         tracking_(new TrackingAction(sev))
1015     {
1016         if (gpu_task_mgr_)
1017             G4cout << "G4App: Async GPU mode (threshold=" << gpu_task_mgr_->getThreshold() << " photons)" << G4endl;
1018         else
1019             G4cout << "G4App: Sync GPU mode (end-of-run)" << G4endl;
1020     }
1021 
1022     ~G4App()
1023     {
1024         delete gpu_task_mgr_;
1025     }
1026 };