#include "ActsPlugins/Gnn/ModuleMapCuda.hpp"
#include "ActsPlugins/Gnn/detail/CudaUtils.cuh"
#include "ActsPlugins/Gnn/detail/CudaUtils.hpp"
#include "ActsPlugins/Gnn/detail/ModuleMapUtils.cuh"

#include <CUDA_graph_creator>
#include <CUDA_module_map_doublet>
#include <CUDA_module_map_triplet>
#include <TMath.h>
#include <TTree_hits>

#include <chrono>
#include <cstdint>
#include <map>
#include <sstream>
#include <stdexcept>
#include <vector>

#include <cub/block/block_merge_sort.cuh>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <thrust/sort.h>
#include <thrust/transform_scan.h>

using Clock = std::chrono::high_resolution_clock;

namespace {

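// RAII helper owning a stream-ordered device allocation: memory is obtained
// with cudaMallocAsync on construction and released with cudaFreeAsync on the
// same stream when the object goes out of scope.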
template <typename T>
class ScopedCudaPtr {
  cudaStream_t *m_stream = nullptr;
  T *m_ptr = nullptr;

 public:
  ScopedCudaPtr(std::size_t n, cudaStream_t &stream) : m_stream(&stream) {
    ACTS_CUDA_CHECK(cudaMallocAsync(&m_ptr, n * sizeof(T), stream));
  }

  ScopedCudaPtr(const ScopedCudaPtr &) = delete;
  ScopedCudaPtr(ScopedCudaPtr &&) = delete;

  ~ScopedCudaPtr() { ACTS_CUDA_CHECK(cudaFreeAsync(m_ptr, *m_stream)); }

  T *data() { return m_ptr; }
  const T *data() const { return m_ptr; }
};

}  // namespace

using namespace Acts;

namespace ActsPlugins {

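// The constructor loads the module-map triplet file, copies the doublet and
// triplet cut tables to the device, and uploads a flat moduleId -> index
// lookup table that stays resident on the GPU for the lifetime of the object.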
ModuleMapCuda::ModuleMapCuda(const Config &cfg,
                             std::unique_ptr<const Logger> logger_)
    : m_logger(std::move(logger_)), m_cfg(cfg) {
  module_map_triplet<float> moduleMapCpu;
  moduleMapCpu.read_TTree(cfg.moduleMapPath.c_str());
  if (!moduleMapCpu) {
    throw std::runtime_error("Cannot retrieve ModuleMap from " +
                             cfg.moduleMapPath);
  }

  ACTS_DEBUG("ModuleMap GPU block dim: " << m_cfg.gpuBlocks);

  m_cudaModuleMapDoublet =
      std::make_unique<CUDA_module_map_doublet<float>>(moduleMapCpu);
  m_cudaModuleMapDoublet->HostToDevice();
  m_cudaModuleMapTriplet =
      std::make_unique<CUDA_module_map_triplet<float>>(moduleMapCpu);
  m_cudaModuleMapTriplet->HostToDevice();

  ACTS_DEBUG("# of modules = " << moduleMapCpu.module_map().size());

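  // Flatten the module map into parallel key/value arrays for the device-side
  // lookup; the temporary std::map is only used to detect duplicate keys.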
  std::map<std::uint64_t, int> test;

  std::vector<std::uint64_t> keys;
  keys.reserve(m_cudaModuleMapDoublet->module_map().size());
  std::vector<int> vals;
  vals.reserve(m_cudaModuleMapDoublet->module_map().size());

  for (auto [key, value] : m_cudaModuleMapDoublet->module_map()) {
    auto [it, success] = test.insert({key, value});
    if (!success) {
      throw std::runtime_error("Duplicate key in module map");
    }
    keys.push_back(key);
    vals.push_back(value);
  }

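  // Upload the flattened lookup table; it is freed again in the destructor.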
  m_cudaModuleMapSize = m_cudaModuleMapDoublet->module_map().size();
  ACTS_CUDA_CHECK(cudaMalloc(&m_cudaModuleMapKeys,
                             m_cudaModuleMapSize * sizeof(std::uint64_t)));
  ACTS_CUDA_CHECK(
      cudaMalloc(&m_cudaModuleMapVals, m_cudaModuleMapSize * sizeof(int)));

  ACTS_CUDA_CHECK(cudaMemcpy(m_cudaModuleMapKeys, keys.data(),
                             m_cudaModuleMapSize * sizeof(std::uint64_t),
                             cudaMemcpyHostToDevice));
  ACTS_CUDA_CHECK(cudaMemcpy(m_cudaModuleMapVals, vals.data(),
                             m_cudaModuleMapSize * sizeof(int),
                             cudaMemcpyHostToDevice));
}

ModuleMapCuda::~ModuleMapCuda() {
  ACTS_CUDA_CHECK(cudaFree(m_cudaModuleMapKeys));
  ACTS_CUDA_CHECK(cudaFree(m_cudaModuleMapVals));
}

namespace {}

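// Main entry point: copies hit features and module ids to the device,
// de-interleaves and rescales the coordinate features, builds the per-module
// hit ranges, constructs the edges via the module map, and returns node
// features, edge index and edge features as device tensors.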
PipelineTensors ModuleMapCuda::operator()(
    std::vector<float> &inputValues, std::size_t numNodes,
    const std::vector<std::uint64_t> &moduleIds,
    const ExecutionContext &execContext) {
  auto t0 = std::chrono::high_resolution_clock::now();
  assert(execContext.device.isCuda());

  if (moduleIds.empty()) {
    throw NoEdgesError{};
  }

  const auto nHits = moduleIds.size();
  assert(inputValues.size() % moduleIds.size() == 0);
  const auto nFeatures = inputValues.size() / moduleIds.size();
  auto &features = inputValues;

  const dim3 blockDim = m_cfg.gpuBlocks;
  const dim3 gridDimHits = (nHits + blockDim.x - 1) / blockDim.x;
  ACTS_VERBOSE("gridDimHits: " << gridDimHits.x
                               << ", blockDim: " << blockDim.x);

  cudaStream_t stream = cudaStreamLegacy;
  if (execContext.stream) {
    ACTS_DEBUG("Got stream " << *execContext.stream);
    stream = execContext.stream.value();
  }

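  // Copy the interleaved node-feature matrix ([nHits, nFeatures], row-major)
  // to the device; this tensor is returned to the caller unchanged.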
  auto nodeFeatures = Tensor<float>::Create({nHits, nFeatures}, execContext);
  float *cudaNodeFeaturePtr = nodeFeatures.data();
  ACTS_CUDA_CHECK(cudaMemcpyAsync(cudaNodeFeaturePtr, features.data(),
                                  features.size() * sizeof(float),
                                  cudaMemcpyHostToDevice, stream));

  ScopedCudaPtr<std::uint64_t> cudaModuleIds(nHits, stream);
  ACTS_CUDA_CHECK(cudaMemcpyAsync(cudaModuleIds.data(), moduleIds.data(),
                                  nHits * sizeof(std::uint64_t),
                                  cudaMemcpyHostToDevice, stream));

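  // The module-map kernels expect the hit coordinates as separate arrays
  // (structure of arrays), so allocate one buffer holding the six columns
  // r, phi, z, x, y and eta back to back.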
  ScopedCudaPtr<float> cudaNodeFeaturesTransposed(6 * nHits, stream);

  ActsPlugins::detail::CUDA_hit_data<float> inputData{};
  inputData.m_size = nHits;
  inputData.m_cuda_R = cudaNodeFeaturesTransposed.data() + 0 * nHits;
  inputData.m_cuda_phi = cudaNodeFeaturesTransposed.data() + 1 * nHits;
  inputData.m_cuda_z = cudaNodeFeaturesTransposed.data() + 2 * nHits;
  inputData.m_cuda_x = cudaNodeFeaturesTransposed.data() + 3 * nHits;
  inputData.m_cuda_y = cudaNodeFeaturesTransposed.data() + 4 * nHits;
  inputData.m_cuda_eta = cudaNodeFeaturesTransposed.data() + 5 * nHits;

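  // De-interleave the r, phi, z and eta columns from the feature matrix with
  // strided 2D copies: the source stride is one full feature row, the
  // destination stride is a single float, and each copied "row" is one float.
  // x and y are not copied but computed from r and phi further below.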
  constexpr std::size_t rOffset = 0;
  constexpr std::size_t phiOffset = 1;
  constexpr std::size_t zOffset = 2;
  constexpr std::size_t etaOffset = 3;

  const auto srcStride = sizeof(float) * nFeatures;
  const auto dstStride = sizeof(float);
  const auto width = sizeof(float);
  const auto height = nHits;

  ACTS_CUDA_CHECK(cudaMemcpy2DAsync(
      inputData.cuda_R(), dstStride, cudaNodeFeaturePtr + rOffset, srcStride,
      width, height, cudaMemcpyDeviceToDevice, stream));
  ACTS_CUDA_CHECK(cudaMemcpy2DAsync(
      inputData.cuda_phi(), dstStride, cudaNodeFeaturePtr + phiOffset,
      srcStride, width, height, cudaMemcpyDeviceToDevice, stream));
  ACTS_CUDA_CHECK(cudaMemcpy2DAsync(
      inputData.cuda_z(), dstStride, cudaNodeFeaturePtr + zOffset, srcStride,
      width, height, cudaMemcpyDeviceToDevice, stream));
  ACTS_CUDA_CHECK(cudaMemcpy2DAsync(
      inputData.cuda_eta(), dstStride, cudaNodeFeaturePtr + etaOffset,
      srcStride, width, height, cudaMemcpyDeviceToDevice, stream));

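  // Per-module hit counters, with one extra entry so that the exclusive scan
  // below turns the counts directly into hit-range offsets.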
  ScopedCudaPtr<int> cudaNbHits(m_cudaModuleMapSize + 1, stream);
  ACTS_CUDA_CHECK(cudaMemsetAsync(
      cudaNbHits.data(), 0, (m_cudaModuleMapSize + 1) * sizeof(int), stream));

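  // Rescale z, r and phi by the configured factors so the module-map cuts see
  // consistent units, then derive x and y from r and phi.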
  detail::rescaleFeature<<<gridDimHits, blockDim, 0, stream>>>(
      nHits, inputData.cuda_z(), m_cfg.zScale);
  ACTS_CUDA_CHECK(cudaGetLastError());
  detail::rescaleFeature<<<gridDimHits, blockDim, 0, stream>>>(
      nHits, inputData.cuda_R(), m_cfg.rScale);
  ACTS_CUDA_CHECK(cudaGetLastError());
  detail::rescaleFeature<<<gridDimHits, blockDim, 0, stream>>>(
      nHits, inputData.cuda_phi(), m_cfg.phiScale);
  ACTS_CUDA_CHECK(cudaGetLastError());

  detail::computeXandY<<<gridDimHits, blockDim, 0, stream>>>(
      nHits, inputData.cuda_x(), inputData.cuda_y(), inputData.cuda_R(),
      inputData.cuda_phi());
  ACTS_CUDA_CHECK(cudaGetLastError());

  ScopedCudaPtr<std::uint64_t> cudaHitId(nHits, stream);
  detail::iota<<<gridDimHits, blockDim, 0, stream>>>(nHits, cudaHitId.data());
  ACTS_CUDA_CHECK(cudaGetLastError());

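  // Count the hits per module by looking the module ids up in the resident
  // key/value table, then turn the counts into exclusive prefix sums; the
  // resulting offsets are used by makeEdges to locate the hits of each module.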
  detail::mapModuleIdsToNbHits<<<gridDimHits, blockDim, 0, stream>>>(
      cudaNbHits.data(), nHits, cudaModuleIds.data(), m_cudaModuleMapSize,
      m_cudaModuleMapKeys, m_cudaModuleMapVals);
  ACTS_CUDA_CHECK(cudaGetLastError());

  thrust::exclusive_scan(thrust::device.on(stream), cudaNbHits.data(),
                         cudaNbHits.data() + m_cudaModuleMapSize + 1,
                         cudaNbHits.data());
  ACTS_CUDA_CHECK(cudaGetLastError());
  int *cudaHitIndice = cudaNbHits.data();

  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  auto t1 = std::chrono::high_resolution_clock::now();

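  // Build the edge list with the module-map doublet and triplet cuts.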
  const auto edgeData = makeEdges(inputData, cudaHitIndice, stream);
  ACTS_CUDA_CHECK(cudaGetLastError());
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  auto t2 = std::chrono::high_resolution_clock::now();

  if (edgeData.nEdges == 0) {
    throw NoEdgesError{};
  }

  dim3 gridDimEdges = (edgeData.nEdges + blockDim.x - 1) / blockDim.x;
  ACTS_DEBUG("gridDimEdges: " << gridDimEdges.x
                              << ", blockDim: " << blockDim.x);

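  // Compute the edge features from the node features of the source and target
  // hit of every edge.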
  auto edgeFeatures = Tensor<float>::Create({edgeData.nEdges, 6}, execContext);
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  detail::makeEdgeFeatures<<<gridDimEdges, blockDim, 0, stream>>>(
      edgeData.nEdges, edgeData.cudaEdgePtr,
      edgeData.cudaEdgePtr + edgeData.nEdges, nFeatures, cudaNodeFeaturePtr,
      edgeFeatures.data());
  ACTS_CUDA_CHECK(cudaGetLastError());
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

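  // Widen the int edge list into the std::int64_t [2, nEdges] tensor expected
  // downstream, then release the intermediate edge buffer.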
  auto edgeIndex =
      Tensor<std::int64_t>::Create({2, edgeData.nEdges}, execContext);
  thrust::transform(thrust::cuda::par.on(stream), edgeData.cudaEdgePtr,
                    edgeData.cudaEdgePtr + 2 * edgeData.nEdges,
                    edgeIndex.data(),
                    [] __device__(int i) -> std::int64_t { return i; });
  ACTS_CUDA_CHECK(cudaFreeAsync(edgeData.cudaEdgePtr, stream));

  ACTS_CUDA_CHECK(cudaGetLastError());
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  auto t3 = std::chrono::high_resolution_clock::now();

  auto ms = [](auto a, auto b) {
    return std::chrono::duration<double, std::milli>(b - a).count();
  };
  ACTS_DEBUG("Preparation: " << ms(t0, t1));
  ACTS_DEBUG("Inference: " << ms(t1, t2));
  ACTS_DEBUG("Postprocessing: " << ms(t2, t3));

  return {std::move(nodeFeatures),
          std::move(edgeIndex),
          std::move(edgeFeatures),
          {}};
}

struct ArgsortFun {
  int *cudaPtr = nullptr;
  bool __device__ operator()(int l, int r) { return cudaPtr[l] < cudaPtr[r]; }
};

struct CastBoolToInt {
  int __device__ operator()(bool b) { return static_cast<int>(b); }
};

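// Copies the first few source/target indices of an edge list back to the host
// and formats them for verbose logging.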
std::string debugPrintEdges(std::size_t nbEdges, const int *cudaSrc,
                            const int *cudaDst) {
  std::stringstream ss;
  if (nbEdges == 0) {
    return "zero edges remained";
  }
  nbEdges = std::min<std::size_t>(10, nbEdges);
  std::vector<int> src(nbEdges), dst(nbEdges);
  ACTS_CUDA_CHECK(cudaDeviceSynchronize());
  ACTS_CUDA_CHECK(cudaMemcpy(src.data(), cudaSrc, nbEdges * sizeof(int),
                             cudaMemcpyDeviceToHost));
  ACTS_CUDA_CHECK(cudaMemcpy(dst.data(), cudaDst, nbEdges * sizeof(int),
                             cudaMemcpyDeviceToHost));
  for (std::size_t i = 0; i < nbEdges; ++i) {
    ss << src.at(i) << " ";
  }
  ss << "\n";
  for (std::size_t i = 0; i < nbEdges; ++i) {
    ss << dst.at(i) << " ";
  }
  return ss.str();
}

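// Builds the candidate edges on the device: the doublet cuts of the module map
// produce an over-complete set of (hit1, hit2) edges, the triplet cuts then
// flag every edge that is compatible with at least one module triplet, and
// only the flagged edges are compacted into the returned buffer.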
detail::CUDA_edge_data<float> ModuleMapCuda::makeEdges(
    detail::CUDA_hit_data<float> cuda_TThits, int *cuda_hit_indice,
    cudaStream_t &stream) const {
  const dim3 block_dim = m_cfg.gpuBlocks;

  const int nb_doublets = m_cudaModuleMapDoublet->size();
  ACTS_DEBUG("nb doublets " << nb_doublets);
  dim3 grid_dim = ((nb_doublets + block_dim.x - 1) / block_dim.x);

  std::optional<ScopedCudaPtr<int>> cuda_reduced_M1_hits, cuda_reduced_M2_hits;

  ScopedCudaPtr<int> cuda_edge_sum(nb_doublets + 1, stream);

  int nb_doublet_edges{};
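
  // Doublet edge counting is done in two passes: first count, for every module
  // doublet, the hits on its first module; after an exclusive scan, each of
  // these source hits gets one thread in count_doublet_edges, which counts the
  // edges that pass the doublet cuts.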
  ScopedCudaPtr<int> cuda_nb_src_hits_per_doublet(nb_doublets + 1, stream);

  detail::count_src_hits_per_doublet<<<grid_dim, block_dim, 0, stream>>>(
      nb_doublets, m_cudaModuleMapDoublet->cuda_module1(), cuda_hit_indice,
      cuda_nb_src_hits_per_doublet.data());
  ACTS_CUDA_CHECK(cudaGetLastError());

  thrust::exclusive_scan(thrust::device.on(stream),
                         cuda_nb_src_hits_per_doublet.data(),
                         cuda_nb_src_hits_per_doublet.data() + nb_doublets + 1,
                         cuda_nb_src_hits_per_doublet.data());

  int sum_nb_src_hits_per_doublet{};
  ACTS_CUDA_CHECK(
      cudaMemcpyAsync(&sum_nb_src_hits_per_doublet,
                      &cuda_nb_src_hits_per_doublet.data()[nb_doublets],
                      sizeof(int), cudaMemcpyDeviceToHost, stream));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  ACTS_DEBUG("sum_nb_src_hits_per_doublet: " << sum_nb_src_hits_per_doublet);

  if (sum_nb_src_hits_per_doublet == 0) {
    throw NoEdgesError{};
  }

  ScopedCudaPtr<int> cuda_edge_sum_per_src_hit(sum_nb_src_hits_per_doublet + 1,
                                               stream);

  dim3 grid_dim_shpd =
      ((sum_nb_src_hits_per_doublet + block_dim.x - 1) / block_dim.x);
  detail::count_doublet_edges<float><<<grid_dim_shpd, block_dim, 0, stream>>>(
      sum_nb_src_hits_per_doublet, nb_doublets,
      cuda_nb_src_hits_per_doublet.data(),
      m_cudaModuleMapDoublet->cuda_module1(),
      m_cudaModuleMapDoublet->cuda_module2(), cuda_TThits.cuda_R(),
      cuda_TThits.cuda_z(), cuda_TThits.cuda_eta(), cuda_TThits.cuda_phi(),
      m_cudaModuleMapDoublet->cuda_z0_min(),
      m_cudaModuleMapDoublet->cuda_z0_max(),
      m_cudaModuleMapDoublet->cuda_deta_min(),
      m_cudaModuleMapDoublet->cuda_deta_max(),
      m_cudaModuleMapDoublet->cuda_phi_slope_min(),
      m_cudaModuleMapDoublet->cuda_phi_slope_max(),
      m_cudaModuleMapDoublet->cuda_dphi_min(),
      m_cudaModuleMapDoublet->cuda_dphi_max(), cuda_hit_indice,
      cuda_edge_sum_per_src_hit.data(), m_cfg.epsilon);
  ACTS_CUDA_CHECK(cudaGetLastError());

  thrust::exclusive_scan(
      thrust::device.on(stream), cuda_edge_sum_per_src_hit.data(),
      cuda_edge_sum_per_src_hit.data() + sum_nb_src_hits_per_doublet + 1,
      cuda_edge_sum_per_src_hit.data());

  ACTS_CUDA_CHECK(cudaMemcpyAsync(
      &nb_doublet_edges,
      &cuda_edge_sum_per_src_hit.data()[sum_nb_src_hits_per_doublet],
      sizeof(int), cudaMemcpyDeviceToHost, stream));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  ACTS_DEBUG("nb_doublet_edges: " << nb_doublet_edges);
  ACTS_DEBUG("Allocate " << (2ul * nb_doublet_edges * sizeof(int)) * 1.0e-6
                         << " MB for edges");
  cuda_reduced_M1_hits.emplace(nb_doublet_edges, stream);
  cuda_reduced_M2_hits.emplace(nb_doublet_edges, stream);

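  // Second pass: rerun the same per-source-hit loop, this time writing the hit
  // indices of every passing edge to the buffers just allocated, at the
  // offsets provided by the prefix sum.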
  detail::build_doublet_edges<float><<<grid_dim_shpd, block_dim, 0, stream>>>(
      sum_nb_src_hits_per_doublet, nb_doublets,
      cuda_nb_src_hits_per_doublet.data(),
      m_cudaModuleMapDoublet->cuda_module1(),
      m_cudaModuleMapDoublet->cuda_module2(), cuda_TThits.cuda_R(),
      cuda_TThits.cuda_z(), cuda_TThits.cuda_eta(), cuda_TThits.cuda_phi(),
      m_cudaModuleMapDoublet->cuda_z0_min(),
      m_cudaModuleMapDoublet->cuda_z0_max(),
      m_cudaModuleMapDoublet->cuda_deta_min(),
      m_cudaModuleMapDoublet->cuda_deta_max(),
      m_cudaModuleMapDoublet->cuda_phi_slope_min(),
      m_cudaModuleMapDoublet->cuda_phi_slope_max(),
      m_cudaModuleMapDoublet->cuda_dphi_min(),
      m_cudaModuleMapDoublet->cuda_dphi_max(), cuda_hit_indice,
      cuda_reduced_M1_hits->data(), cuda_reduced_M2_hits->data(),
      cuda_edge_sum_per_src_hit.data(), m_cfg.epsilon);
  ACTS_CUDA_CHECK(cudaGetLastError());

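  // Collapse the per-source-hit edge offsets into per-doublet offsets
  // (cuda_edge_sum); the sorting and triplet stages below work on these.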
  detail::computeDoubletEdgeSum<<<grid_dim, block_dim, 0, stream>>>(
      nb_doublets, cuda_nb_src_hits_per_doublet.data(),
      cuda_edge_sum_per_src_hit.data(), cuda_edge_sum.data());
  ACTS_CUDA_CHECK(cudaGetLastError());

  ACTS_VERBOSE("First 10 doublet edges:\n"
               << debugPrintEdges(nb_doublet_edges,
                                  cuda_reduced_M1_hits->data(),
                                  cuda_reduced_M2_hits->data()));
  if (nb_doublet_edges == 0) {
    throw NoEdgesError{};
  }

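  // Argsort the target hits within every doublet's edge range; depending on
  // the configuration this uses either a block-parallel odd-even sort per
  // doublet or a per-thread partial quicksort. The sorted order is consumed by
  // the triplet_cuts kernel below.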
  ScopedCudaPtr<int> cuda_sorted_M2_hits(nb_doublet_edges, stream);

  grid_dim = ((nb_doublet_edges + block_dim.x - 1) / block_dim.x);
  init_vector<<<grid_dim, block_dim, 0, stream>>>(cuda_sorted_M2_hits.data(),
                                                  nb_doublet_edges);
  ACTS_CUDA_CHECK(cudaGetLastError());

  if (m_cfg.moreParallel) {
    dim3 block_dim_even_odd = 64;
    ACTS_DEBUG("Using block_odd_even_sort, grid_dim.x = "
               << nb_doublets << ", block_dim.x = " << block_dim_even_odd.x);
    detail::block_odd_even_sort<<<nb_doublets, block_dim_even_odd, 0, stream>>>(
        cuda_reduced_M2_hits->data(), cuda_edge_sum.data(),
        cuda_sorted_M2_hits.data());
  } else {
    grid_dim = ((nb_doublets + block_dim.x - 1) / block_dim.x);
    partial_quick_sort<<<grid_dim, block_dim, 0, stream>>>(
        cuda_sorted_M2_hits.data(), cuda_reduced_M2_hits->data(),
        cuda_edge_sum.data(), nb_doublets);
    ACTS_CUDA_CHECK(cudaGetLastError());
  }

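  // Recompute the geometric quantities (z0, phi slope, deta, dphi) for every
  // doublet edge and allocate the mask that the triplet cuts fill below.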
  ScopedCudaPtr<float> cuda_z0(nb_doublet_edges, stream),
      cuda_phi_slope(nb_doublet_edges, stream),
      cuda_deta(nb_doublet_edges, stream), cuda_dphi(nb_doublet_edges, stream);
  grid_dim = ((nb_doublet_edges + block_dim.x - 1) / block_dim.x);
  hits_geometric_cuts<<<grid_dim, block_dim, 0, stream>>>(
      cuda_z0.data(), cuda_phi_slope.data(), cuda_deta.data(), cuda_dphi.data(),
      cuda_reduced_M1_hits->data(), cuda_reduced_M2_hits->data(),
      cuda_TThits.cuda_R(), cuda_TThits.cuda_z(), cuda_TThits.cuda_eta(),
      cuda_TThits.cuda_phi(), TMath::Pi(), m_cfg.epsilon, nb_doublet_edges);
  ACTS_CUDA_CHECK(cudaGetLastError());
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  ScopedCudaPtr<bool> cuda_mask(nb_doublet_edges + 1, stream);
  ACTS_CUDA_CHECK(
      cudaMemset(cuda_mask.data(), 0, (nb_doublet_edges + 1) * sizeof(bool)));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

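  // Triplet stage: for every module triplet, combine the doublet edges of its
  // (module1, module2) and (module2, module3) pairs, apply the triplet cuts,
  // and set cuda_mask for every edge that is part of at least one passing
  // triplet (cuda_vertices collects per-hit flags).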
  int nb_triplets = m_cudaModuleMapTriplet->size();
  grid_dim = ((nb_triplets + block_dim.x - 1) / block_dim.x);

  ScopedCudaPtr<bool> cuda_vertices(cuda_TThits.size(), stream);
  ACTS_CUDA_CHECK(cudaMemsetAsync(cuda_vertices.data(), 0,
                                  cuda_TThits.size() * sizeof(bool), stream));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  ScopedCudaPtr<int> cuda_src_hits_per_triplet(nb_triplets + 1, stream);

  detail::count_triplet_hits<<<grid_dim, block_dim, 0, stream>>>(
      nb_triplets, m_cudaModuleMapTriplet->cuda_module12_map(),
      m_cudaModuleMapTriplet->cuda_module23_map(), cuda_edge_sum.data(),
      cuda_src_hits_per_triplet.data());
  ACTS_CUDA_CHECK(cudaGetLastError());

  thrust::exclusive_scan(thrust::device.on(stream),
                         cuda_src_hits_per_triplet.data(),
                         cuda_src_hits_per_triplet.data() + nb_triplets + 1,
                         cuda_src_hits_per_triplet.data());

  int nb_src_hits_per_triplet_sum{};
  ACTS_CUDA_CHECK(
      cudaMemcpyAsync(&nb_src_hits_per_triplet_sum,
                      &cuda_src_hits_per_triplet.data()[nb_triplets],
                      sizeof(int), cudaMemcpyDeviceToHost, stream));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  ACTS_DEBUG("nb_src_hits_per_triplet_sum: " << nb_src_hits_per_triplet_sum);
  if (nb_src_hits_per_triplet_sum == 0) {
    throw NoEdgesError{};
  }

  dim3 grid_dim_shpt =
      ((nb_src_hits_per_triplet_sum + block_dim.x - 1) / block_dim.x);

  detail::triplet_cuts<float><<<grid_dim_shpt, block_dim, 0, stream>>>(
      nb_src_hits_per_triplet_sum, nb_triplets,
      cuda_src_hits_per_triplet.data(),
      m_cudaModuleMapTriplet->cuda_module12_map(),
      m_cudaModuleMapTriplet->cuda_module23_map(), cuda_TThits.cuda_x(),
      cuda_TThits.cuda_y(), cuda_TThits.cuda_z(), cuda_TThits.cuda_R(),
      cuda_z0.data(), cuda_phi_slope.data(), cuda_deta.data(), cuda_dphi.data(),
      m_cudaModuleMapTriplet->module12().cuda_z0_min(),
      m_cudaModuleMapTriplet->module12().cuda_z0_max(),
      m_cudaModuleMapTriplet->module12().cuda_deta_min(),
      m_cudaModuleMapTriplet->module12().cuda_deta_max(),
      m_cudaModuleMapTriplet->module12().cuda_phi_slope_min(),
      m_cudaModuleMapTriplet->module12().cuda_phi_slope_max(),
      m_cudaModuleMapTriplet->module12().cuda_dphi_min(),
      m_cudaModuleMapTriplet->module12().cuda_dphi_max(),
      m_cudaModuleMapTriplet->module23().cuda_z0_min(),
      m_cudaModuleMapTriplet->module23().cuda_z0_max(),
      m_cudaModuleMapTriplet->module23().cuda_deta_min(),
      m_cudaModuleMapTriplet->module23().cuda_deta_max(),
      m_cudaModuleMapTriplet->module23().cuda_phi_slope_min(),
      m_cudaModuleMapTriplet->module23().cuda_phi_slope_max(),
      m_cudaModuleMapTriplet->module23().cuda_dphi_min(),
      m_cudaModuleMapTriplet->module23().cuda_dphi_max(),
      m_cudaModuleMapTriplet->cuda_diff_dydx_min(),
      m_cudaModuleMapTriplet->cuda_diff_dydx_max(),
      m_cudaModuleMapTriplet->cuda_diff_dzdr_min(),
      m_cudaModuleMapTriplet->cuda_diff_dzdr_max(), TMath::Pi(),
      cuda_reduced_M1_hits->data(), cuda_reduced_M2_hits->data(),
      cuda_sorted_M2_hits.data(), cuda_edge_sum.data(), cuda_vertices.data(),
      cuda_mask.data(), m_cfg.epsilon);
  ACTS_CUDA_CHECK(cudaGetLastError());

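  // Count the surviving edges with an exclusive scan over the mask; the scan
  // result also serves as the scatter offsets for the compaction below.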
  ScopedCudaPtr<int> cuda_mask_sum(nb_doublet_edges + 1, stream);
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  thrust::transform_exclusive_scan(thrust::device.on(stream), cuda_mask.data(),
                                   cuda_mask.data() + (nb_doublet_edges + 1),
                                   cuda_mask_sum.data(), CastBoolToInt{}, 0,
                                   thrust::plus<int>());

  int nb_graph_edges{};
  ACTS_CUDA_CHECK(cudaMemcpyAsync(&nb_graph_edges,
                                  &cuda_mask_sum.data()[nb_doublet_edges],
                                  sizeof(int), cudaMemcpyDeviceToHost, stream));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  ACTS_DEBUG("nb_graph_edges: " << nb_graph_edges);
  if (nb_graph_edges == 0) {
    throw NoEdgesError{};
  }

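  // Allocate the final edge buffer (source and target indices stored back to
  // back) and compact the surviving doublet edges into it.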
  int *cuda_graph_edge_ptr{};
  ACTS_CUDA_CHECK(cudaMallocAsync(&cuda_graph_edge_ptr,
                                  2 * nb_graph_edges * sizeof(int), stream));
  int *cuda_graph_M1_hits = cuda_graph_edge_ptr;
  int *cuda_graph_M2_hits = cuda_graph_edge_ptr + nb_graph_edges;

  grid_dim = ((nb_doublet_edges + block_dim.x - 1) / block_dim.x);
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  compact_stream<<<grid_dim, block_dim, 0, stream>>>(
      cuda_graph_M1_hits, cuda_reduced_M1_hits->data(), cuda_mask.data(),
      cuda_mask_sum.data(), nb_doublet_edges);
  ACTS_CUDA_CHECK(cudaGetLastError());
  compact_stream<<<grid_dim, block_dim, 0, stream>>>(
      cuda_graph_M2_hits, cuda_reduced_M2_hits->data(), cuda_mask.data(),
      cuda_mask_sum.data(), nb_doublet_edges);
  ACTS_CUDA_CHECK(cudaGetLastError());

  ACTS_VERBOSE("First 10 graph edges:\n"
               << debugPrintEdges(nb_graph_edges, cuda_graph_M1_hits,
                                  cuda_graph_M2_hits));

  detail::CUDA_edge_data<float> edge_data{};
  edge_data.nEdges = nb_graph_edges;
  edge_data.cudaEdgePtr = cuda_graph_edge_ptr;

  return edge_data;
}

}  // namespace ActsPlugins