// This file is part of the ACTS project.
//
// Copyright (C) 2016 CERN for the benefit of the ACTS project
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#include "ActsPlugins/Gnn/ModuleMapCuda.hpp"
#include "ActsPlugins/Gnn/detail/CudaUtils.cuh"
#include "ActsPlugins/Gnn/detail/CudaUtils.hpp"
#include "ActsPlugins/Gnn/detail/ModuleMapUtils.cuh"

#include <CUDA_graph_creator>
#include <CUDA_module_map_doublet>
#include <CUDA_module_map_triplet>
#include <TTree_hits>

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <map>
#include <memory>
#include <optional>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <cub/block/block_merge_sort.cuh>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <thrust/sort.h>
#include <thrust/transform.h>
#include <thrust/transform_scan.h>
using Clock = std::chrono::high_resolution_clock;

namespace {

/// RAII wrapper for a stream-ordered CUDA allocation: memory is obtained
/// with cudaMallocAsync and handed back with cudaFreeAsync on the same
/// stream when the object goes out of scope.
template <typename T>
class ScopedCudaPtr {
  cudaStream_t *m_stream = nullptr;
  T *m_ptr = nullptr;

 public:
  ScopedCudaPtr(std::size_t n, cudaStream_t &stream) : m_stream(&stream) {
    ACTS_CUDA_CHECK(cudaMallocAsync(&m_ptr, n * sizeof(T), stream));
  }

  ScopedCudaPtr(const ScopedCudaPtr &) = delete;
  ScopedCudaPtr(ScopedCudaPtr &&) = delete;

  ~ScopedCudaPtr() { ACTS_CUDA_CHECK(cudaFreeAsync(m_ptr, *m_stream)); }

  T *data() { return m_ptr; }
  const T *data() const { return m_ptr; }
};
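
// Illustrative usage (not from the original source; someKernel is a
// placeholder): because the allocation is stream-ordered, the buffer must
// only be used by work submitted to the same stream:
//
//   ScopedCudaPtr<float> buf(1024, stream);
//   someKernel<<<grid, block, 0, stream>>>(buf.data(), 1024);
//   // freed via cudaFreeAsync on the same stream at scope exit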

}  // namespace

using namespace Acts;

namespace ActsPlugins {

ModuleMapCuda::ModuleMapCuda(const Config &cfg,
                             std::unique_ptr<const Logger> logger_)
    : m_logger(std::move(logger_)), m_cfg(cfg) {
  module_map_triplet<float> moduleMapCpu;
  moduleMapCpu.read_TTree(cfg.moduleMapPath.c_str());
  if (!moduleMapCpu) {
    throw std::runtime_error("Cannot retrieve ModuleMap from " +
                             cfg.moduleMapPath);
  }

  ACTS_DEBUG("ModuleMap GPU block dim: " << m_cfg.gpuBlocks);

  m_cudaModuleMapDoublet =
      std::make_unique<CUDA_module_map_doublet<float>>(moduleMapCpu);
  m_cudaModuleMapDoublet->HostToDevice();
  m_cudaModuleMapTriplet =
      std::make_unique<CUDA_module_map_triplet<float>>(moduleMapCpu);
  m_cudaModuleMapTriplet->HostToDevice();

  ACTS_DEBUG("# of modules = " << moduleMapCpu.module_map().size());

  // Flatten the module map into key/value arrays, checking for duplicate
  // module IDs on the way
  std::map<std::uint64_t, int> test;

  std::vector<std::uint64_t> keys;
  keys.reserve(m_cudaModuleMapDoublet->module_map().size());
  std::vector<int> vals;
  vals.reserve(m_cudaModuleMapDoublet->module_map().size());

  for (auto [key, value] : m_cudaModuleMapDoublet->module_map()) {
    auto [it, success] = test.insert({key, value});
    if (!success) {
      throw std::runtime_error("Duplicate key in module map");
    }
    keys.push_back(key);
    vals.push_back(value);
  }

  // copy module map to device
  m_cudaModuleMapSize = m_cudaModuleMapDoublet->module_map().size();
  ACTS_CUDA_CHECK(cudaMalloc(&m_cudaModuleMapKeys,
                             m_cudaModuleMapSize * sizeof(std::uint64_t)));
  ACTS_CUDA_CHECK(
      cudaMalloc(&m_cudaModuleMapVals, m_cudaModuleMapSize * sizeof(int)));

  ACTS_CUDA_CHECK(cudaMemcpy(m_cudaModuleMapKeys, keys.data(),
                             m_cudaModuleMapSize * sizeof(std::uint64_t),
                             cudaMemcpyHostToDevice));
  ACTS_CUDA_CHECK(cudaMemcpy(m_cudaModuleMapVals, vals.data(),
                             m_cudaModuleMapSize * sizeof(int),
                             cudaMemcpyHostToDevice));
}

ModuleMapCuda::~ModuleMapCuda() {
  ACTS_CUDA_CHECK(cudaFree(m_cudaModuleMapKeys));
  ACTS_CUDA_CHECK(cudaFree(m_cudaModuleMapVals));
}

PipelineTensors ModuleMapCuda::operator()(
    std::vector<float> &inputValues, std::size_t numNodes,
    const std::vector<std::uint64_t> &moduleIds,
    const ExecutionContext &execContext) {
  auto t0 = Clock::now();
  assert(execContext.device.isCuda());

  if (moduleIds.empty()) {
    throw NoEdgesError{};
  }

  const auto nHits = moduleIds.size();
  assert(inputValues.size() % moduleIds.size() == 0);
  const auto nFeatures = inputValues.size() / moduleIds.size();
  auto &features = inputValues;

  const dim3 blockDim = m_cfg.gpuBlocks;
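  // One thread per hit: integer ceil-division so that a partially filled
  // last block covers the remainder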
  const dim3 gridDimHits = (nHits + blockDim.x - 1) / blockDim.x;
  ACTS_VERBOSE("gridDimHits: " << gridDimHits.x
                               << ", blockDim: " << blockDim.x);

  // Get stream if available, otherwise use the default legacy stream
  cudaStream_t stream = cudaStreamLegacy;
  if (execContext.stream) {
    ACTS_DEBUG("Got stream " << *execContext.stream);
    stream = execContext.stream.value();
  }

  /////////////////////////
  // Prepare input data
  /////////////////////////

  // Full node features to device

  auto nodeFeatures = Tensor<float>::Create({nHits, nFeatures}, execContext);
  float *cudaNodeFeaturePtr = nodeFeatures.data();
  ACTS_CUDA_CHECK(cudaMemcpyAsync(cudaNodeFeaturePtr, features.data(),
                                  features.size() * sizeof(float),
                                  cudaMemcpyHostToDevice, stream));

  // Module IDs to device
  ScopedCudaPtr<std::uint64_t> cudaModuleIds(nHits, stream);
  ACTS_CUDA_CHECK(cudaMemcpyAsync(cudaModuleIds.data(), moduleIds.data(),
                                  nHits * sizeof(std::uint64_t),
                                  cudaMemcpyHostToDevice, stream));

  // Allocate the transposed node features needed by the module map kernels
  // in one contiguous block
  ScopedCudaPtr<float> cudaNodeFeaturesTransposed(6 * nHits, stream);

  ActsPlugins::detail::CUDA_hit_data<float> inputData{};
  inputData.m_size = nHits;
  inputData.m_cuda_R = cudaNodeFeaturesTransposed.data() + 0 * nHits;
  inputData.m_cuda_phi = cudaNodeFeaturesTransposed.data() + 1 * nHits;
  inputData.m_cuda_z = cudaNodeFeaturesTransposed.data() + 2 * nHits;
  inputData.m_cuda_x = cudaNodeFeaturesTransposed.data() + 3 * nHits;
  inputData.m_cuda_y = cudaNodeFeaturesTransposed.data() + 4 * nHits;
  inputData.m_cuda_eta = cudaNodeFeaturesTransposed.data() + 5 * nHits;

  // Node features for module map graph
  constexpr std::size_t rOffset = 0;
  constexpr std::size_t phiOffset = 1;
  constexpr std::size_t zOffset = 2;
  constexpr std::size_t etaOffset = 3;

  const auto srcStride = sizeof(float) * nFeatures;
  const auto dstStride = sizeof(float);  // contiguous in destination
  const auto width = sizeof(float);      // only copy 1 column
  const auto height = nHits;
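
  // Each cudaMemcpy2DAsync below is in effect a strided gather: with a copy
  // width of one float and a source pitch of nFeatures floats, row i of the
  // 2D copy picks element (i, offset) of the row-major [nHits, nFeatures]
  // feature matrix, so a single feature column becomes contiguous in the
  // destination without a custom transpose kernel.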

  ACTS_CUDA_CHECK(cudaMemcpy2DAsync(
      inputData.cuda_R(), dstStride, cudaNodeFeaturePtr + rOffset, srcStride,
      width, height, cudaMemcpyDeviceToDevice, stream));
  ACTS_CUDA_CHECK(cudaMemcpy2DAsync(
      inputData.cuda_phi(), dstStride, cudaNodeFeaturePtr + phiOffset,
      srcStride, width, height, cudaMemcpyDeviceToDevice, stream));
  ACTS_CUDA_CHECK(cudaMemcpy2DAsync(
      inputData.cuda_z(), dstStride, cudaNodeFeaturePtr + zOffset, srcStride,
      width, height, cudaMemcpyDeviceToDevice, stream));
  ACTS_CUDA_CHECK(cudaMemcpy2DAsync(
      inputData.cuda_eta(), dstStride, cudaNodeFeaturePtr + etaOffset,
      srcStride, width, height, cudaMemcpyDeviceToDevice, stream));

  // Allocate helper memory for the per-module hit counts
  ScopedCudaPtr<int> cudaNbHits(m_cudaModuleMapSize + 1, stream);
  ACTS_CUDA_CHECK(cudaMemsetAsync(
      cudaNbHits.data(), 0, (m_cudaModuleMapSize + 1) * sizeof(int), stream));

  // Preprocess features
  detail::rescaleFeature<<<gridDimHits, blockDim, 0, stream>>>(
      nHits, inputData.cuda_z(), m_cfg.zScale);
  ACTS_CUDA_CHECK(cudaGetLastError());
  detail::rescaleFeature<<<gridDimHits, blockDim, 0, stream>>>(
      nHits, inputData.cuda_R(), m_cfg.rScale);
  ACTS_CUDA_CHECK(cudaGetLastError());
  detail::rescaleFeature<<<gridDimHits, blockDim, 0, stream>>>(
      nHits, inputData.cuda_phi(), m_cfg.phiScale);
  ACTS_CUDA_CHECK(cudaGetLastError());

  detail::computeXandY<<<gridDimHits, blockDim, 0, stream>>>(
      nHits, inputData.cuda_x(), inputData.cuda_y(), inputData.cuda_R(),
      inputData.cuda_phi());
  ACTS_CUDA_CHECK(cudaGetLastError());

  ScopedCudaPtr<std::uint64_t> cudaHitId(nHits, stream);
  detail::iota<<<gridDimHits, blockDim, 0, stream>>>(nHits, cudaHitId.data());
  ACTS_CUDA_CHECK(cudaGetLastError());

  detail::mapModuleIdsToNbHits<<<gridDimHits, blockDim, 0, stream>>>(
      cudaNbHits.data(), nHits, cudaModuleIds.data(), m_cudaModuleMapSize,
      m_cudaModuleMapKeys, m_cudaModuleMapVals);
  ACTS_CUDA_CHECK(cudaGetLastError());

  thrust::exclusive_scan(thrust::device.on(stream), cudaNbHits.data(),
                         cudaNbHits.data() + m_cudaModuleMapSize + 1,
                         cudaNbHits.data());
  ACTS_CUDA_CHECK(cudaGetLastError());
  int *cudaHitIndice = cudaNbHits.data();
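
  // After the exclusive scan, cudaHitIndice acts as a CSR-style offset
  // array: assuming the hits are grouped by module (the downstream kernels
  // index hit ranges this way), the hits of module-map entry m occupy the
  // range [cudaHitIndice[m], cudaHitIndice[m+1]), and the extra slot at the
  // end holds the total count.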

  ///////////////////////////////////
  // Perform module map inference
  ///////////////////////////////////

  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  auto t1 = Clock::now();

  // TODO refactor this to avoid the inputData type in this form
  const auto edgeData = makeEdges(inputData, cudaHitIndice, stream);
  ACTS_CUDA_CHECK(cudaGetLastError());
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  auto t2 = Clock::now();

  if (edgeData.nEdges == 0) {
    throw NoEdgesError{};
  }

  dim3 gridDimEdges = (edgeData.nEdges + blockDim.x - 1) / blockDim.x;
  ACTS_DEBUG("gridDimEdges: " << gridDimEdges.x
                              << ", blockDim: " << blockDim.x);

  // Make edge features
  auto edgeFeatures = Tensor<float>::Create({edgeData.nEdges, 6}, execContext);
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  detail::makeEdgeFeatures<<<gridDimEdges, blockDim, 0, stream>>>(
      edgeData.nEdges, edgeData.cudaEdgePtr,
      edgeData.cudaEdgePtr + edgeData.nEdges, nFeatures, cudaNodeFeaturePtr,
      edgeFeatures.data());
  ACTS_CUDA_CHECK(cudaGetLastError());
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  auto edgeIndex =
      Tensor<std::int64_t>::Create({2, edgeData.nEdges}, execContext);
  thrust::transform(thrust::cuda::par.on(stream), edgeData.cudaEdgePtr,
                    edgeData.cudaEdgePtr + 2 * edgeData.nEdges,
                    edgeIndex.data(),
                    [] __device__(int i) -> std::int64_t { return i; });
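  // This transform only widens the edge list from int to the std::int64_t
  // expected by the [2, nEdges] edge-index tensor; note that the __device__
  // lambda requires compiling with nvcc's --extended-lambda flag.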
  ACTS_CUDA_CHECK(cudaFreeAsync(edgeData.cudaEdgePtr, stream));

  ACTS_CUDA_CHECK(cudaGetLastError());
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  auto t3 = Clock::now();

  auto ms = [](auto a, auto b) {
    return std::chrono::duration<double, std::milli>(b - a).count();
  };
  ACTS_DEBUG("Preparation: " << ms(t0, t1));
  ACTS_DEBUG("Inference: " << ms(t1, t2));
  ACTS_DEBUG("Postprocessing: " << ms(t2, t3));

  return {std::move(nodeFeatures),
          std::move(edgeIndex),
          std::move(edgeFeatures),
          {}};
}

struct ArgsortFun {
  int *cudaPtr = nullptr;
  bool __device__ operator()(int l, int r) { return cudaPtr[l] < cudaPtr[r]; }
};

struct CastBoolToInt {
  int __device__ operator()(bool b) { return static_cast<int>(b); }
};
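
// CastBoolToInt lets the transform_exclusive_scan over the edge mask in
// makeEdges accumulate in int arithmetic; scanning the bool values directly
// would risk accumulating in bool and saturating at 1.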

std::string debugPrintEdges(std::size_t nbEdges, const int *cudaSrc,
                            const int *cudaDst) {
  std::stringstream ss;
  if (nbEdges == 0) {
    return "zero edges remained";
  }
  // use an explicit template argument instead of the 10ul literal so the
  // comparison type matches std::size_t on all platforms
  nbEdges = std::min<std::size_t>(10, nbEdges);
  std::vector<int> src(nbEdges), dst(nbEdges);
  ACTS_CUDA_CHECK(cudaDeviceSynchronize());
  ACTS_CUDA_CHECK(cudaMemcpy(src.data(), cudaSrc, nbEdges * sizeof(int),
                             cudaMemcpyDeviceToHost));
  ACTS_CUDA_CHECK(cudaMemcpy(dst.data(), cudaDst, nbEdges * sizeof(int),
                             cudaMemcpyDeviceToHost));
  for (std::size_t i = 0; i < nbEdges; ++i) {
    ss << src.at(i) << " ";
  }
  ss << "\n";
  for (std::size_t i = 0; i < nbEdges; ++i) {
    ss << dst.at(i) << " ";
  }
  return ss.str();
}

detail::CUDA_edge_data<float> ModuleMapCuda::makeEdges(
    detail::CUDA_hit_data<float> cuda_TThits, int *cuda_hit_indice,
    cudaStream_t &stream) const {
  const dim3 block_dim = m_cfg.gpuBlocks;
  // ----------------------------------
  // memory allocation for hits + edges
  // ----------------------------------
  const int nb_doublets = m_cudaModuleMapDoublet->size();
  ACTS_DEBUG("nb doublets " << nb_doublets);
  dim3 grid_dim = ((nb_doublets + block_dim.x - 1) / block_dim.x);

  // The reduced hit buffer pointers need to be visible outside the scope
  // below, so they are declared here and only allocated once the
  // doublet-edge count is known
  std::optional<ScopedCudaPtr<int>> cuda_reduced_M1_hits, cuda_reduced_M2_hits;

  ScopedCudaPtr<int> cuda_edge_sum(nb_doublets + 1, stream);

  int nb_doublet_edges{};

  // Algorithm to build edges in parallel for each hit+doublet combination
  // =====================================================================
  //
  // The motivation for this is the work imbalance between different doublets:
  // by essentially pulling the outer loop over the hits on a module out of
  // the kernel, there is a better chance that the work is evenly distributed.
  //
  // a) Assume hit ids
  // 0 1 2 3 4 5
  //
  // b) Assume module ids for hits
  // 0 0 1 1 2 3
  //
  // c) Assume doublet module map
  // 0: 0 -> 1
  // 1: 0 -> 2
  // 2: 1 -> 2
  // 3: 2 -> 3
  //
  // d) Count start hits per doublet
  // 2 2 2 1
  //
  // e) Prefix sum
  // 0 2 4 6 7
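  //
  // f) Each of the 7 resulting (doublet, source-hit) work items is handled
  //    by one thread; a flat item index is mapped back to its doublet by
  //    locating it in the prefix-sum array, e.g. item 5 falls into the
  //    range [4, 6) and therefore belongs to doublet 2.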

  ScopedCudaPtr<int> cuda_nb_src_hits_per_doublet(nb_doublets + 1, stream);

  detail::count_src_hits_per_doublet<<<grid_dim, block_dim, 0, stream>>>(
      nb_doublets, m_cudaModuleMapDoublet->cuda_module1(), cuda_hit_indice,
      cuda_nb_src_hits_per_doublet.data());
  ACTS_CUDA_CHECK(cudaGetLastError());

  thrust::exclusive_scan(thrust::device.on(stream),
                         cuda_nb_src_hits_per_doublet.data(),
                         cuda_nb_src_hits_per_doublet.data() + nb_doublets + 1,
                         cuda_nb_src_hits_per_doublet.data());
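
  // The count array has one extra slot so that, after the exclusive scan,
  // its last element holds the total number of work items and can be read
  // back with a single device-to-host copy.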

  int sum_nb_src_hits_per_doublet{};
  ACTS_CUDA_CHECK(
      cudaMemcpyAsync(&sum_nb_src_hits_per_doublet,
                      &cuda_nb_src_hits_per_doublet.data()[nb_doublets],
                      sizeof(int), cudaMemcpyDeviceToHost, stream));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  ACTS_DEBUG("sum_nb_src_hits_per_doublet: " << sum_nb_src_hits_per_doublet);

  if (sum_nb_src_hits_per_doublet == 0) {
    throw NoEdgesError{};
  }

  ScopedCudaPtr<int> cuda_edge_sum_per_src_hit(sum_nb_src_hits_per_doublet + 1,
                                               stream);

  dim3 grid_dim_shpd =
      ((sum_nb_src_hits_per_doublet + block_dim.x - 1) / block_dim.x);
  detail::count_doublet_edges<float><<<grid_dim_shpd, block_dim, 0, stream>>>(
      sum_nb_src_hits_per_doublet, nb_doublets,
      cuda_nb_src_hits_per_doublet.data(),
      m_cudaModuleMapDoublet->cuda_module1(),
      m_cudaModuleMapDoublet->cuda_module2(), cuda_TThits.cuda_R(),
      cuda_TThits.cuda_z(), cuda_TThits.cuda_eta(), cuda_TThits.cuda_phi(),
      m_cudaModuleMapDoublet->cuda_z0_min(),
      m_cudaModuleMapDoublet->cuda_z0_max(),
      m_cudaModuleMapDoublet->cuda_deta_min(),
      m_cudaModuleMapDoublet->cuda_deta_max(),
      m_cudaModuleMapDoublet->cuda_phi_slope_min(),
      m_cudaModuleMapDoublet->cuda_phi_slope_max(),
      m_cudaModuleMapDoublet->cuda_dphi_min(),
      m_cudaModuleMapDoublet->cuda_dphi_max(), cuda_hit_indice,
      cuda_edge_sum_per_src_hit.data(), m_cfg.epsilon);
  ACTS_CUDA_CHECK(cudaGetLastError());

  thrust::exclusive_scan(
      thrust::device.on(stream), cuda_edge_sum_per_src_hit.data(),
      cuda_edge_sum_per_src_hit.data() + sum_nb_src_hits_per_doublet + 1,
      cuda_edge_sum_per_src_hit.data());

  ACTS_CUDA_CHECK(cudaMemcpyAsync(
      &nb_doublet_edges,
      &cuda_edge_sum_per_src_hit.data()[sum_nb_src_hits_per_doublet],
      sizeof(int), cudaMemcpyDeviceToHost, stream));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  ACTS_DEBUG("nb_doublet_edges: " << nb_doublet_edges);
  ACTS_DEBUG("Allocate " << (2ul * nb_doublet_edges * sizeof(int)) * 1.0e-6
                         << " MB for edges");
  cuda_reduced_M1_hits.emplace(nb_doublet_edges, stream);
  cuda_reduced_M2_hits.emplace(nb_doublet_edges, stream);

  detail::build_doublet_edges<float><<<grid_dim_shpd, block_dim, 0, stream>>>(
      sum_nb_src_hits_per_doublet, nb_doublets,
      cuda_nb_src_hits_per_doublet.data(),
      m_cudaModuleMapDoublet->cuda_module1(),
      m_cudaModuleMapDoublet->cuda_module2(), cuda_TThits.cuda_R(),
      cuda_TThits.cuda_z(), cuda_TThits.cuda_eta(), cuda_TThits.cuda_phi(),
      m_cudaModuleMapDoublet->cuda_z0_min(),
      m_cudaModuleMapDoublet->cuda_z0_max(),
      m_cudaModuleMapDoublet->cuda_deta_min(),
      m_cudaModuleMapDoublet->cuda_deta_max(),
      m_cudaModuleMapDoublet->cuda_phi_slope_min(),
      m_cudaModuleMapDoublet->cuda_phi_slope_max(),
      m_cudaModuleMapDoublet->cuda_dphi_min(),
      m_cudaModuleMapDoublet->cuda_dphi_max(), cuda_hit_indice,
      cuda_reduced_M1_hits->data(), cuda_reduced_M2_hits->data(),
      cuda_edge_sum_per_src_hit.data(), m_cfg.epsilon);
  ACTS_CUDA_CHECK(cudaGetLastError());

  detail::computeDoubletEdgeSum<<<grid_dim, block_dim, 0, stream>>>(
      nb_doublets, cuda_nb_src_hits_per_doublet.data(),
      cuda_edge_sum_per_src_hit.data(), cuda_edge_sum.data());
  ACTS_CUDA_CHECK(cudaGetLastError());

  ACTS_VERBOSE("First 10 doublet edges:\n"
               << debugPrintEdges(nb_doublet_edges,
                                  cuda_reduced_M1_hits->data(),
                                  cuda_reduced_M2_hits->data()));
  if (nb_doublet_edges == 0) {
    throw NoEdgesError{};
  }
  //---------------------------------------------
  // reduce the number of hits and sort the M2 hits by hit id
  //---------------------------------------------

  ScopedCudaPtr<int> cuda_sorted_M2_hits(nb_doublet_edges, stream);

  grid_dim = ((nb_doublet_edges + block_dim.x - 1) / block_dim.x);
  init_vector<<<grid_dim, block_dim, 0, stream>>>(cuda_sorted_M2_hits.data(),
                                                  nb_doublet_edges);
  ACTS_CUDA_CHECK(cudaGetLastError());

  if (m_cfg.moreParallel) {
    dim3 block_dim_even_odd = 64;
    ACTS_DEBUG("Using block_odd_even_sort, grid_dim.x = "
               << nb_doublets << ", block_dim.x = " << block_dim_even_odd.x);
    detail::block_odd_even_sort<<<nb_doublets, block_dim_even_odd, 0, stream>>>(
        cuda_reduced_M2_hits->data(), cuda_edge_sum.data(),
        cuda_sorted_M2_hits.data());
    ACTS_CUDA_CHECK(cudaGetLastError());
  } else {
    grid_dim = ((nb_doublets + block_dim.x - 1) / block_dim.x);
    partial_quick_sort<<<grid_dim, block_dim, 0, stream>>>(
        cuda_sorted_M2_hits.data(), cuda_reduced_M2_hits->data(),
        cuda_edge_sum.data(), nb_doublets);
    ACTS_CUDA_CHECK(cudaGetLastError());
  }
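
  // Both branches sort each doublet's M2 hit range (delimited by
  // cuda_edge_sum) independently, writing an argsort permutation into
  // cuda_sorted_M2_hits (pre-filled with identity indices by init_vector):
  // the moreParallel variant assigns one thread block per doublet and
  // presumably sorts cooperatively via odd-even exchange, while the default
  // variant handles one doublet per thread.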

  // -----------------------------
  // build doublet geometric cuts
  // -----------------------------

  ScopedCudaPtr<float> cuda_z0(nb_doublet_edges, stream),
      cuda_phi_slope(nb_doublet_edges, stream),
      cuda_deta(nb_doublet_edges, stream), cuda_dphi(nb_doublet_edges, stream);
  grid_dim = ((nb_doublet_edges + block_dim.x - 1) / block_dim.x);
  hits_geometric_cuts<<<grid_dim, block_dim, 0, stream>>>(
      cuda_z0.data(), cuda_phi_slope.data(), cuda_deta.data(), cuda_dphi.data(),
      cuda_reduced_M1_hits->data(), cuda_reduced_M2_hits->data(),
      cuda_TThits.cuda_R(), cuda_TThits.cuda_z(), cuda_TThits.cuda_eta(),
      cuda_TThits.cuda_phi(), TMath::Pi(), m_cfg.epsilon, nb_doublet_edges);
  ACTS_CUDA_CHECK(cudaGetLastError());
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  ScopedCudaPtr<bool> cuda_mask(nb_doublet_edges + 1, stream);
  ACTS_CUDA_CHECK(
      cudaMemset(cuda_mask.data(), 0, (nb_doublet_edges + 1) * sizeof(bool)));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  // -------------------------
  // loop over module triplets
  // -------------------------
  int nb_triplets = m_cudaModuleMapTriplet->size();
  grid_dim = ((nb_triplets + block_dim.x - 1) / block_dim.x);

  ScopedCudaPtr<bool> cuda_vertices(cuda_TThits.size(), stream);
  ACTS_CUDA_CHECK(cudaMemsetAsync(cuda_vertices.data(), 0,
                                  cuda_TThits.size() * sizeof(bool), stream));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  // Allocate memory for the number of source hits per triplet
  ScopedCudaPtr<int> cuda_src_hits_per_triplet(nb_triplets + 1, stream);

  detail::count_triplet_hits<<<grid_dim, block_dim, 0, stream>>>(
      nb_triplets, m_cudaModuleMapTriplet->cuda_module12_map(),
      m_cudaModuleMapTriplet->cuda_module23_map(), cuda_edge_sum.data(),
      cuda_src_hits_per_triplet.data());
  ACTS_CUDA_CHECK(cudaGetLastError());

  // Perform prefix sum to get the offset for each triplet
  thrust::exclusive_scan(thrust::device.on(stream),
                         cuda_src_hits_per_triplet.data(),
                         cuda_src_hits_per_triplet.data() + nb_triplets + 1,
                         cuda_src_hits_per_triplet.data());

  int nb_src_hits_per_triplet_sum{};
  ACTS_CUDA_CHECK(
      cudaMemcpyAsync(&nb_src_hits_per_triplet_sum,
                      &cuda_src_hits_per_triplet.data()[nb_triplets],
                      sizeof(int), cudaMemcpyDeviceToHost, stream));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  ACTS_DEBUG("nb_src_hits_per_triplet_sum: " << nb_src_hits_per_triplet_sum);
  if (nb_src_hits_per_triplet_sum == 0) {
    throw NoEdgesError{};
  }

  dim3 grid_dim_shpt =
      ((nb_src_hits_per_triplet_sum + block_dim.x - 1) / block_dim.x);

  detail::triplet_cuts<float><<<grid_dim_shpt, block_dim, 0, stream>>>(
      nb_src_hits_per_triplet_sum, nb_triplets,
      cuda_src_hits_per_triplet.data(),
      m_cudaModuleMapTriplet->cuda_module12_map(),
      m_cudaModuleMapTriplet->cuda_module23_map(), cuda_TThits.cuda_x(),
      cuda_TThits.cuda_y(), cuda_TThits.cuda_z(), cuda_TThits.cuda_R(),
      cuda_z0.data(), cuda_phi_slope.data(), cuda_deta.data(), cuda_dphi.data(),
      m_cudaModuleMapTriplet->module12().cuda_z0_min(),
      m_cudaModuleMapTriplet->module12().cuda_z0_max(),
      m_cudaModuleMapTriplet->module12().cuda_deta_min(),
      m_cudaModuleMapTriplet->module12().cuda_deta_max(),
      m_cudaModuleMapTriplet->module12().cuda_phi_slope_min(),
      m_cudaModuleMapTriplet->module12().cuda_phi_slope_max(),
      m_cudaModuleMapTriplet->module12().cuda_dphi_min(),
      m_cudaModuleMapTriplet->module12().cuda_dphi_max(),
      m_cudaModuleMapTriplet->module23().cuda_z0_min(),
      m_cudaModuleMapTriplet->module23().cuda_z0_max(),
      m_cudaModuleMapTriplet->module23().cuda_deta_min(),
      m_cudaModuleMapTriplet->module23().cuda_deta_max(),
      m_cudaModuleMapTriplet->module23().cuda_phi_slope_min(),
      m_cudaModuleMapTriplet->module23().cuda_phi_slope_max(),
      m_cudaModuleMapTriplet->module23().cuda_dphi_min(),
      m_cudaModuleMapTriplet->module23().cuda_dphi_max(),
      m_cudaModuleMapTriplet->cuda_diff_dydx_min(),
      m_cudaModuleMapTriplet->cuda_diff_dydx_max(),
      m_cudaModuleMapTriplet->cuda_diff_dzdr_min(),
      m_cudaModuleMapTriplet->cuda_diff_dzdr_max(), TMath::Pi(),
      cuda_reduced_M1_hits->data(), cuda_reduced_M2_hits->data(),
      cuda_sorted_M2_hits.data(), cuda_edge_sum.data(), cuda_vertices.data(),
      cuda_mask.data(), m_cfg.epsilon);
  ACTS_CUDA_CHECK(cudaGetLastError());

  //----------------
  // edge reduction
  //----------------
  ScopedCudaPtr<int> cuda_mask_sum(nb_doublet_edges + 1, stream);
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  thrust::transform_exclusive_scan(thrust::device.on(stream), cuda_mask.data(),
                                   cuda_mask.data() + (nb_doublet_edges + 1),
                                   cuda_mask_sum.data(), CastBoolToInt{}, 0,
                                   thrust::plus<int>());

  int nb_graph_edges{};
  ACTS_CUDA_CHECK(cudaMemcpyAsync(&nb_graph_edges,
                                  &cuda_mask_sum.data()[nb_doublet_edges],
                                  sizeof(int), cudaMemcpyDeviceToHost, stream));
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  ACTS_DEBUG("nb_graph_edges: " << nb_graph_edges);
  if (nb_graph_edges == 0) {
    throw NoEdgesError{};
  }
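
  // Classic stream compaction: cuda_mask flags the doublet edges that
  // survived the triplet cuts, and its exclusive scan (cuda_mask_sum) gives
  // each surviving edge its output position, so compact_stream below can
  // scatter the kept entries into a dense array.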

  // Leave this as a bare pointer for now, since there is only very simple
  // control flow after here and we can keep the interface clean from the
  // ScopedCudaPtr type
  int *cuda_graph_edge_ptr{};
  ACTS_CUDA_CHECK(cudaMallocAsync(&cuda_graph_edge_ptr,
                                  2 * nb_graph_edges * sizeof(int), stream));
  int *cuda_graph_M1_hits = cuda_graph_edge_ptr;
  int *cuda_graph_M2_hits = cuda_graph_edge_ptr + nb_graph_edges;

  grid_dim = ((nb_doublet_edges + block_dim.x - 1) / block_dim.x);
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
  compact_stream<<<grid_dim, block_dim, 0, stream>>>(
      cuda_graph_M1_hits, cuda_reduced_M1_hits->data(), cuda_mask.data(),
      cuda_mask_sum.data(), nb_doublet_edges);
  ACTS_CUDA_CHECK(cudaGetLastError());
  compact_stream<<<grid_dim, block_dim, 0, stream>>>(
      cuda_graph_M2_hits, cuda_reduced_M2_hits->data(), cuda_mask.data(),
      cuda_mask_sum.data(), nb_doublet_edges);
  ACTS_CUDA_CHECK(cudaGetLastError());

  ACTS_VERBOSE("First 10 graph edges:\n"
               << debugPrintEdges(nb_graph_edges, cuda_graph_M1_hits,
                                  cuda_graph_M2_hits));

  detail::CUDA_edge_data<float> edge_data{};
  edge_data.nEdges = nb_graph_edges;
  edge_data.cudaEdgePtr = cuda_graph_edge_ptr;

  return edge_data;
}

}  // namespace ActsPlugins