File indexing completed on 2025-07-02 07:51:55
0001
0002
0003
0004
0005
0006
0007
0008
0009 #include "Acts/Plugins/ExaTrkX/ModuleMapCuda.hpp"
0010 #include "Acts/Plugins/ExaTrkX/detail/CudaUtils.cuh"
0011 #include "Acts/Plugins/ExaTrkX/detail/CudaUtils.hpp"
0012 #include "Acts/Plugins/ExaTrkX/detail/ModuleMapUtils.cuh"
0013
0014 #include <CUDA_graph_creator>
0015 #include <CUDA_module_map_doublet>
0016 #include <CUDA_module_map_triplet>
0017 #include <TTree_hits>
0018 #include <chrono>
0019
0020 #include <cub/block/block_merge_sort.cuh>
0021 #include <thrust/execution_policy.h>
0022 #include <thrust/functional.h>
0023 #include <thrust/scan.h>
0024 #include <thrust/sort.h>
0025 #include <thrust/transform_scan.h>
0026
0027 using Clock = std::chrono::high_resolution_clock;
0028
0029 namespace {
0030
0031 template <typename T>
0032 class ScopedCudaPtr {
0033 cudaStream_t *m_stream = nullptr;
0034 T *m_ptr = nullptr;
0035
0036 public:
0037 ScopedCudaPtr(std::size_t n, cudaStream_t &stream) : m_stream(&stream) {
0038 ACTS_CUDA_CHECK(cudaMallocAsync(&m_ptr, n * sizeof(T), stream));
0039 }
0040
0041 ScopedCudaPtr(const ScopedCudaPtr &) = delete;
0042 ScopedCudaPtr(ScopedCudaPtr &&) = delete;
0043
0044 ~ScopedCudaPtr() { ACTS_CUDA_CHECK(cudaFreeAsync(m_ptr, *m_stream)); }
0045
0046 T *data() { return m_ptr; }
0047 const T *data() const { return m_ptr; }
0048 };
0049
0050 }
0051
0052 namespace Acts {
0053
0054 ModuleMapCuda::ModuleMapCuda(const Config &cfg,
0055 std::unique_ptr<const Acts::Logger> logger_)
0056 : m_logger(std::move(logger_)), m_cfg(cfg) {
0057 module_map_triplet<float> moduleMapCpu;
0058 moduleMapCpu.read_TTree(cfg.moduleMapPath.c_str());
0059 if (!moduleMapCpu) {
0060 throw std::runtime_error("Cannot retrieve ModuleMap from " +
0061 cfg.moduleMapPath);
0062 }
0063
0064 ACTS_DEBUG("ModuleMap GPU block dim: " << m_cfg.gpuBlocks);
0065
0066 m_cudaModuleMapDoublet =
0067 std::make_unique<CUDA_module_map_doublet<float>>(moduleMapCpu);
0068 m_cudaModuleMapDoublet->HostToDevice();
0069 m_cudaModuleMapTriplet =
0070 std::make_unique<CUDA_module_map_triplet<float>>(moduleMapCpu);
0071 m_cudaModuleMapTriplet->HostToDevice();
0072
0073 ACTS_DEBUG("# of modules = " << moduleMapCpu.module_map().size());
0074
0075
0076 std::map<std::uint64_t, int> test;
0077
0078 std::vector<std::uint64_t> keys;
0079 keys.reserve(m_cudaModuleMapDoublet->module_map().size());
0080 std::vector<int> vals;
0081 vals.reserve(m_cudaModuleMapDoublet->module_map().size());
0082
0083 for (auto [key, value] : m_cudaModuleMapDoublet->module_map()) {
0084 auto [it, success] = test.insert({key, value});
0085 if (!success) {
0086 throw std::runtime_error("Duplicate key in module map");
0087 }
0088 keys.push_back(key);
0089 vals.push_back(value);
0090 }
0091
0092
0093 m_cudaModuleMapSize = m_cudaModuleMapDoublet->module_map().size();
0094 ACTS_CUDA_CHECK(cudaMalloc(&m_cudaModuleMapKeys,
0095 m_cudaModuleMapSize * sizeof(std::uint64_t)));
0096 ACTS_CUDA_CHECK(
0097 cudaMalloc(&m_cudaModuleMapVals, m_cudaModuleMapSize * sizeof(int)));
0098
0099 ACTS_CUDA_CHECK(cudaMemcpy(m_cudaModuleMapKeys, keys.data(),
0100 m_cudaModuleMapSize * sizeof(std::uint64_t),
0101 cudaMemcpyHostToDevice));
0102 ACTS_CUDA_CHECK(cudaMemcpy(m_cudaModuleMapVals, vals.data(),
0103 m_cudaModuleMapSize * sizeof(int),
0104 cudaMemcpyHostToDevice));
0105 }
0106
0107 ModuleMapCuda::~ModuleMapCuda() {
0108 ACTS_CUDA_CHECK(cudaFree(m_cudaModuleMapKeys));
0109 ACTS_CUDA_CHECK(cudaFree(m_cudaModuleMapVals));
0110 }
0111
0112 namespace {}
0113
0114 PipelineTensors ModuleMapCuda::operator()(
0115 std::vector<float> &inputValues, std::size_t numNodes,
0116 const std::vector<std::uint64_t> &moduleIds,
0117 const ExecutionContext &execContext) {
0118 auto t0 = std::chrono::high_resolution_clock::now();
0119 assert(execContext.device.isCuda());
0120
0121 if (moduleIds.empty()) {
0122 throw NoEdgesError{};
0123 }
0124
0125 const auto nHits = moduleIds.size();
0126 assert(inputValues.size() % moduleIds.size() == 0);
0127 const auto nFeatures = inputValues.size() / moduleIds.size();
0128 auto &features = inputValues;
0129
0130 const dim3 blockDim = m_cfg.gpuBlocks;
0131 const dim3 gridDimHits = (nHits + blockDim.x - 1) / blockDim.x;
0132 ACTS_VERBOSE("gridDimHits: " << gridDimHits.x
0133 << ", blockDim: " << blockDim.x);
0134
0135
0136 cudaStream_t stream = cudaStreamLegacy;
0137 if (execContext.stream) {
0138 ACTS_DEBUG("Got stream " << *execContext.stream);
0139 stream = execContext.stream.value();
0140 }
0141
0142
0143
0144
0145
0146
0147
0148 auto nodeFeatures =
0149 Acts::Tensor<float>::Create({nHits, nFeatures}, execContext);
0150 float *cudaNodeFeaturePtr = nodeFeatures.data();
0151 ACTS_CUDA_CHECK(cudaMemcpyAsync(cudaNodeFeaturePtr, features.data(),
0152 features.size() * sizeof(float),
0153 cudaMemcpyHostToDevice, stream));
0154
0155
0156 ScopedCudaPtr<std::uint64_t> cudaModuleIds(nHits, stream);
0157 ACTS_CUDA_CHECK(cudaMemcpyAsync(cudaModuleIds.data(), moduleIds.data(),
0158 nHits * sizeof(std::uint64_t),
0159 cudaMemcpyHostToDevice, stream));
0160
0161
0162
0163 ScopedCudaPtr<float> cudaNodeFeaturesTransposed(6 * nHits, stream);
0164
0165 Acts::detail::CUDA_hit_data<float> inputData{};
0166 inputData.m_size = nHits;
0167 inputData.m_cuda_R = cudaNodeFeaturesTransposed.data() + 0 * nHits;
0168 inputData.m_cuda_phi = cudaNodeFeaturesTransposed.data() + 1 * nHits;
0169 inputData.m_cuda_z = cudaNodeFeaturesTransposed.data() + 2 * nHits;
0170 inputData.m_cuda_x = cudaNodeFeaturesTransposed.data() + 3 * nHits;
0171 inputData.m_cuda_y = cudaNodeFeaturesTransposed.data() + 4 * nHits;
0172 inputData.m_cuda_eta = cudaNodeFeaturesTransposed.data() + 5 * nHits;
0173
0174
0175 constexpr std::size_t rOffset = 0;
0176 constexpr std::size_t phiOffset = 1;
0177 constexpr std::size_t zOffset = 2;
0178 constexpr std::size_t etaOffset = 3;
0179
0180 const auto srcStride = sizeof(float) * nFeatures;
0181 const auto dstStride = sizeof(float);
0182 const auto width = sizeof(float);
0183 const auto height = nHits;
0184
0185 ACTS_CUDA_CHECK(cudaMemcpy2DAsync(
0186 inputData.cuda_R(), dstStride, cudaNodeFeaturePtr + rOffset, srcStride,
0187 width, height, cudaMemcpyDeviceToDevice, stream));
0188 ACTS_CUDA_CHECK(cudaMemcpy2DAsync(
0189 inputData.cuda_phi(), dstStride, cudaNodeFeaturePtr + phiOffset,
0190 srcStride, width, height, cudaMemcpyDeviceToDevice, stream));
0191 ACTS_CUDA_CHECK(cudaMemcpy2DAsync(
0192 inputData.cuda_z(), dstStride, cudaNodeFeaturePtr + zOffset, srcStride,
0193 width, height, cudaMemcpyDeviceToDevice, stream));
0194 ACTS_CUDA_CHECK(cudaMemcpy2DAsync(
0195 inputData.cuda_eta(), dstStride, cudaNodeFeaturePtr + etaOffset,
0196 srcStride, width, height, cudaMemcpyDeviceToDevice, stream));
0197
0198
0199 ScopedCudaPtr<int> cudaNbHits(m_cudaModuleMapSize + 1, stream);
0200 ACTS_CUDA_CHECK(cudaMemsetAsync(
0201 cudaNbHits.data(), 0, (m_cudaModuleMapSize + 1) * sizeof(int), stream));
0202
0203
0204 detail::rescaleFeature<<<gridDimHits, blockDim, 0, stream>>>(
0205 nHits, inputData.cuda_z(), m_cfg.zScale);
0206 ACTS_CUDA_CHECK(cudaGetLastError());
0207 detail::rescaleFeature<<<gridDimHits, blockDim, 0, stream>>>(
0208 nHits, inputData.cuda_R(), m_cfg.rScale);
0209 ACTS_CUDA_CHECK(cudaGetLastError());
0210 detail::rescaleFeature<<<gridDimHits, blockDim, 0, stream>>>(
0211 nHits, inputData.cuda_phi(), m_cfg.phiScale);
0212 ACTS_CUDA_CHECK(cudaGetLastError());
0213
0214 detail::computeXandY<<<gridDimHits, blockDim, 0, stream>>>(
0215 nHits, inputData.cuda_x(), inputData.cuda_y(), inputData.cuda_R(),
0216 inputData.cuda_phi());
0217 ACTS_CUDA_CHECK(cudaGetLastError());
0218
0219 ScopedCudaPtr<std::uint64_t> cudaHitId(nHits, stream);
0220 detail::iota<<<gridDimHits, blockDim, 0, stream>>>(nHits, cudaHitId.data());
0221 ACTS_CUDA_CHECK(cudaGetLastError());
0222
0223 detail::mapModuleIdsToNbHits<<<gridDimHits, blockDim, 0, stream>>>(
0224 cudaNbHits.data(), nHits, cudaModuleIds.data(), m_cudaModuleMapSize,
0225 m_cudaModuleMapKeys, m_cudaModuleMapVals);
0226 ACTS_CUDA_CHECK(cudaGetLastError());
0227
0228 thrust::exclusive_scan(thrust::device.on(stream), cudaNbHits.data(),
0229 cudaNbHits.data() + m_cudaModuleMapSize + 1,
0230 cudaNbHits.data());
0231 ACTS_CUDA_CHECK(cudaGetLastError());
0232 int *cudaHitIndice = cudaNbHits.data();
0233
0234
0235
0236
0237
0238 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0239 auto t1 = std::chrono::high_resolution_clock::now();
0240
0241
0242 const auto edgeData = makeEdges(inputData, cudaHitIndice, stream);
0243 ACTS_CUDA_CHECK(cudaGetLastError());
0244 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0245
0246 auto t2 = std::chrono::high_resolution_clock::now();
0247
0248 if (edgeData.nEdges == 0) {
0249 throw NoEdgesError{};
0250 }
0251
0252 dim3 gridDimEdges = (edgeData.nEdges + blockDim.x - 1) / blockDim.x;
0253 ACTS_DEBUG("gridDimEdges: " << gridDimEdges.x
0254 << ", blockDim: " << blockDim.x);
0255
0256
0257 auto edgeFeatures =
0258 Acts::Tensor<float>::Create({edgeData.nEdges, 6}, execContext);
0259 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0260
0261 detail::makeEdgeFeatures<<<gridDimEdges, blockDim, 0, stream>>>(
0262 edgeData.nEdges, edgeData.cudaEdgePtr,
0263 edgeData.cudaEdgePtr + edgeData.nEdges, nFeatures, cudaNodeFeaturePtr,
0264 edgeFeatures.data());
0265 ACTS_CUDA_CHECK(cudaGetLastError());
0266 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0267
0268 auto edgeIndex =
0269 Tensor<std::int64_t>::Create({2, edgeData.nEdges}, execContext);
0270 thrust::transform(thrust::cuda::par.on(stream), edgeData.cudaEdgePtr,
0271 edgeData.cudaEdgePtr + 2 * edgeData.nEdges,
0272 edgeIndex.data(),
0273 [] __device__(int i) -> std::int64_t { return i; });
0274 ACTS_CUDA_CHECK(cudaFreeAsync(edgeData.cudaEdgePtr, stream));
0275
0276 ACTS_CUDA_CHECK(cudaGetLastError());
0277 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0278 auto t3 = std::chrono::high_resolution_clock::now();
0279
0280 auto ms = [](auto a, auto b) {
0281 return std::chrono::duration<double, std::milli>(b - a).count();
0282 };
0283 ACTS_DEBUG("Preparation: " << ms(t0, t1));
0284 ACTS_DEBUG("Inference: " << ms(t1, t2));
0285 ACTS_DEBUG("Postprocessing: " << ms(t2, t3));
0286
0287 return {std::move(nodeFeatures),
0288 std::move(edgeIndex),
0289 std::move(edgeFeatures),
0290 {}};
0291 }
0292
0293 struct ArgsortFun {
0294 int *cudaPtr = nullptr;
0295 bool __device__ operator()(int l, int r) { return cudaPtr[l] < cudaPtr[r]; }
0296 };
0297
0298 struct CastBoolToInt {
0299 int __device__ operator()(bool b) { return static_cast<int>(b); }
0300 };
0301
0302 std::string debugPrintEdges(std::size_t nbEdges, const int *cudaSrc,
0303 const int *cudaDst) {
0304 std::stringstream ss;
0305 if (nbEdges == 0) {
0306 return "zero edges remained";
0307 }
0308 nbEdges = std::min(10ul, nbEdges);
0309 std::vector<int> src(nbEdges), dst(nbEdges);
0310 ACTS_CUDA_CHECK(cudaDeviceSynchronize());
0311 ACTS_CUDA_CHECK(cudaMemcpy(src.data(), cudaSrc, nbEdges * sizeof(int),
0312 cudaMemcpyDeviceToHost));
0313 ACTS_CUDA_CHECK(cudaMemcpy(dst.data(), cudaDst, nbEdges * sizeof(int),
0314 cudaMemcpyDeviceToHost));
0315 for (std::size_t i = 0; i < nbEdges; ++i) {
0316 ss << src.at(i) << " ";
0317 }
0318 ss << "\n";
0319 for (std::size_t i = 0; i < nbEdges; ++i) {
0320 ss << dst.at(i) << " ";
0321 }
0322 return ss.str();
0323 }
0324
0325 detail::CUDA_edge_data<float> ModuleMapCuda::makeEdges(
0326 detail::CUDA_hit_data<float> cuda_TThits, int *cuda_hit_indice,
0327 cudaStream_t &stream) const {
0328 const dim3 block_dim = m_cfg.gpuBlocks;
0329
0330
0331
0332 const int nb_doublets = m_cudaModuleMapDoublet->size();
0333 ACTS_DEBUG("nb doublets " << nb_doublets);
0334 dim3 grid_dim = ((nb_doublets + block_dim.x - 1) / block_dim.x);
0335
0336
0337
0338 std::optional<ScopedCudaPtr<int>> cuda_reduced_M1_hits, cuda_reduced_M2_hits;
0339
0340 ScopedCudaPtr<int> cuda_edge_sum(nb_doublets + 1, stream);
0341
0342 int nb_doublet_edges{};
0343
0344
0345
0346 if (m_cfg.moreParallel) {
0347
0348
0349
0350
0351
0352
0353
0354
0355
0356
0357
0358
0359
0360
0361
0362
0363
0364
0365
0366
0367
0368
0369
0370
0371
0372 ScopedCudaPtr<int> cuda_nb_src_hits_per_doublet(nb_doublets + 1, stream);
0373
0374 detail::count_src_hits_per_doublet<<<grid_dim, block_dim, 0, stream>>>(
0375 nb_doublets, m_cudaModuleMapDoublet->cuda_module1(), cuda_hit_indice,
0376 cuda_nb_src_hits_per_doublet.data());
0377 ACTS_CUDA_CHECK(cudaGetLastError());
0378
0379 thrust::exclusive_scan(
0380 thrust::device.on(stream), cuda_nb_src_hits_per_doublet.data(),
0381 cuda_nb_src_hits_per_doublet.data() + nb_doublets + 1,
0382 cuda_nb_src_hits_per_doublet.data());
0383
0384 int sum_nb_src_hits_per_doublet{};
0385 ACTS_CUDA_CHECK(
0386 cudaMemcpyAsync(&sum_nb_src_hits_per_doublet,
0387 &cuda_nb_src_hits_per_doublet.data()[nb_doublets],
0388 sizeof(int), cudaMemcpyDeviceToHost, stream));
0389 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0390 ACTS_DEBUG("sum_nb_hits_per_doublet: " << sum_nb_src_hits_per_doublet);
0391
0392 if (sum_nb_src_hits_per_doublet == 0) {
0393 throw NoEdgesError{};
0394 }
0395
0396 ScopedCudaPtr<int> cuda_edge_sum_per_src_hit(
0397 sum_nb_src_hits_per_doublet + 1, stream);
0398
0399 dim3 grid_dim_shpd =
0400 ((sum_nb_src_hits_per_doublet + block_dim.x - 1) / block_dim.x);
0401 detail::count_doublet_edges_new<float>
0402 <<<grid_dim_shpd, block_dim, 0, stream>>>(
0403 sum_nb_src_hits_per_doublet, nb_doublets,
0404 cuda_nb_src_hits_per_doublet.data(),
0405 m_cudaModuleMapDoublet->cuda_module1(),
0406 m_cudaModuleMapDoublet->cuda_module2(), cuda_TThits.cuda_R(),
0407 cuda_TThits.cuda_z(), cuda_TThits.cuda_eta(),
0408 cuda_TThits.cuda_phi(), m_cudaModuleMapDoublet->cuda_z0_min(),
0409 m_cudaModuleMapDoublet->cuda_z0_max(),
0410 m_cudaModuleMapDoublet->cuda_deta_min(),
0411 m_cudaModuleMapDoublet->cuda_deta_max(),
0412 m_cudaModuleMapDoublet->cuda_phi_slope_min(),
0413 m_cudaModuleMapDoublet->cuda_phi_slope_max(),
0414 m_cudaModuleMapDoublet->cuda_dphi_min(),
0415 m_cudaModuleMapDoublet->cuda_dphi_max(), cuda_hit_indice,
0416 cuda_edge_sum_per_src_hit.data(), m_cfg.epsilon);
0417 ACTS_CUDA_CHECK(cudaGetLastError());
0418
0419 thrust::exclusive_scan(
0420 thrust::device.on(stream), cuda_edge_sum_per_src_hit.data(),
0421 cuda_edge_sum_per_src_hit.data() + sum_nb_src_hits_per_doublet + 1,
0422 cuda_edge_sum_per_src_hit.data());
0423
0424 ACTS_CUDA_CHECK(cudaMemcpyAsync(
0425 &nb_doublet_edges,
0426 &cuda_edge_sum_per_src_hit.data()[sum_nb_src_hits_per_doublet],
0427 sizeof(int), cudaMemcpyDeviceToHost, stream));
0428 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0429 ACTS_DEBUG("nb_doublet_edges: " << nb_doublet_edges);
0430 ACTS_DEBUG("Allocate " << (2ul * nb_doublet_edges * sizeof(int)) * 1.0e-6
0431 << " MB for edges");
0432 cuda_reduced_M1_hits.emplace(nb_doublet_edges, stream);
0433 cuda_reduced_M2_hits.emplace(nb_doublet_edges, stream);
0434
0435 detail::build_doublet_edges_new<float>
0436 <<<grid_dim_shpd, block_dim, 0, stream>>>(
0437 sum_nb_src_hits_per_doublet, nb_doublets,
0438 cuda_nb_src_hits_per_doublet.data(),
0439 m_cudaModuleMapDoublet->cuda_module1(),
0440 m_cudaModuleMapDoublet->cuda_module2(), cuda_TThits.cuda_R(),
0441 cuda_TThits.cuda_z(), cuda_TThits.cuda_eta(),
0442 cuda_TThits.cuda_phi(), m_cudaModuleMapDoublet->cuda_z0_min(),
0443 m_cudaModuleMapDoublet->cuda_z0_max(),
0444 m_cudaModuleMapDoublet->cuda_deta_min(),
0445 m_cudaModuleMapDoublet->cuda_deta_max(),
0446 m_cudaModuleMapDoublet->cuda_phi_slope_min(),
0447 m_cudaModuleMapDoublet->cuda_phi_slope_max(),
0448 m_cudaModuleMapDoublet->cuda_dphi_min(),
0449 m_cudaModuleMapDoublet->cuda_dphi_max(), cuda_hit_indice,
0450 cuda_reduced_M1_hits->data(), cuda_reduced_M2_hits->data(),
0451 cuda_edge_sum_per_src_hit.data(), m_cfg.epsilon);
0452 ACTS_CUDA_CHECK(cudaGetLastError());
0453
0454 detail::computeDoubletEdgeSum<<<grid_dim, block_dim, 0, stream>>>(
0455 nb_doublets, cuda_nb_src_hits_per_doublet.data(),
0456 cuda_edge_sum_per_src_hit.data(), cuda_edge_sum.data());
0457 ACTS_CUDA_CHECK(cudaGetLastError());
0458 } else {
0459
0460 ScopedCudaPtr<int> cuda_nb_doublet_edges(1, stream);
0461 ACTS_CUDA_CHECK(
0462 cudaMemsetAsync(cuda_nb_doublet_edges.data(), 0, sizeof(int), stream));
0463
0464 detail::count_doublet_edges<float><<<grid_dim, block_dim, 0, stream>>>(
0465 nb_doublets, m_cudaModuleMapDoublet->cuda_module1(),
0466 m_cudaModuleMapDoublet->cuda_module2(), cuda_TThits.cuda_R(),
0467 cuda_TThits.cuda_z(), cuda_TThits.cuda_eta(), cuda_TThits.cuda_phi(),
0468 m_cudaModuleMapDoublet->cuda_z0_min(),
0469 m_cudaModuleMapDoublet->cuda_z0_max(),
0470 m_cudaModuleMapDoublet->cuda_deta_min(),
0471 m_cudaModuleMapDoublet->cuda_deta_max(),
0472 m_cudaModuleMapDoublet->cuda_phi_slope_min(),
0473 m_cudaModuleMapDoublet->cuda_phi_slope_max(),
0474 m_cudaModuleMapDoublet->cuda_dphi_min(),
0475 m_cudaModuleMapDoublet->cuda_dphi_max(), cuda_hit_indice, TMath::Pi(),
0476 cuda_nb_doublet_edges.data(), cuda_edge_sum.data(), m_cfg.epsilon);
0477 ACTS_CUDA_CHECK(cudaGetLastError());
0478
0479
0480 ACTS_CUDA_CHECK(cudaMemcpyAsync(&nb_doublet_edges,
0481 cuda_nb_doublet_edges.data(), sizeof(int),
0482 cudaMemcpyDeviceToHost, stream));
0483 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0484 ACTS_DEBUG("nb_doublet_edges: " << nb_doublet_edges);
0485
0486 ACTS_DEBUG("Allocate " << (2ul * nb_doublet_edges * sizeof(int)) * 1.0e-6
0487 << " MB for edges");
0488 cuda_reduced_M1_hits.emplace(nb_doublet_edges, stream);
0489 cuda_reduced_M2_hits.emplace(nb_doublet_edges, stream);
0490
0491
0492 thrust::exclusive_scan(thrust::device.on(stream), cuda_edge_sum.data(),
0493 cuda_edge_sum.data() + (nb_doublets + 1),
0494 cuda_edge_sum.data());
0495
0496 detail::doublet_cuts_new<float><<<grid_dim, block_dim, 0, stream>>>(
0497 nb_doublets, m_cudaModuleMapDoublet->cuda_module1(),
0498 m_cudaModuleMapDoublet->cuda_module2(), cuda_TThits.cuda_R(),
0499 cuda_TThits.cuda_z(), cuda_TThits.cuda_eta(), cuda_TThits.cuda_phi(),
0500 m_cudaModuleMapDoublet->cuda_z0_min(),
0501 m_cudaModuleMapDoublet->cuda_z0_max(),
0502 m_cudaModuleMapDoublet->cuda_deta_min(),
0503 m_cudaModuleMapDoublet->cuda_deta_max(),
0504 m_cudaModuleMapDoublet->cuda_phi_slope_min(),
0505 m_cudaModuleMapDoublet->cuda_phi_slope_max(),
0506 m_cudaModuleMapDoublet->cuda_dphi_min(),
0507 m_cudaModuleMapDoublet->cuda_dphi_max(), cuda_hit_indice, TMath::Pi(),
0508 cuda_reduced_M1_hits->data(), cuda_reduced_M2_hits->data(),
0509 cuda_edge_sum.data(), m_cfg.epsilon);
0510 ACTS_CUDA_CHECK(cudaGetLastError());
0511 }
0512
0513 ACTS_VERBOSE("First 10 doublet edges:\n"
0514 << debugPrintEdges(nb_doublet_edges,
0515 cuda_reduced_M1_hits->data(),
0516 cuda_reduced_M2_hits->data()));
0517 if (nb_doublet_edges == 0) {
0518 throw NoEdgesError{};
0519 }
0520
0521
0522
0523
0524 ScopedCudaPtr<int> cuda_sorted_M2_hits(nb_doublet_edges, stream);
0525
0526 grid_dim = ((nb_doublet_edges + block_dim.x - 1) / block_dim.x);
0527 init_vector<<<grid_dim, block_dim, 0, stream>>>(cuda_sorted_M2_hits.data(),
0528 nb_doublet_edges);
0529 ACTS_CUDA_CHECK(cudaGetLastError());
0530
0531 if (m_cfg.moreParallel) {
0532 dim3 grid_dim_sort = nb_doublets;
0533 dim3 block_dim_even_odd = 64;
0534 ACTS_DEBUG("Using block_odd_even_sort, grid_dim.x = "
0535 << nb_doublets << ", block_dim.x = " << block_dim_even_odd.x);
0536 detail::block_odd_even_sort<<<nb_doublets, block_dim_even_odd, 0, stream>>>(
0537 cuda_reduced_M2_hits->data(), cuda_edge_sum.data(),
0538 cuda_sorted_M2_hits.data());
0539 } else {
0540 grid_dim = ((nb_doublets + block_dim.x - 1) / block_dim.x);
0541 partial_quick_sort<<<grid_dim, block_dim, 0, stream>>>(
0542 cuda_sorted_M2_hits.data(), cuda_reduced_M2_hits->data(),
0543 cuda_edge_sum.data(), nb_doublets);
0544 ACTS_CUDA_CHECK(cudaGetLastError());
0545 }
0546
0547
0548
0549
0550
0551 ScopedCudaPtr<float> cuda_z0(nb_doublet_edges, stream),
0552 cuda_phi_slope(nb_doublet_edges, stream),
0553 cuda_deta(nb_doublet_edges, stream), cuda_dphi(nb_doublet_edges, stream);
0554 grid_dim = ((nb_doublet_edges + block_dim.x - 1) / block_dim.x);
0555 hits_geometric_cuts<<<grid_dim, block_dim, 0, stream>>>(
0556 cuda_z0.data(), cuda_phi_slope.data(), cuda_deta.data(), cuda_dphi.data(),
0557 cuda_reduced_M1_hits->data(), cuda_reduced_M2_hits->data(),
0558 cuda_TThits.cuda_R(), cuda_TThits.cuda_z(), cuda_TThits.cuda_eta(),
0559 cuda_TThits.cuda_phi(), TMath::Pi(), nb_doublet_edges, m_cfg.epsilon);
0560 ACTS_CUDA_CHECK(cudaGetLastError());
0561 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0562
0563 ScopedCudaPtr<bool> cuda_mask(nb_doublet_edges + 1, stream);
0564 ACTS_CUDA_CHECK(
0565 cudaMemset(cuda_mask.data(), 0, (nb_doublet_edges + 1) * sizeof(bool)));
0566 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0567
0568
0569
0570
0571 int nb_triplets = m_cudaModuleMapTriplet->size();
0572 grid_dim = ((nb_triplets + block_dim.x - 1) / block_dim.x);
0573
0574 ScopedCudaPtr<bool> cuda_vertices(cuda_TThits.size(), stream);
0575 ACTS_CUDA_CHECK(cudaMemsetAsync(cuda_vertices.data(), 0,
0576 cuda_TThits.size() * sizeof(bool), stream));
0577 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0578
0579 if (m_cfg.moreParallel) {
0580
0581 ScopedCudaPtr<int> cuda_src_hits_per_triplet(nb_triplets + 1, stream);
0582
0583 detail::count_triplet_hits<<<grid_dim, block_dim, 0, stream>>>(
0584 nb_triplets, m_cudaModuleMapTriplet->cuda_module12_map(),
0585 m_cudaModuleMapTriplet->cuda_module23_map(), cuda_edge_sum.data(),
0586 cuda_src_hits_per_triplet.data());
0587 ACTS_CUDA_CHECK(cudaGetLastError());
0588
0589
0590 thrust::exclusive_scan(thrust::device.on(stream),
0591 cuda_src_hits_per_triplet.data(),
0592 cuda_src_hits_per_triplet.data() + nb_triplets + 1,
0593 cuda_src_hits_per_triplet.data());
0594
0595 int nb_src_hits_per_triplet_sum{};
0596 ACTS_CUDA_CHECK(
0597 cudaMemcpyAsync(&nb_src_hits_per_triplet_sum,
0598 &cuda_src_hits_per_triplet.data()[nb_triplets],
0599 sizeof(int), cudaMemcpyDeviceToHost, stream));
0600 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0601 ACTS_DEBUG("nb_src_hits_per_triplet_sum: " << nb_src_hits_per_triplet_sum);
0602 if (nb_src_hits_per_triplet_sum == 0) {
0603 throw NoEdgesError{};
0604 }
0605
0606 dim3 grid_dim_shpt =
0607 ((nb_src_hits_per_triplet_sum + block_dim.x - 1) / block_dim.x);
0608
0609 detail::triplet_cuts_new2<float><<<grid_dim_shpt, block_dim, 0, stream>>>(
0610 nb_src_hits_per_triplet_sum, nb_triplets,
0611 cuda_src_hits_per_triplet.data(),
0612 m_cudaModuleMapTriplet->cuda_module12_map(),
0613 m_cudaModuleMapTriplet->cuda_module23_map(), cuda_TThits.cuda_x(),
0614 cuda_TThits.cuda_y(), cuda_TThits.cuda_z(), cuda_TThits.cuda_R(),
0615 cuda_z0.data(), cuda_phi_slope.data(), cuda_deta.data(),
0616 cuda_dphi.data(), m_cudaModuleMapTriplet->module12().cuda_z0_min(),
0617 m_cudaModuleMapTriplet->module12().cuda_z0_max(),
0618 m_cudaModuleMapTriplet->module12().cuda_deta_min(),
0619 m_cudaModuleMapTriplet->module12().cuda_deta_max(),
0620 m_cudaModuleMapTriplet->module12().cuda_phi_slope_min(),
0621 m_cudaModuleMapTriplet->module12().cuda_phi_slope_max(),
0622 m_cudaModuleMapTriplet->module12().cuda_dphi_min(),
0623 m_cudaModuleMapTriplet->module12().cuda_dphi_max(),
0624 m_cudaModuleMapTriplet->module23().cuda_z0_min(),
0625 m_cudaModuleMapTriplet->module23().cuda_z0_max(),
0626 m_cudaModuleMapTriplet->module23().cuda_deta_min(),
0627 m_cudaModuleMapTriplet->module23().cuda_deta_max(),
0628 m_cudaModuleMapTriplet->module23().cuda_phi_slope_min(),
0629 m_cudaModuleMapTriplet->module23().cuda_phi_slope_max(),
0630 m_cudaModuleMapTriplet->module23().cuda_dphi_min(),
0631 m_cudaModuleMapTriplet->module23().cuda_dphi_max(),
0632 m_cudaModuleMapTriplet->cuda_diff_dydx_min(),
0633 m_cudaModuleMapTriplet->cuda_diff_dydx_max(),
0634 m_cudaModuleMapTriplet->cuda_diff_dzdr_min(),
0635 m_cudaModuleMapTriplet->cuda_diff_dzdr_max(), TMath::Pi(),
0636 cuda_reduced_M1_hits->data(), cuda_reduced_M2_hits->data(),
0637 cuda_sorted_M2_hits.data(), cuda_edge_sum.data(), cuda_vertices.data(),
0638 cuda_mask.data(), m_cfg.epsilon);
0639 ACTS_CUDA_CHECK(cudaGetLastError());
0640 } else {
0641 Acts::detail::triplet_cuts_new<float><<<grid_dim, block_dim, 0, stream>>>(
0642 nb_triplets, m_cudaModuleMapTriplet->cuda_module12_map(),
0643 m_cudaModuleMapTriplet->cuda_module23_map(), cuda_TThits.cuda_x(),
0644 cuda_TThits.cuda_y(), cuda_TThits.cuda_z(), cuda_TThits.cuda_R(),
0645 cuda_z0.data(), cuda_phi_slope.data(), cuda_deta.data(),
0646 cuda_dphi.data(), m_cudaModuleMapTriplet->module12().cuda_z0_min(),
0647 m_cudaModuleMapTriplet->module12().cuda_z0_max(),
0648 m_cudaModuleMapTriplet->module12().cuda_deta_min(),
0649 m_cudaModuleMapTriplet->module12().cuda_deta_max(),
0650 m_cudaModuleMapTriplet->module12().cuda_phi_slope_min(),
0651 m_cudaModuleMapTriplet->module12().cuda_phi_slope_max(),
0652 m_cudaModuleMapTriplet->module12().cuda_dphi_min(),
0653 m_cudaModuleMapTriplet->module12().cuda_dphi_max(),
0654 m_cudaModuleMapTriplet->module23().cuda_z0_min(),
0655 m_cudaModuleMapTriplet->module23().cuda_z0_max(),
0656 m_cudaModuleMapTriplet->module23().cuda_deta_min(),
0657 m_cudaModuleMapTriplet->module23().cuda_deta_max(),
0658 m_cudaModuleMapTriplet->module23().cuda_phi_slope_min(),
0659 m_cudaModuleMapTriplet->module23().cuda_phi_slope_max(),
0660 m_cudaModuleMapTriplet->module23().cuda_dphi_min(),
0661 m_cudaModuleMapTriplet->module23().cuda_dphi_max(),
0662 m_cudaModuleMapTriplet->cuda_diff_dydx_min(),
0663 m_cudaModuleMapTriplet->cuda_diff_dydx_max(),
0664 m_cudaModuleMapTriplet->cuda_diff_dzdr_min(),
0665 m_cudaModuleMapTriplet->cuda_diff_dzdr_max(), TMath::Pi(),
0666 cuda_reduced_M1_hits->data(), cuda_reduced_M2_hits->data(),
0667 cuda_sorted_M2_hits.data(), cuda_edge_sum.data(), cuda_vertices.data(),
0668 cuda_mask.data(), m_cfg.epsilon);
0669 ACTS_CUDA_CHECK(cudaGetLastError());
0670 }
0671
0672
0673
0674
0675 ScopedCudaPtr<int> cuda_mask_sum(nb_doublet_edges + 1, stream);
0676 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0677 thrust::transform_exclusive_scan(thrust::device.on(stream), cuda_mask.data(),
0678 cuda_mask.data() + (nb_doublet_edges + 1),
0679 cuda_mask_sum.data(), CastBoolToInt{}, 0,
0680 thrust::plus<int>());
0681
0682 int nb_graph_edges{};
0683 ACTS_CUDA_CHECK(cudaMemcpyAsync(&nb_graph_edges,
0684 &cuda_mask_sum.data()[nb_doublet_edges],
0685 sizeof(int), cudaMemcpyDeviceToHost, stream));
0686 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0687
0688 ACTS_DEBUG("nb_graph_edges: " << nb_graph_edges);
0689 if (nb_graph_edges == 0) {
0690 throw NoEdgesError{};
0691 }
0692
0693
0694
0695
0696 int *cuda_graph_edge_ptr{};
0697 ACTS_CUDA_CHECK(cudaMallocAsync(&cuda_graph_edge_ptr,
0698 2 * nb_graph_edges * sizeof(int), stream));
0699 int *cuda_graph_M1_hits = cuda_graph_edge_ptr;
0700 int *cuda_graph_M2_hits = cuda_graph_edge_ptr + nb_graph_edges;
0701
0702 grid_dim = ((nb_doublet_edges + block_dim.x - 1) / block_dim.x);
0703 ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0704 compact_stream<<<grid_dim, block_dim, 0, stream>>>(
0705 cuda_graph_M1_hits, cuda_reduced_M1_hits->data(), cuda_mask.data(),
0706 cuda_mask_sum.data(), nb_doublet_edges);
0707 ACTS_CUDA_CHECK(cudaGetLastError());
0708 compact_stream<<<grid_dim, block_dim, 0, stream>>>(
0709 cuda_graph_M2_hits, cuda_reduced_M2_hits->data(), cuda_mask.data(),
0710 cuda_mask_sum.data(), nb_doublet_edges);
0711 ACTS_CUDA_CHECK(cudaGetLastError());
0712
0713 ACTS_VERBOSE("First 10 doublet edges:\n"
0714 << debugPrintEdges(nb_graph_edges, cuda_graph_M1_hits,
0715 cuda_graph_M2_hits));
0716
0717 detail::CUDA_edge_data<float> edge_data{};
0718 edge_data.nEdges = nb_graph_edges;
0719 edge_data.cudaEdgePtr = cuda_graph_edge_ptr;
0720
0721 return edge_data;
0722 }
0723
0724 }