Gnn/src/Stages.cu

0001 // This file is part of the ACTS project.
0002 //
0003 // Copyright (C) 2016 CERN for the benefit of the ACTS project
0004 //
0005 // This Source Code Form is subject to the terms of the Mozilla Public
0006 // License, v. 2.0. If a copy of the MPL was not distributed with this
0007 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
0008
0009 #include "ActsPlugins/Gnn/Stages.hpp"
0010 #include "ActsPlugins/Gnn/detail/CudaUtils.hpp"
0011
0012 #include <thrust/copy.h>
0013 #include <thrust/execution_policy.h>
0014 #include <thrust/sort.h>
0015 #include <thrust/unique.h>
0016
0017 namespace {
0018
0019 /// Scatter true into mask[usedNodes[i]] for each i in [0, nUsed)
0020 __global__ void buildNodeMaskKernel(const std::int64_t *usedNodes,
0021                                     std::size_t nUsed, bool *mask) {
0022   const std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
0023   if (i < nUsed) {
0024     mask[usedNodes[i]] = true;
0025   }
0026 }
0027
0028 /// Scatter new index newIdx into oldToNew[usedNodes[newIdx]] for each newIdx
0029 __global__ void buildReverseMapKernel(const std::int64_t *usedNodes,
0030                                       std::size_t nUsed,
0031                                       std::int64_t *oldToNew) {
0032   const std::size_t newIdx = blockIdx.x * blockDim.x + threadIdx.x;
0033   if (newIdx < nUsed) {
0034     oldToNew[usedNodes[newIdx]] = static_cast<std::int64_t>(newIdx);
0035   }
0036 }
0037
0038 /// Apply old→new remapping to all 2*nEdges edge endpoint values in-place
0039 __global__ void remapEdgesKernel(std::size_t nTotal,
0040                                  const std::int64_t *oldToNew,
0041                                  std::int64_t *edgeData) {
0042   const std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
0043   if (i < nTotal) {
0044     edgeData[i] = oldToNew[edgeData[i]];
0045   }
0046 }
0047
0048 }  // namespace
0049
0050 namespace ActsPlugins::detail {
0051
0052 PipelineTensors cudaRemoveUnusedNodes(PipelineTensors &&tensors,
0053                                       std::vector<int> &spacePointIds,
0054                                       const ExecutionContext &execCtx) {
0055   const auto stream = execCtx.stream.value();
0056   const auto nNodes = tensors.nodeFeatures.shape()[0];
0057   const auto nEdges = tensors.edgeIndex.shape()[1];
0058
0059   // Copy edgeIndex into a scratch buffer — thrust needs a mutable working
0060   // copy and the edgeIndex tensor is remapped in-place later.
0061   auto tmp = Tensor<std::int64_t>::Create({1, 2 * nEdges}, execCtx);
0062   ACTS_CUDA_CHECK(cudaMemcpyAsync(tmp.data(), tensors.edgeIndex.data(),
0063                                   tmp.nbytes(), cudaMemcpyDeviceToDevice,
0064                                   stream));
0065
0066   // Sort + unique → sorted unique used-node indices in tmp[0..nUsed)
0067   thrust::sort(thrust::device.on(stream), tmp.data(), tmp.data() + 2 * nEdges);
0068   auto *uniqEnd = thrust::unique(thrust::device.on(stream), tmp.data(),
0069                                  tmp.data() + 2 * nEdges);
0070   // nUsed must be read on host — sync the stream just for this scalar
0071   ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0072   const std::size_t nUsed = static_cast<std::size_t>(uniqEnd - tmp.data());
0073
0074   // Boolean node mask [nNodes, 1]: true at each surviving node index
0075   auto mask = Tensor<bool>::Create({nNodes, 1}, execCtx);
0076   ACTS_CUDA_CHECK(cudaMemsetAsync(mask.data(), 0, mask.nbytes(), stream));
0077   const dim3 blockDim = 1024;
0078   const dim3 gridUsed = (nUsed + blockDim.x - 1) / blockDim.x;
0079   buildNodeMaskKernel<<<gridUsed, blockDim, 0, stream>>>(tmp.data(), nUsed,
0080                                                          mask.data());
0081   ACTS_CUDA_CHECK(cudaGetLastError());
0082
0083   // Reverse map [nNodes, 1]: oldToNew[old] = new index after compaction
0084   auto oldToNew = Tensor<std::int64_t>::Create({nNodes, 1}, execCtx);
0085   buildReverseMapKernel<<<gridUsed, blockDim, 0, stream>>>(tmp.data(), nUsed,
0086                                                            oldToNew.data());
0087   ACTS_CUDA_CHECK(cudaGetLastError());
0088
0089   // Shrink nodeFeatures to surviving rows using the mask
0090   auto newNodeFeatures = selectRows(tensors.nodeFeatures, mask, execCtx);
0091
0092   // Remap edge endpoint indices in-place using the old→new map
0093   const dim3 gridEdges = (2 * nEdges + blockDim.x - 1) / blockDim.x;
0094   remapEdgesKernel<<<gridEdges, blockDim, 0, stream>>>(
0095       2 * nEdges, oldToNew.data(), tensors.edgeIndex.data());
0096   ACTS_CUDA_CHECK(cudaGetLastError());
0097
0098   // Copy surviving node indices to host to update spacePointIds
0099   std::vector<std::int64_t> hostUsedNodes(nUsed);
0100   ACTS_CUDA_CHECK(cudaMemcpyAsync(hostUsedNodes.data(), tmp.data(),
0101                                   nUsed * sizeof(std::int64_t),
0102                                   cudaMemcpyDeviceToHost, stream));
0103   ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));
0104
0105   std::vector<int> remapped;
0106   remapped.reserve(nUsed);
0107   for (const auto oldIdx : hostUsedNodes) {
0108     remapped.push_back(spacePointIds[static_cast<std::size_t>(oldIdx)]);
0109   }
0110   spacePointIds = std::move(remapped);
0111
0112   return {std::move(newNodeFeatures), std::move(tensors.edgeIndex),
0113           std::move(tensors.edgeFeatures), std::move(tensors.edgeScores)};
0114 }
0115
0116 }  // namespace ActsPlugins::detail