Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:12:16

0001 // This file is part of the ACTS project.
0002 //
0003 // Copyright (C) 2016 CERN for the benefit of the ACTS project
0004 //
0005 // This Source Code Form is subject to the terms of the Mozilla Public
0006 // License, v. 2.0. If a copy of the MPL was not distributed with this
0007 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
0008 
0009 // CUDA plugin include(s).
0010 #include "Acts/Plugins/Cuda/Seeding2/Details/CountDublets.hpp"
0011 #include "Acts/Plugins/Cuda/Seeding2/Details/Types.hpp"
0012 
0013 #include "../Utilities/ErrorCheck.cuh"
0014 
0015 // CUDA include(s).
0016 #include <cuda_runtime.h>
0017 
0018 // System include(s).
0019 #include <algorithm>
0020 
0021 namespace Acts {
0022 namespace Cuda {
0023 namespace Kernels {
0024 
0025 /// Kernel performing the dublet countint
0026 ///
0027 /// This is pretty much just a copy of "reduce3" kernel from the samples shipped
0028 /// with CUDA. It finds the sums and maxima of a couple of parameters across all
0029 /// the dublets found by @c Acts::Cuda::Details::findDublets.
0030 ///
0031 /// @param[in] nMiddleSPs The number of middle spacepoints
0032 /// @param[in] middleBottomCounts 1-D array of the number of middle-bottom
0033 ///            dublets found for each middle spacepoint
0034 /// @param[in] middleTopCounts 1-D array of the number of middle-top dublets
0035 ///            found for each middle spacepoint
0036 /// @param[out] dubletCounts 1-D array of @c Acts::Cuda::Details::DubletCounts
0037 ///             objects for each execution block
0038 ///
0039 __global__ void countDublets(std::size_t nMiddleSPs,
0040                              const unsigned int* middleBottomCounts,
0041                              const unsigned int* middleTopCounts,
0042                              Details::DubletCounts* dubletCounts) {
0043   // Sum shared by all threads in a single execution block.
0044   extern __shared__ Details::DubletCounts sum[];
0045 
0046   // Get the thread identifier. Note that the kernel launch requests half as
0047   // many threads than how many elements we have in the arrays.
0048   const int middleIndex = blockIdx.x * blockDim.x * 2 + threadIdx.x;
0049 
0050   Details::DubletCounts thisSum;
0051   if (middleIndex < nMiddleSPs) {
0052     thisSum.nDublets =
0053         (middleBottomCounts[middleIndex] + middleTopCounts[middleIndex]);
0054     thisSum.nTriplets =
0055         (middleBottomCounts[middleIndex] * middleTopCounts[middleIndex]);
0056     thisSum.maxMBDublets = middleBottomCounts[middleIndex];
0057     thisSum.maxMTDublets = middleTopCounts[middleIndex];
0058     thisSum.maxTriplets = thisSum.nTriplets;
0059   }
0060   if (middleIndex + blockDim.x < nMiddleSPs) {
0061     thisSum.nDublets += (middleBottomCounts[middleIndex + blockDim.x] +
0062                          middleTopCounts[middleIndex + blockDim.x]);
0063     thisSum.nTriplets += (middleBottomCounts[middleIndex + blockDim.x] *
0064                           middleTopCounts[middleIndex + blockDim.x]);
0065     thisSum.maxMBDublets =
0066         max(middleBottomCounts[middleIndex + blockDim.x], thisSum.maxMBDublets);
0067     thisSum.maxMTDublets =
0068         max(middleTopCounts[middleIndex + blockDim.x], thisSum.maxMTDublets);
0069     thisSum.maxTriplets = max((middleBottomCounts[middleIndex + blockDim.x] *
0070                                middleTopCounts[middleIndex + blockDim.x]),
0071                               thisSum.maxTriplets);
0072   }
0073 
0074   // Load the first sum step into shared memory.
0075   sum[threadIdx.x] = thisSum;
0076   __syncthreads();
0077 
0078   // Do the summation in some iterations.
0079   for (unsigned int i = blockDim.x / 2; i > 0; i >>= 1) {
0080     if (threadIdx.x < i) {
0081       const Details::DubletCounts& otherSum = sum[threadIdx.x + i];
0082       thisSum.nDublets += otherSum.nDublets;
0083       thisSum.nTriplets += otherSum.nTriplets;
0084       thisSum.maxMBDublets = max(thisSum.maxMBDublets, otherSum.maxMBDublets);
0085       thisSum.maxMTDublets = max(thisSum.maxMTDublets, otherSum.maxMTDublets);
0086       thisSum.maxTriplets = max(thisSum.maxTriplets, otherSum.maxTriplets);
0087       sum[threadIdx.x] = thisSum;
0088     }
0089     __syncthreads();
0090   }
0091 
0092   // Write the result of this execution block into the global memory.
0093   if (threadIdx.x == 0) {
0094     dubletCounts[blockIdx.x] = thisSum;
0095   }
0096   return;
0097 }
0098 
0099 }  // namespace Kernels
0100 
0101 namespace Details {
0102 
0103 DubletCounts countDublets(
0104     std::size_t maxBlockSize, std::size_t nMiddleSP,
0105     const device_array<unsigned int>& middleBottomCountArray,
0106     const device_array<unsigned int>& middleTopCountArray) {
0107   // Calculate the parallelisation for the dublet counting.
0108   const int numBlocks = (nMiddleSP + maxBlockSize - 1) / maxBlockSize;
0109   const int sharedMem = maxBlockSize * sizeof(DubletCounts);
0110 
0111   // Create the small memory block in which we will get the count back for each
0112   // execution block.
0113   auto dubletCountsDevice = make_device_array<DubletCounts>(numBlocks);
0114 
0115   // Run the reduction kernel.
0116   Kernels::countDublets<<<numBlocks, maxBlockSize, sharedMem>>>(
0117       nMiddleSP, middleBottomCountArray.get(), middleTopCountArray.get(),
0118       dubletCountsDevice.get());
0119   ACTS_CUDA_ERROR_CHECK(cudaGetLastError());
0120   ACTS_CUDA_ERROR_CHECK(cudaDeviceSynchronize());
0121 
0122   // Copy the sum(s) back to the host.
0123   auto dubletCountsHost = make_host_array<DubletCounts>(numBlocks);
0124   copyToHost(dubletCountsHost, dubletCountsDevice, numBlocks);
0125 
0126   // Perform the final summation on the host. Assuming that the number of
0127   // middle space points is not so large that it would make sense to do the
0128   // summation iteratively on the device. (We should get one result object per
0129   // 1024 middle spacepoints on any modern GPU.)
0130   DubletCounts result;
0131   for (int i = 0; i < numBlocks; ++i) {
0132     result.nDublets += dubletCountsHost.get()[i].nDublets;
0133     result.nTriplets += dubletCountsHost.get()[i].nTriplets;
0134     result.maxMBDublets =
0135         std::max(dubletCountsHost.get()[i].maxMBDublets, result.maxMBDublets);
0136     result.maxMTDublets =
0137         std::max(dubletCountsHost.get()[i].maxMTDublets, result.maxMTDublets);
0138     result.maxTriplets =
0139         std::max(dubletCountsHost.get()[i].maxTriplets, result.maxTriplets);
0140   }
0141   return result;
0142 }
0143 
0144 }  // namespace Details
0145 }  // namespace Cuda
0146 }  // namespace Acts