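// Host/device interface of the VecGeom CUDA backend: thin wrappers around the
// CUDA runtime (allocation, copies, error checking) and the DevicePtr<T>
// helper classes used to construct and manage VecGeom objects in device memory.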
#ifndef VECGEOM_BACKEND_CUDA_INTERFACE_H_
#define VECGEOM_BACKEND_CUDA_INTERFACE_H_

#include "VecGeom/base/Config.h"
#include "VecGeom/base/Global.h"

#ifdef VECGEOM_ENABLE_CUDA

#include "driver_types.h" // Required for cudaError_t type
#include "cuda_runtime.h"

#include <algorithm> // std::transform
#include <iterator>  // std::back_inserter
#include <type_traits>
#include <unordered_map>
#include <vector>

namespace vecgeom {

#ifdef VECCORE_CUDA

inline namespace cuda {

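// Constructs one DataClass object in pre-allocated device memory (gpu_ptr) via
// placement new, forwarding the given constructor arguments.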
template <typename DataClass, typename... ArgsTypes>
__global__ void ConstructOnGpu(DataClass *gpu_ptr, ArgsTypes... params)
{
  new (gpu_ptr) DataClass(params...);
}

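// Constructs nElements DataClass objects in a pre-allocated device array,
// one placement new per element, using a grid-stride loop.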
template <typename DataClass, typename... ArgsTypes>
__global__ void ConstructArrayOnGpu(DataClass *gpu_ptr, size_t nElements, ArgsTypes... params)
{
  unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

  unsigned int idx = tid;
  while (idx < nElements) {
    new (gpu_ptr + idx) DataClass(params...);
    idx += blockDim.x * gridDim.x;
  }
}

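// Constructs nElements objects, each at its own device address gpu_ptrs[idx];
// element idx of every parameter array is passed to the respective constructor.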
template <typename DataClass, typename... ArgsTypes>
__global__ void ConstructManyOnGpu_kernel(size_t nElements, DataClass **gpu_ptrs, const ArgsTypes *... params)
{
  const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;

  for (size_t idx = tid; idx < nElements; idx += blockDim.x * gridDim.x) {
    new (gpu_ptrs[idx]) DataClass(params[idx]...);
  }
}

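// Sets the bounding box of each volume from a flat array holding six Precision
// values per element, passed to SetBBox as two three-component corner points.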
template <typename DataClass>
__global__ void CopyBBoxesToGpu(size_t nElements, DataClass **raw_ptrs, Precision *boxes)
{
  const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;

  for (size_t idx = tid; idx < nElements; idx += blockDim.x * gridDim.x) {
    raw_ptrs[idx]->SetBBox({boxes[6 * idx], boxes[6 * idx + 1], boxes[6 * idx + 2]},
                           {boxes[6 * idx + 3], boxes[6 * idx + 4], boxes[6 * idx + 5]});
  }
}

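// Constructs a single object in device memory from the host by launching
// ConstructOnGpu with one thread.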
template <typename DataClass, typename... ArgsTypes>
void Generic_CopyToGpu(DataClass *const gpu_ptr, ArgsTypes... params)
{
  ConstructOnGpu<<<1, 1>>>(gpu_ptr, params...);
}

} // namespace cuda

#else

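// Host-side declarations of the device-construction helpers above, used when
// this header is compiled without nvcc (VECCORE_CUDA not defined).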
namespace cuda {

template <typename Type>
Type *AllocateOnDevice();
template <typename DataClass, typename... ArgsTypes>
void Generic_CopyToGpu(DataClass *const gpu_ptr, ArgsTypes... params);

} // namespace cuda

#endif

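// Host-side interface. When compiling device code (VECCORE_CUDA defined) these
// symbols live in the named namespace vecgeom::cxx; in a plain C++ translation
// unit, cxx is the inline (default) namespace.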
#ifdef VECCORE_CUDA
namespace cxx {
#else
inline namespace cxx {
#endif

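// Thin wrappers around the CUDA runtime API (error checking, allocation,
// copies, device limits); the definitions are provided elsewhere in the
// CUDA backend.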
cudaError_t CudaCheckError(const cudaError_t err);

cudaError_t CudaCheckError();

void CudaAssertError(const cudaError_t err);

void CudaAssertError();

cudaError_t CudaMalloc(void **ptr, unsigned size);

cudaError_t CudaCopyToDevice(void *tgt, void const *src, unsigned size);

cudaError_t CudaCopyFromDevice(void *tgt, void const *src, unsigned size);

cudaError_t CudaCopyFromDeviceAsync(void *tgt, void const *src, unsigned size, cudaStream_t stream);

cudaError_t CudaFree(void *ptr);

cudaError_t CudaDeviceSetStackLimit(unsigned size);

cudaError_t CudaDeviceSetHeapLimit(unsigned size);

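// Typed device-allocation and copy helpers built on top of the wrappers above.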
template <typename Type>
Type *AllocateOnDevice()
{
  Type *ptr;
  vecgeom::cxx::CudaAssertError(vecgeom::cxx::CudaMalloc((void **)&ptr, sizeof(Type)));
  return ptr;
}

template <typename Type>
Type *AllocateOnGpu(const unsigned int size)
{
  Type *ptr;
  vecgeom::cxx::CudaAssertError(CudaMalloc((void **)&ptr, size));
  return ptr;
}

template <typename Type>
Type *AllocateOnGpu()
{
  return AllocateOnGpu<Type>(sizeof(Type));
}

template <typename Type>
void FreeFromGpu(Type *const ptr)
{
  vecgeom::cxx::CudaAssertError(CudaFree(ptr));
}

template <typename Type>
void CopyToGpu(Type const *const src, Type *const tgt, const unsigned size)
{
  vecgeom::cxx::CudaAssertError(CudaCopyToDevice(tgt, src, size));
}

template <typename Type>
void CopyToGpu(Type const *const src, Type *const tgt)
{
  CopyToGpu<Type>(src, tgt, sizeof(Type));
}

template <typename Type>
void CopyFromGpu(Type const *const src, Type *const tgt, const unsigned size)
{
  vecgeom::cxx::CudaAssertError(CudaCopyFromDevice(tgt, src, size));
}

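// Type-erased handle to a block of device memory: stores the raw device
// pointer and provides allocation, host/device copies, freeing and pointer
// arithmetic for the typed DevicePtr classes below.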
class DevicePtrBase {
  void *fPtr;
#ifdef DEBUG_DEVICEPTR
  size_t fAllocatedSize;
  bool fIncremented;
#endif

protected:
  DevicePtrBase(const DevicePtrBase &orig)
      : fPtr(orig.fPtr)
#ifdef DEBUG_DEVICEPTR
        ,
        fAllocatedSize(0), fIncremented(false)
#endif
  {
  }

  DevicePtrBase &operator=(const DevicePtrBase &orig)
  {
    fPtr = orig.fPtr;
#ifdef DEBUG_DEVICEPTR
    fAllocatedSize = orig.fAllocatedSize;
    fIncremented = orig.fIncremented;
#endif
    return *this;
  }

  void MemcpyToDevice(const void *what, unsigned long nbytes)
  {
    if (nbytes) vecgeom::cxx::CudaAssertError(vecgeom::cxx::CudaCopyToDevice(fPtr, what, nbytes));
  }

  void MemcpyToHostAsync(void *where, unsigned long nbytes, cudaStream_t stream)
  {
    vecgeom::cxx::CudaAssertError(vecgeom::cxx::CudaCopyFromDeviceAsync(where, fPtr, nbytes, stream));
  }

  VECCORE_ATT_HOST_DEVICE
  void *GetPtr() const { return fPtr; }

  void Free()
  {
    vecgeom::cxx::CudaAssertError(vecgeom::cxx::CudaFree((void *)fPtr));
#ifdef DEBUG_DEVICEPTR
    fAllocatedSize = 0;
#endif
  }

  void Increment(long add)
  {
    fPtr = (char *)fPtr + add;
#ifdef DEBUG_DEVICEPTR
    if (add) fIncremented = true;
#endif
  }

public:
  DevicePtrBase()
      : fPtr(0)
#ifdef DEBUG_DEVICEPTR
        ,
        fAllocatedSize(0), fIncremented(0)
#endif
  {
  }

  explicit DevicePtrBase(void *input)
      : fPtr(input)
#ifdef DEBUG_DEVICEPTR
        ,
        fAllocatedSize(0), fIncremented(0)
#endif
  {
  }

  ~DevicePtrBase() {}

  void Malloc(unsigned long size)
  {
    vecgeom::cxx::CudaAssertError(vecgeom::cxx::CudaMalloc((void **)&fPtr, size));
#ifdef DEBUG_DEVICEPTR
    fAllocatedSize = size;
#endif
  }
};

template <typename T>
class DevicePtr;

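// CRTP layer shared by DevicePtr<Type> and DevicePtr<const Type>: element-wise
// allocation and copy sizes are computed via Derived::SizeOf().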
template <typename Type, typename Derived = DevicePtr<Type>>
class DevicePtrImpl : public DevicePtrBase {
protected:
  DevicePtrImpl(const DevicePtrImpl &) = default;
  DevicePtrImpl &operator=(const DevicePtrImpl &) = default;
  DevicePtrImpl() = default;
  explicit DevicePtrImpl(void *input) : DevicePtrBase(input) {}
  ~DevicePtrImpl() = default;

public:
  void Allocate(unsigned long nelems = 1) { Malloc(nelems * Derived::SizeOf()); }

  void Deallocate() { Free(); }

  void ToDevice(const Type *what, unsigned long nelems = 1) { MemcpyToDevice(what, nelems * Derived::SizeOf()); }

  void FromDevice(Type *where, cudaStream_t stream)
  {
    // Asynchronous copy of a single element back to the host.
    MemcpyToHostAsync(where, Derived::SizeOf(), stream);
  }

  void FromDevice(Type *where, unsigned long nelems, cudaStream_t stream)
  {
    // Asynchronous copy of nelems elements back to the host.
    MemcpyToHostAsync(where, nelems * Derived::SizeOf(), stream);
  }

  VECCORE_ATT_HOST_DEVICE
  Type *GetPtr() const { return reinterpret_cast<Type *>(DevicePtrBase::GetPtr()); }

  VECCORE_ATT_HOST_DEVICE
  operator Type *() const { return GetPtr(); }

  Derived &operator++()
  {
    Increment(Derived::SizeOf());
    return *(Derived *)this;
  }

  Derived operator++(int)
  {
    Derived tmp(*(Derived *)this);
    Increment(Derived::SizeOf());
    return tmp;
  }

  Derived &operator+=(long len)
  {
    Increment(len * Derived::SizeOf());
    return *(Derived *)this;
  }
};

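// Typed device pointer. When compiled for the device (VECCORE_CUDA),
// Construct/ConstructArray launch kernels that placement-new objects into the
// memory it points to; otherwise those members are only declared here.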
template <typename Type>
class DevicePtr : public DevicePtrImpl<Type> {
public:
  DevicePtr() = default;
  DevicePtr(const DevicePtr &) = default;
  DevicePtr &operator=(const DevicePtr &orig) = default;

  explicit DevicePtr(void *input) : DevicePtrImpl<Type>(input) {}

  // Explicit conversion from a DevicePtr to any other element type.
  template <typename inputType>
  explicit DevicePtr(DevicePtr<inputType> const &input) : DevicePtrImpl<Type>((void *)input)
  {
  }

  // Conversion from DevicePtr<const Type> to DevicePtr<Type> is not allowed.
  DevicePtr(DevicePtr<const Type> const &input,
            typename std::enable_if<!std::is_const<Type>::value, Type>::type * = nullptr) = delete;

#ifdef VECCORE_CUDA
  // Implicit conversion from DevicePtr<Derived> to DevicePtr<Base> ...
  template <typename inputType, typename std::enable_if<std::is_base_of<Type, inputType>::value>::type * = nullptr>
  DevicePtr(DevicePtr<inputType> const &input) : DevicePtrImpl<Type>(input.GetPtr())
  {
  }

  // ... but not from DevicePtr<const Derived>.
  template <typename inputType, typename std::enable_if<std::is_base_of<Type, inputType>::value>::type * = nullptr>
  DevicePtr(DevicePtr<const inputType> const &input) = delete;
#endif

#ifdef VECCORE_CUDA
  template <typename... ArgsTypes>
  void Construct(ArgsTypes... params) const
  {
    ConstructOnGpu<<<1, 1>>>(this->GetPtr(), params...);
  }

  template <typename... ArgsTypes>
  void ConstructArray(size_t nElements, ArgsTypes... params) const
  {
    ConstructArrayOnGpu<<<nElements, 1>>>(this->GetPtr(), nElements, params...);
  }

  static size_t SizeOf() { return sizeof(Type); }

#else
  template <typename... ArgsTypes>
  void Construct(ArgsTypes... params) const;
  template <typename... ArgsTypes>
  void ConstructArray(size_t nElements, ArgsTypes... params) const;

  static size_t SizeOf();
#endif
};

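// Specialization of DevicePtr for const element types.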
template <typename Type>
class DevicePtr<const Type> : private DevicePtrImpl<const Type> {
public:
  DevicePtr() = default;
  DevicePtr(const DevicePtr &) = default;
  DevicePtr &operator=(const DevicePtr &orig) = default;

  explicit DevicePtr(void *input) : DevicePtrImpl<const Type>(input) {}

  // Explicit conversion from a DevicePtr to any other element type.
  template <typename inputType>
  explicit DevicePtr(DevicePtr<inputType> const &input) : DevicePtrImpl<const Type>((void *)input)
  {
  }

  // Implicit conversion from DevicePtr<Type> to DevicePtr<const Type>.
  DevicePtr(DevicePtr<typename std::remove_const<Type>::type> const &input) : DevicePtrImpl<const Type>((void *)input)
  {
  }

#ifdef VECCORE_CUDA
  // Implicit conversion from DevicePtr<Derived> to DevicePtr<const Base>.
  template <typename inputType, typename std::enable_if<std::is_base_of<Type, inputType>::value>::type * = nullptr>
  DevicePtr(DevicePtr<inputType> const &input) : DevicePtrImpl<const Type>(input.GetPtr())
  {
  }
#endif

  VECCORE_ATT_HOST_DEVICE
  const Type *GetPtr() const { return reinterpret_cast<const Type *>(DevicePtrBase::GetPtr()); }

  VECCORE_ATT_HOST_DEVICE
  operator const Type *() const { return GetPtr(); }

#ifdef VECCORE_CUDA
  template <typename DataClass, typename... ArgsTypes>
  void Construct(ArgsTypes... params) const
  {
    ConstructOnGpu<<<1, 1>>>(*(*this), params...);
  }

  template <typename... ArgsTypes>
  void ConstructArray(size_t nElements, ArgsTypes... params) const
  {
    ConstructArrayOnGpu<<<nElements, 1>>>(this->GetPtr(), nElements, params...);
  }

  static size_t SizeOf() { return sizeof(Type); }

#else
  template <typename... ArgsTypes>
  void Construct(ArgsTypes... params) const;
  template <typename... ArgsTypes>
  void ConstructArray(size_t nElements, ArgsTypes... params) const;

  static size_t SizeOf();
#endif
};

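// A minimal usage sketch (host side), assuming a hypothetical type MyShape
// that is default-constructible on the device; MyShape is a placeholder name,
// not a VecGeom class:
//
//   vecgeom::cxx::DevicePtr<MyShape> devPtr;
//   devPtr.Allocate();     // cudaMalloc of sizeof(MyShape)
//   devPtr.Construct();    // placement new on the device via ConstructOnGpu
//   /* ... use the raw pointer devPtr.GetPtr() in kernels ... */
//   devPtr.Deallocate();   // cudaFree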
namespace CudaInterfaceHelpers {

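// Allocates a device buffer for each host array (nElement entries each),
// copies the data over, and records the host-to-device pointer mapping in
// cpuToGpuMapping so the buffers can be freed later.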
template <typename Arg_t, typename... Args_t>
void allocateAndCopyToGpu(std::unordered_map<const void *, void *> &cpuToGpuMapping, std::size_t nElement,
                          const Arg_t *toCopy, const Args_t *... restToCopy)
{
  const auto nByte = sizeof(toCopy[0]) * nElement;
  const void *hostMem = toCopy;
  void *deviceMem = AllocateOnGpu<void *>(nByte);
  cpuToGpuMapping[hostMem] = deviceMem;
  CopyToGpu(hostMem, deviceMem, nByte);

#if __cplusplus >= 201703L
  // Recurse over the remaining arrays, if any.
  if constexpr (sizeof...(Args_t) > 0) {
    allocateAndCopyToGpu(cpuToGpuMapping, nElement, restToCopy...);
  }
#else
  // Pre-C++17: expand the parameter pack via an array initializer.
  int expandParameterPack[] = {0, ((void)allocateAndCopyToGpu(cpuToGpuMapping, nElement, restToCopy), 0)...};
  (void)expandParameterPack[0]; // Avoid unused-variable warnings.
#endif
}

} // namespace CudaInterfaceHelpers

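// Constructs nElement DataClass objects on the device, one at each address in
// gpu_ptrs; element i of every parameter array is forwarded to the constructor
// of object i. The parameter arrays are staged on the device and freed again
// after the kernel launch.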
template <class DataClass, class DevPtr_t, typename... Args_t>
void ConstructManyOnGpu(std::size_t nElement, const DevPtr_t *gpu_ptrs, const Args_t *... params)
#ifdef VECCORE_CUDA
{
  using namespace CudaInterfaceHelpers;
  std::unordered_map<const void *, void *> cpuToGpuMem;
  std::vector<DataClass *> raw_gpu_ptrs;
  std::transform(gpu_ptrs, gpu_ptrs + nElement, std::back_inserter(raw_gpu_ptrs),
                 [](const DevPtr_t &ptr) { return static_cast<DataClass *>(ptr.GetPtr()); });
  allocateAndCopyToGpu(cpuToGpuMem, nElement, raw_gpu_ptrs.data(), params...);

  ConstructManyOnGpu_kernel<<<128, 32>>>(raw_gpu_ptrs.size(),
                                         static_cast<decltype(raw_gpu_ptrs.data())>(cpuToGpuMem[raw_gpu_ptrs.data()]),
                                         static_cast<decltype(params)>(cpuToGpuMem[params])...);

  // Release the temporary device-side parameter buffers.
  for (const auto &memCpu_memGpu : cpuToGpuMem) {
    FreeFromGpu(memCpu_memGpu.second);
  }

  CudaCheckError();
}
#else
    ;
#endif

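// Copies the flat bounding-box array (six Precision values per volume) and the
// device pointers to the GPU, launches CopyBBoxesToGpu to apply them, then
// frees the staging buffers.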
template <class DataClass, class DevPtr_t>
void CopyBBoxesToGpuImpl(std::size_t nElement, const DevPtr_t *gpu_ptrs, Precision *boxes_data)
#ifdef VECCORE_CUDA
{
  std::vector<DataClass *> raw_gpu_ptrs;
  std::transform(gpu_ptrs, gpu_ptrs + nElement, std::back_inserter(raw_gpu_ptrs),
                 [](const DevPtr_t &ptr) { return static_cast<DataClass *>(ptr.GetPtr()); });

  const auto nByteBoxes = 6 * nElement * sizeof(Precision);
  const auto nByteVolumes = nElement * sizeof(DataClass *);
  Precision *boxes_data_gpu = AllocateOnGpu<Precision>(nByteBoxes);
  DataClass **raw_gpu_ptrs_gpu = AllocateOnGpu<DataClass *>(nByteVolumes);

  CopyToGpu(boxes_data, boxes_data_gpu, nByteBoxes);
  CopyToGpu(raw_gpu_ptrs.data(), raw_gpu_ptrs_gpu, nByteVolumes);
  cudaDeviceSynchronize();

  CopyBBoxesToGpu<DataClass><<<128, 32>>>(raw_gpu_ptrs.size(), raw_gpu_ptrs_gpu, boxes_data_gpu);

  FreeFromGpu(boxes_data_gpu);
  FreeFromGpu(raw_gpu_ptrs_gpu);
}
#else
    ;
#endif

} // namespace cxx

} // namespace vecgeom

#endif // VECGEOM_ENABLE_CUDA

#endif // VECGEOM_BACKEND_CUDA_INTERFACE_H_