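// Host/device interface of the VecGeom CUDA backend: thin wrappers around the
// CUDA runtime (allocation, copies, error checking) and the DevicePtr<T>
// helper classes used to construct and manage VecGeom objects in device memory.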
#ifndef VECGEOM_BACKEND_CUDA_INTERFACE_H_
#define VECGEOM_BACKEND_CUDA_INTERFACE_H_

#include "VecGeom/base/Config.h"
#include "VecGeom/base/Global.h"

#ifdef VECGEOM_ENABLE_CUDA

#include "driver_types.h" // Required for cudaError_t type
#include "cuda_runtime.h"

#include <algorithm> // std::transform
#include <iterator>  // std::back_inserter
#include <type_traits>
#include <unordered_map>
#include <vector>

namespace vecgeom {

#ifdef VECCORE_CUDA

inline namespace cuda {

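// Constructs one DataClass object in pre-allocated device memory (gpu_ptr) via
// placement new, forwarding the given constructor arguments.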
template <typename DataClass, typename... ArgsTypes>
__global__ void ConstructOnGpu(DataClass *gpu_ptr, ArgsTypes... params)
{
  new (gpu_ptr) DataClass(params...);
}

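// Constructs nElements DataClass objects in a pre-allocated device array,
// one placement new per element, using a grid-stride loop.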
template <typename DataClass, typename... ArgsTypes>
__global__ void ConstructArrayOnGpu(DataClass *gpu_ptr, size_t nElements, ArgsTypes... params)
{
  unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

  unsigned int idx = tid;
  while (idx < nElements) {
    new (gpu_ptr + idx) DataClass(params...);
    idx += blockDim.x * gridDim.x;
  }
}

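// Constructs nElements objects, each at its own device address gpu_ptrs[idx];
// element idx of every parameter array is passed to the respective constructor.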
template <typename DataClass, typename... ArgsTypes>
__global__ void ConstructManyOnGpu_kernel(size_t nElements, DataClass **gpu_ptrs, const ArgsTypes *... params)
{
  const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;

  for (size_t idx = tid; idx < nElements; idx += blockDim.x * gridDim.x) {
    new (gpu_ptrs[idx]) DataClass(params[idx]...);
  }
}

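// Sets the bounding box of each volume from a flat array holding six Precision
// values per element, passed to SetBBox as two three-component corner points.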
template <typename DataClass>
__global__ void CopyBBoxesToGpu(size_t nElements, DataClass **raw_ptrs, Precision *boxes)
{
  const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;

  for (size_t idx = tid; idx < nElements; idx += blockDim.x * gridDim.x) {
    raw_ptrs[idx]->SetBBox({boxes[6 * idx], boxes[6 * idx + 1], boxes[6 * idx + 2]},
                           {boxes[6 * idx + 3], boxes[6 * idx + 4], boxes[6 * idx + 5]});
  }
}

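// Constructs a single object in device memory from the host by launching
// ConstructOnGpu with one thread.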
template <typename DataClass, typename... ArgsTypes>
void Generic_CopyToGpu(DataClass *const gpu_ptr, ArgsTypes... params)
{
  ConstructOnGpu<<<1, 1>>>(gpu_ptr, params...);
}

} // namespace cuda

#else

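// Host-side declarations of the device-construction helpers above, used when
// this header is compiled without nvcc (VECCORE_CUDA not defined).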
namespace cuda {

template <typename Type>
Type *AllocateOnDevice();
template <typename DataClass, typename... ArgsTypes>
void Generic_CopyToGpu(DataClass *const gpu_ptr, ArgsTypes... params);

} // namespace cuda

#endif

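// Host-side interface. When compiling device code (VECCORE_CUDA defined) these
// symbols live in the named namespace vecgeom::cxx; in a plain C++ translation
// unit, cxx is the inline (default) namespace.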
#ifdef VECCORE_CUDA
namespace cxx {
#else
inline namespace cxx {
#endif

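// Thin wrappers around the CUDA runtime API (error checking, allocation,
// copies, device limits); the definitions are provided elsewhere in the
// CUDA backend.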
cudaError_t CudaCheckError(const cudaError_t err);

cudaError_t CudaCheckError();

void CudaAssertError(const cudaError_t err);

void CudaAssertError();

cudaError_t CudaMalloc(void **ptr, unsigned size);

cudaError_t CudaCopyToDevice(void *tgt, void const *src, unsigned size);

cudaError_t CudaCopyFromDevice(void *tgt, void const *src, unsigned size);

cudaError_t CudaCopyFromDeviceAsync(void *tgt, void const *src, unsigned size, cudaStream_t stream);

cudaError_t CudaFree(void *ptr);

cudaError_t CudaDeviceSetStackLimit(unsigned size);

cudaError_t CudaDeviceSetHeapLimit(unsigned size);

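// Typed device-allocation and copy helpers built on top of the wrappers above.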
template <typename Type>
Type *AllocateOnDevice()
{
  Type *ptr;
  vecgeom::cxx::CudaAssertError(vecgeom::cxx::CudaMalloc((void **)&ptr, sizeof(Type)));
  return ptr;
}

template <typename Type>
Type *AllocateOnGpu(const unsigned int size)
{
  Type *ptr;
  vecgeom::cxx::CudaAssertError(CudaMalloc((void **)&ptr, size));
  return ptr;
}

template <typename Type>
Type *AllocateOnGpu()
{
  return AllocateOnGpu<Type>(sizeof(Type));
}

template <typename Type>
void FreeFromGpu(Type *const ptr)
{
  vecgeom::cxx::CudaAssertError(CudaFree(ptr));
}

template <typename Type>
void CopyToGpu(Type const *const src, Type *const tgt, const unsigned size)
{
  vecgeom::cxx::CudaAssertError(CudaCopyToDevice(tgt, src, size));
}

template <typename Type>
void CopyToGpu(Type const *const src, Type *const tgt)
{
  CopyToGpu<Type>(src, tgt, sizeof(Type));
}

template <typename Type>
void CopyFromGpu(Type const *const src, Type *const tgt, const unsigned size)
{
  vecgeom::cxx::CudaAssertError(CudaCopyFromDevice(tgt, src, size));
}

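// Type-erased handle to a block of device memory: stores the raw device
// pointer and provides allocation, host/device copies, freeing and pointer
// arithmetic for the typed DevicePtr classes below.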
class DevicePtrBase {
  void *fPtr;
#ifdef DEBUG_DEVICEPTR
  size_t fAllocatedSize;
  bool fIncremented;
#endif

protected:
  DevicePtrBase(const DevicePtrBase &orig)
      : fPtr(orig.fPtr)
#ifdef DEBUG_DEVICEPTR
        ,
        fAllocatedSize(0), fIncremented(false)
#endif
  {
  }

  DevicePtrBase &operator=(const DevicePtrBase &orig)
  {
    fPtr = orig.fPtr;
#ifdef DEBUG_DEVICEPTR
    fAllocatedSize = orig.fAllocatedSize;
    fIncremented = orig.fIncremented;
#endif
    return *this;
  }

  void MemcpyToDevice(const void *what, unsigned long nbytes)
  {
    if (nbytes) vecgeom::cxx::CudaAssertError(vecgeom::cxx::CudaCopyToDevice(fPtr, what, nbytes));
  }

  void MemcpyToHostAsync(void *where, unsigned long nbytes, cudaStream_t stream)
  {
    vecgeom::cxx::CudaAssertError(vecgeom::cxx::CudaCopyFromDeviceAsync(where, fPtr, nbytes, stream));
  }

  VECCORE_ATT_HOST_DEVICE
  void *GetPtr() const { return fPtr; }

  void Free()
  {
    vecgeom::cxx::CudaAssertError(vecgeom::cxx::CudaFree((void *)fPtr));
#ifdef DEBUG_DEVICEPTR
    fAllocatedSize = 0;
#endif
  }

  void Increment(long add)
  {
    fPtr = (char *)fPtr + add;
#ifdef DEBUG_DEVICEPTR
    if (add) fIncremented = true;
#endif
  }

public:
  DevicePtrBase()
      : fPtr(0)
#ifdef DEBUG_DEVICEPTR
        ,
        fAllocatedSize(0), fIncremented(0)
#endif
  {
  }

  explicit DevicePtrBase(void *input)
      : fPtr(input)
#ifdef DEBUG_DEVICEPTR
        ,
        fAllocatedSize(0), fIncremented(0)
#endif
  {
  }

  ~DevicePtrBase() {}

  void Malloc(unsigned long size)
  {
    vecgeom::cxx::CudaAssertError(vecgeom::cxx::CudaMalloc((void **)&fPtr, size));
#ifdef DEBUG_DEVICEPTR
    fAllocatedSize = size;
#endif
  }
};

template <typename T>
class DevicePtr;

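// CRTP layer shared by DevicePtr<Type> and DevicePtr<const Type>: element-wise
// allocation and copy sizes are computed via Derived::SizeOf().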
template <typename Type, typename Derived = DevicePtr<Type>>
class DevicePtrImpl : public DevicePtrBase {
protected:
  DevicePtrImpl(const DevicePtrImpl &) = default;
  DevicePtrImpl &operator=(const DevicePtrImpl &) = default;
  DevicePtrImpl() = default;
  explicit DevicePtrImpl(void *input) : DevicePtrBase(input) {}
  ~DevicePtrImpl() = default;

public:
  void Allocate(unsigned long nelems = 1) { Malloc(nelems * Derived::SizeOf()); }

  void Deallocate() { Free(); }

  void ToDevice(const Type *what, unsigned long nelems = 1) { MemcpyToDevice(what, nelems * Derived::SizeOf()); }

  void FromDevice(Type *where, cudaStream_t stream)
  {
    // Asynchronous copy of a single element back to the host.
    MemcpyToHostAsync(where, Derived::SizeOf(), stream);
  }

  void FromDevice(Type *where, unsigned long nelems, cudaStream_t stream)
  {
    // Asynchronous copy of nelems elements back to the host.
    MemcpyToHostAsync(where, nelems * Derived::SizeOf(), stream);
  }

  VECCORE_ATT_HOST_DEVICE
  Type *GetPtr() const { return reinterpret_cast<Type *>(DevicePtrBase::GetPtr()); }

  VECCORE_ATT_HOST_DEVICE
  operator Type *() const { return GetPtr(); }

  Derived &operator++()
  {
    Increment(Derived::SizeOf());
    return *(Derived *)this;
  }

  Derived operator++(int)
  {
    Derived tmp(*(Derived *)this);
    Increment(Derived::SizeOf());
    return tmp;
  }

  Derived &operator+=(long len)
  {
    Increment(len * Derived::SizeOf());
    return *(Derived *)this;
  }
};

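// Typed device pointer. When compiled for the device (VECCORE_CUDA),
// Construct/ConstructArray launch kernels that placement-new objects into the
// memory it points to; otherwise those members are only declared here.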
template <typename Type>
class DevicePtr : public DevicePtrImpl<Type> {
public:
  DevicePtr() = default;
  DevicePtr(const DevicePtr &) = default;
  DevicePtr &operator=(const DevicePtr &orig) = default;

  explicit DevicePtr(void *input) : DevicePtrImpl<Type>(input) {}

  // Explicit conversion from a DevicePtr to any other element type.
  template <typename inputType>
  explicit DevicePtr(DevicePtr<inputType> const &input) : DevicePtrImpl<Type>((void *)input)
  {
  }

  // Conversion from DevicePtr<const Type> to DevicePtr<Type> is not allowed.
  DevicePtr(DevicePtr<const Type> const &input,
            typename std::enable_if<!std::is_const<Type>::value, Type>::type * = nullptr) = delete;

#ifdef VECCORE_CUDA
  // Implicit conversion from DevicePtr<Derived> to DevicePtr<Base> ...
  template <typename inputType, typename std::enable_if<std::is_base_of<Type, inputType>::value>::type * = nullptr>
  DevicePtr(DevicePtr<inputType> const &input) : DevicePtrImpl<Type>(input.GetPtr())
  {
  }

  // ... but not from DevicePtr<const Derived>.
  template <typename inputType, typename std::enable_if<std::is_base_of<Type, inputType>::value>::type * = nullptr>
  DevicePtr(DevicePtr<const inputType> const &input) = delete;
#endif

#ifdef VECCORE_CUDA
  template <typename... ArgsTypes>
  void Construct(ArgsTypes... params) const
  {
    ConstructOnGpu<<<1, 1>>>(this->GetPtr(), params...);
  }

  template <typename... ArgsTypes>
  void ConstructArray(size_t nElements, ArgsTypes... params) const
  {
    ConstructArrayOnGpu<<<nElements, 1>>>(this->GetPtr(), nElements, params...);
  }

  static size_t SizeOf() { return sizeof(Type); }

#else
  template <typename... ArgsTypes>
  void Construct(ArgsTypes... params) const;
  template <typename... ArgsTypes>
  void ConstructArray(size_t nElements, ArgsTypes... params) const;

  static size_t SizeOf();
#endif
};

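// Specialization of DevicePtr for const element types.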
template <typename Type>
class DevicePtr<const Type> : private DevicePtrImpl<const Type> {
public:
  DevicePtr() = default;
  DevicePtr(const DevicePtr &) = default;
  DevicePtr &operator=(const DevicePtr &orig) = default;

  explicit DevicePtr(void *input) : DevicePtrImpl<const Type>(input) {}

  // Explicit conversion from a DevicePtr to any other element type.
  template <typename inputType>
  explicit DevicePtr(DevicePtr<inputType> const &input) : DevicePtrImpl<const Type>((void *)input)
  {
  }

  // Implicit conversion from DevicePtr<Type> to DevicePtr<const Type>.
  DevicePtr(DevicePtr<typename std::remove_const<Type>::type> const &input) : DevicePtrImpl<const Type>((void *)input)
  {
  }

#ifdef VECCORE_CUDA
  // Implicit conversion from DevicePtr<Derived> to DevicePtr<const Base>.
  template <typename inputType, typename std::enable_if<std::is_base_of<Type, inputType>::value>::type * = nullptr>
  DevicePtr(DevicePtr<inputType> const &input) : DevicePtrImpl<const Type>(input.GetPtr())
  {
  }
#endif

  VECCORE_ATT_HOST_DEVICE
  const Type *GetPtr() const { return reinterpret_cast<const Type *>(DevicePtrBase::GetPtr()); }

  VECCORE_ATT_HOST_DEVICE
  operator const Type *() const { return GetPtr(); }

#ifdef VECCORE_CUDA
  template <typename DataClass, typename... ArgsTypes>
  void Construct(ArgsTypes... params) const
  {
    ConstructOnGpu<<<1, 1>>>(*(*this), params...);
  }

  template <typename... ArgsTypes>
  void ConstructArray(size_t nElements, ArgsTypes... params) const
  {
    ConstructArrayOnGpu<<<nElements, 1>>>(this->GetPtr(), nElements, params...);
  }

  static size_t SizeOf() { return sizeof(Type); }

#else
  template <typename... ArgsTypes>
  void Construct(ArgsTypes... params) const;
  template <typename... ArgsTypes>
  void ConstructArray(size_t nElements, ArgsTypes... params) const;

  static size_t SizeOf();
#endif
};

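// A minimal usage sketch (host side), assuming a hypothetical type MyShape
// that is default-constructible on the device; MyShape is a placeholder name,
// not a VecGeom class:
//
//   vecgeom::cxx::DevicePtr<MyShape> devPtr;
//   devPtr.Allocate();     // cudaMalloc of sizeof(MyShape)
//   devPtr.Construct();    // placement new on the device via ConstructOnGpu
//   /* ... use the raw pointer devPtr.GetPtr() in kernels ... */
//   devPtr.Deallocate();   // cudaFree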
namespace CudaInterfaceHelpers {

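// Allocates a device buffer for each host array (nElement entries each),
// copies the data over, and records the host-to-device pointer mapping in
// cpuToGpuMapping so the buffers can be freed later.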
template <typename Arg_t, typename... Args_t>
void allocateAndCopyToGpu(std::unordered_map<const void *, void *> &cpuToGpuMapping, std::size_t nElement,
                          const Arg_t *toCopy, const Args_t *... restToCopy)
{
  const auto nByte = sizeof(toCopy[0]) * nElement;
  const void *hostMem = toCopy;
  void *deviceMem = AllocateOnGpu<void *>(nByte);
  cpuToGpuMapping[hostMem] = deviceMem;
  CopyToGpu(hostMem, deviceMem, nByte);

#if __cplusplus >= 201703L
  // Recurse over the remaining arrays, if any.
  if constexpr (sizeof...(Args_t) > 0) {
    allocateAndCopyToGpu(cpuToGpuMapping, nElement, restToCopy...);
  }
#else
  // Pre-C++17: expand the parameter pack via an array initializer.
  int expandParameterPack[] = {0, ((void)allocateAndCopyToGpu(cpuToGpuMapping, nElement, restToCopy), 0)...};
  (void)expandParameterPack[0]; // Avoid unused-variable warnings.
#endif
}

} // namespace CudaInterfaceHelpers

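// Constructs nElement DataClass objects on the device, one at each address in
// gpu_ptrs; element i of every parameter array is forwarded to the constructor
// of object i. The parameter arrays are staged on the device and freed again
// after the kernel launch.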
template <class DataClass, class DevPtr_t, typename... Args_t>
void ConstructManyOnGpu(std::size_t nElement, const DevPtr_t *gpu_ptrs, const Args_t *... params)
#ifdef VECCORE_CUDA
{
  using namespace CudaInterfaceHelpers;
  std::unordered_map<const void *, void *> cpuToGpuMem;
  std::vector<DataClass *> raw_gpu_ptrs;
  std::transform(gpu_ptrs, gpu_ptrs + nElement, std::back_inserter(raw_gpu_ptrs),
                 [](const DevPtr_t &ptr) { return static_cast<DataClass *>(ptr.GetPtr()); });
  allocateAndCopyToGpu(cpuToGpuMem, nElement, raw_gpu_ptrs.data(), params...);

  ConstructManyOnGpu_kernel<<<128, 32>>>(raw_gpu_ptrs.size(),
                                         static_cast<decltype(raw_gpu_ptrs.data())>(cpuToGpuMem[raw_gpu_ptrs.data()]),
                                         static_cast<decltype(params)>(cpuToGpuMem[params])...);

  // Release the temporary device-side parameter buffers.
  for (const auto &memCpu_memGpu : cpuToGpuMem) {
    FreeFromGpu(memCpu_memGpu.second);
  }

  CudaCheckError();
}
#else
    ;
#endif

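// Copies the flat bounding-box array (six Precision values per volume) and the
// device pointers to the GPU, launches CopyBBoxesToGpu to apply them, then
// frees the staging buffers.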
template <class DataClass, class DevPtr_t>
void CopyBBoxesToGpuImpl(std::size_t nElement, const DevPtr_t *gpu_ptrs, Precision *boxes_data)
#ifdef VECCORE_CUDA
{
  std::vector<DataClass *> raw_gpu_ptrs;
  std::transform(gpu_ptrs, gpu_ptrs + nElement, std::back_inserter(raw_gpu_ptrs),
                 [](const DevPtr_t &ptr) { return static_cast<DataClass *>(ptr.GetPtr()); });

  const auto nByteBoxes = 6 * nElement * sizeof(Precision);
  const auto nByteVolumes = nElement * sizeof(DataClass *);
  Precision *boxes_data_gpu = AllocateOnGpu<Precision>(nByteBoxes);
  DataClass **raw_gpu_ptrs_gpu = AllocateOnGpu<DataClass *>(nByteVolumes);

  CopyToGpu(boxes_data, boxes_data_gpu, nByteBoxes);
  CopyToGpu(raw_gpu_ptrs.data(), raw_gpu_ptrs_gpu, nByteVolumes);
  cudaDeviceSynchronize();

  CopyBBoxesToGpu<DataClass><<<128, 32>>>(raw_gpu_ptrs.size(), raw_gpu_ptrs_gpu, boxes_data_gpu);

  FreeFromGpu(boxes_data_gpu);
  FreeFromGpu(raw_gpu_ptrs_gpu);
}
#else
    ;
#endif

} // namespace cxx

} // namespace vecgeom

#endif // VECGEOM_ENABLE_CUDA

#endif // VECGEOM_BACKEND_CUDA_INTERFACE_H_