cuda/utility/memory.hpp

0001 /*
0002  * SPDX-PackageName: "covfie, a part of the ACTS project"
0003  * SPDX-FileCopyrightText: 2022 CERN
0004  * SPDX-License-Identifier: MPL-2.0
0005  */
0006
0007 #pragma once
0008
0009 #include <memory>
0010 #include <optional>
0011
0012 #include <cuda_runtime.h>
0013
0014 #include <covfie/cuda/error_check.hpp>
0015 #include <covfie/cuda/utility/unique_ptr.hpp>
0016
0017 namespace covfie::utility::cuda {
0018 template <typename T>
0019 unique_device_ptr<T> device_allocate()
0020 {
0021     static_assert(
0022         !(std::is_array_v<T> && std::extent_v<T> == 0),
0023         "Allocation pointer type cannot be an unbounded array."
0024     );
0025
0026     using pointer_t =
0027         std::conditional_t<std::is_array_v<T>, std::decay_t<T>, T *>;
0028
0029     pointer_t p;
0030
0031     cudaErrorCheck(cudaMalloc(&p, sizeof(T)));
0032
0033     return unique_device_ptr<T>(p);
0034 }
0035
0036 template <typename T>
0037 unique_device_ptr<T> device_allocate(std::size_t n)
0038 {
0039     static_assert(
0040         std::is_array_v<T>, "Allocation pointer type must be an array type."
0041     );
0042     static_assert(
0043         std::extent_v<T> == 0, "Allocation pointer type must be unbounded."
0044     );
0045
0046     using pointer_t =
0047         std::conditional_t<std::is_array_v<T>, std::decay_t<T>, T *>;
0048
0049     pointer_t p;
0050
0051     cudaErrorCheck(cudaMalloc(&p, n * sizeof(std::remove_extent_t<T>)));
0052
0053     return unique_device_ptr<T>(p);
0054 }
0055
0056 template <typename T>
0057 unique_device_ptr<T[]>
0058 device_copy_h2d(const T * h, std::optional<cudaStream_t> stream = std::nullopt)
0059 {
0060     unique_device_ptr<T[]> r = device_allocate<T[]>();
0061
0062     if (stream.has_value()) {
0063         cudaErrorCheck(cudaMemcpyAsync(
0064             r.get(), h, sizeof(T), cudaMemcpyHostToDevice, *stream
0065         ));
0066         cudaErrorCheck(cudaStreamSynchronize(*stream));
0067     } else {
0068         cudaErrorCheck(cudaMemcpy(r.get(), h, sizeof(T), cudaMemcpyHostToDevice)
0069         );
0070     }
0071
0072     return r;
0073 }
0074
0075 template <typename T>
0076 unique_device_ptr<T[]> device_copy_h2d(
0077     const T * h,
0078     std::size_t n,
0079     std::optional<cudaStream_t> stream = std::nullopt
0080 )
0081 {
0082     unique_device_ptr<T[]> r = device_allocate<T[]>(n);
0083
0084     if (stream.has_value()) {
0085         cudaErrorCheck(cudaMemcpyAsync(
0086             r.get(),
0087             h,
0088             n * sizeof(std::remove_extent_t<T>),
0089             cudaMemcpyHostToDevice,
0090             *stream
0091         ));
0092         cudaErrorCheck(cudaStreamSynchronize(*stream));
0093     } else {
0094         cudaErrorCheck(cudaMemcpy(
0095             r.get(),
0096             h,
0097             n * sizeof(std::remove_extent_t<T>),
0098             cudaMemcpyHostToDevice
0099         ));
0100     }
0101
0102     return r;
0103 }
0104
0105 template <typename T>
0106 unique_device_ptr<T[]>
0107 device_copy_d2d(const T * h, std::optional<cudaStream_t> stream = std::nullopt)
0108 {
0109     unique_device_ptr<T[]> r = device_allocate<T[]>();
0110
0111     if (stream.has_value()) {
0112         cudaErrorCheck(cudaMemcpyAsync(
0113             r.get(), h, sizeof(T), cudaMemcpyDeviceToDevice, *stream
0114         ));
0115         cudaErrorCheck(cudaStreamSynchronize(*stream));
0116     } else {
0117         cudaErrorCheck(
0118             cudaMemcpy(r.get(), h, sizeof(T), cudaMemcpyDeviceToDevice)
0119         );
0120     }
0121
0122     return r;
0123 }
0124
0125 template <typename T>
0126 unique_device_ptr<T[]> device_copy_d2d(
0127     const T * h,
0128     std::size_t n,
0129     std::optional<cudaStream_t> stream = std::nullopt
0130 )
0131 {
0132     unique_device_ptr<T[]> r = device_allocate<T[]>(n);
0133
0134     if (stream.has_value()) {
0135         cudaErrorCheck(cudaMemcpyAsync(
0136             r.get(),
0137             h,
0138             n * sizeof(std::remove_extent_t<T>),
0139             cudaMemcpyDeviceToDevice,
0140             *stream
0141         ));
0142         cudaErrorCheck(cudaStreamSynchronize(*stream));
0143     } else {
0144         cudaErrorCheck(cudaMemcpy(
0145             r.get(),
0146             h,
0147             n * sizeof(std::remove_extent_t<T>),
0148             cudaMemcpyDeviceToDevice
0149         ));
0150     }
0151
0152     return r;
0153 }
0154 }