File indexing completed on 2025-09-17 08:54:12
0001
0002
0003
0004
0005
0006
0007 #pragma once
0008
0009 #include <memory>
0010 #include <optional>
0011
0012 #include <cuda_runtime.h>
0013
0014 #include <covfie/cuda/error_check.hpp>
0015 #include <covfie/cuda/utility/unique_ptr.hpp>
0016
0017 namespace covfie::utility::cuda {
0018 template <typename T>
0019 unique_device_ptr<T> device_allocate()
0020 {
0021 static_assert(
0022 !(std::is_array_v<T> && std::extent_v<T> == 0),
0023 "Allocation pointer type cannot be an unbounded array."
0024 );
0025
0026 using pointer_t =
0027 std::conditional_t<std::is_array_v<T>, std::decay_t<T>, T *>;
0028
0029 pointer_t p;
0030
0031 cudaErrorCheck(cudaMalloc(&p, sizeof(T)));
0032
0033 return unique_device_ptr<T>(p);
0034 }
0035
0036 template <typename T>
0037 unique_device_ptr<T> device_allocate(std::size_t n)
0038 {
0039 static_assert(
0040 std::is_array_v<T>, "Allocation pointer type must be an array type."
0041 );
0042 static_assert(
0043 std::extent_v<T> == 0, "Allocation pointer type must be unbounded."
0044 );
0045
0046 using pointer_t =
0047 std::conditional_t<std::is_array_v<T>, std::decay_t<T>, T *>;
0048
0049 pointer_t p;
0050
0051 cudaErrorCheck(cudaMalloc(&p, n * sizeof(std::remove_extent_t<T>)));
0052
0053 return unique_device_ptr<T>(p);
0054 }
0055
0056 template <typename T>
0057 unique_device_ptr<T[]>
0058 device_copy_h2d(const T * h, std::optional<cudaStream_t> stream = std::nullopt)
0059 {
0060 unique_device_ptr<T[]> r = device_allocate<T[]>();
0061
0062 if (stream.has_value()) {
0063 cudaErrorCheck(cudaMemcpyAsync(
0064 r.get(), h, sizeof(T), cudaMemcpyHostToDevice, *stream
0065 ));
0066 cudaErrorCheck(cudaStreamSynchronize(*stream));
0067 } else {
0068 cudaErrorCheck(cudaMemcpy(r.get(), h, sizeof(T), cudaMemcpyHostToDevice)
0069 );
0070 }
0071
0072 return r;
0073 }
0074
0075 template <typename T>
0076 unique_device_ptr<T[]> device_copy_h2d(
0077 const T * h,
0078 std::size_t n,
0079 std::optional<cudaStream_t> stream = std::nullopt
0080 )
0081 {
0082 unique_device_ptr<T[]> r = device_allocate<T[]>(n);
0083
0084 if (stream.has_value()) {
0085 cudaErrorCheck(cudaMemcpyAsync(
0086 r.get(),
0087 h,
0088 n * sizeof(std::remove_extent_t<T>),
0089 cudaMemcpyHostToDevice,
0090 *stream
0091 ));
0092 cudaErrorCheck(cudaStreamSynchronize(*stream));
0093 } else {
0094 cudaErrorCheck(cudaMemcpy(
0095 r.get(),
0096 h,
0097 n * sizeof(std::remove_extent_t<T>),
0098 cudaMemcpyHostToDevice
0099 ));
0100 }
0101
0102 return r;
0103 }
0104
0105 template <typename T>
0106 unique_device_ptr<T[]>
0107 device_copy_d2d(const T * h, std::optional<cudaStream_t> stream = std::nullopt)
0108 {
0109 unique_device_ptr<T[]> r = device_allocate<T[]>();
0110
0111 if (stream.has_value()) {
0112 cudaErrorCheck(cudaMemcpyAsync(
0113 r.get(), h, sizeof(T), cudaMemcpyDeviceToDevice, *stream
0114 ));
0115 cudaErrorCheck(cudaStreamSynchronize(*stream));
0116 } else {
0117 cudaErrorCheck(
0118 cudaMemcpy(r.get(), h, sizeof(T), cudaMemcpyDeviceToDevice)
0119 );
0120 }
0121
0122 return r;
0123 }
0124
0125 template <typename T>
0126 unique_device_ptr<T[]> device_copy_d2d(
0127 const T * h,
0128 std::size_t n,
0129 std::optional<cudaStream_t> stream = std::nullopt
0130 )
0131 {
0132 unique_device_ptr<T[]> r = device_allocate<T[]>(n);
0133
0134 if (stream.has_value()) {
0135 cudaErrorCheck(cudaMemcpyAsync(
0136 r.get(),
0137 h,
0138 n * sizeof(std::remove_extent_t<T>),
0139 cudaMemcpyDeviceToDevice,
0140 *stream
0141 ));
0142 cudaErrorCheck(cudaStreamSynchronize(*stream));
0143 } else {
0144 cudaErrorCheck(cudaMemcpy(
0145 r.get(),
0146 h,
0147 n * sizeof(std::remove_extent_t<T>),
0148 cudaMemcpyDeviceToDevice
0149 ));
0150 }
0151
0152 return r;
0153 }
0154 }