File indexing completed on 2025-01-18 09:54:49
0001
0002
0003
0004
0005
0006
0007
0008 #pragma once
0009
0010 #include <cstddef>
0011 #include <string_view>
0012
0013 #include "corecel/DeviceRuntimeApi.hh"
0014
0015 #include "corecel/Assert.hh"
0016 #include "corecel/Macros.hh"
0017 #include "corecel/OpaqueId.hh"
0018 #include "corecel/Types.hh"
0019 #include "corecel/math/Algorithms.hh"
0020
0021 #include "Device.hh"
0022 #include "KernelAttributes.hh"
0023 #include "ThreadId.hh" // IWYU pragma: export
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033 #define CELER_LAUNCH_KERNEL(NAME, THREADS, STREAM, ...) \
0034 do \
0035 { \
0036 static const ::celeritas::KernelParamCalculator calc_launch_params_( \
0037 #NAME, NAME##_kernel); \
0038 auto grid_ = calc_launch_params_(THREADS); \
0039 \
0040 CELER_LAUNCH_KERNEL_IMPL(NAME##_kernel, \
0041 grid_.blocks_per_grid, \
0042 grid_.threads_per_block, \
0043 0, \
0044 STREAM, \
0045 __VA_ARGS__); \
0046 CELER_DEVICE_CHECK_ERROR(); \
0047 } while (0)
0048
0049
0050
0051
0052
0053
0054
0055
0056 #define CELER_LAUNCH_KERNEL_TEMPLATE_1(NAME, T1, THREADS, STREAM, ...) \
0057 do \
0058 { \
0059 static const ::celeritas::KernelParamCalculator calc_launch_params_( \
0060 #NAME, NAME##_kernel<T1>); \
0061 auto grid_ = calc_launch_params_(THREADS); \
0062 \
0063 CELER_LAUNCH_KERNEL_IMPL(NAME##_kernel<T1>, \
0064 grid_.blocks_per_grid, \
0065 grid_.threads_per_block, \
0066 0, \
0067 STREAM, \
0068 __VA_ARGS__); \
0069 CELER_DEVICE_CHECK_ERROR(); \
0070 } while (0)
0071
0072 #if CELERITAS_USE_CUDA
0073 # define CELER_LAUNCH_KERNEL_IMPL(KERNEL, GRID, BLOCK, SHARED, STREAM, ...) \
0074 KERNEL<<<GRID, BLOCK, SHARED, STREAM>>>(__VA_ARGS__)
0075 #elif CELERITAS_USE_HIP
0076 # define CELER_LAUNCH_KERNEL_IMPL(KERNEL, GRID, BLOCK, SHARED, STREAM, ...) \
0077 hipLaunchKernelGGL(KERNEL, GRID, BLOCK, SHARED, STREAM, __VA_ARGS__)
0078 #else
0079 # define CELER_LAUNCH_KERNEL_IMPL(KERNEL, GRID, BLOCK, SHARED, STREAM, ...) \
0080 CELER_NOT_CONFIGURED("CUDA or HIP"); \
0081 CELER_DISCARD(GRID) \
0082 CELER_DISCARD(KERNEL) \
0083 CELER_DISCARD(__VA_ARGS__);
0084 #endif
0085
0086 namespace celeritas
0087 {
0088
0089 struct KernelProfiling;
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111
0112 class KernelParamCalculator
0113 {
0114 public:
0115
0116
0117 using dim_type = unsigned int;
0118
0119
0120
0121 struct LaunchParams
0122 {
0123 dim3 blocks_per_grid;
0124 dim3 threads_per_block;
0125 };
0126
0127 public:
0128
0129 inline CELER_FUNCTION static ThreadId thread_id();
0130
0131
0132
0133
0134 template<class F>
0135 inline KernelParamCalculator(std::string_view name, F* kernel_func_ptr);
0136
0137
0138 template<class F>
0139 inline KernelParamCalculator(std::string_view name,
0140 F* kernel_func_ptr,
0141 dim_type threads_per_block);
0142
0143
0144 inline LaunchParams operator()(size_type min_num_threads) const;
0145
0146 private:
0147
0148 dim_type block_size_;
0149
0150 KernelProfiling* profiling_{nullptr};
0151
0152
0153
0154 void register_kernel(std::string_view name, KernelAttributes&& attributes);
0155 void log_launch(size_type min_num_threads) const;
0156 };
0157
0158
0159
0160
0161
0162
0163
0164 CELER_FUNCTION auto KernelParamCalculator::thread_id() -> ThreadId
0165 {
0166 #if CELER_DEVICE_COMPILE
0167 return ThreadId{blockIdx.x * blockDim.x + threadIdx.x};
0168 #else
0169
0170 CELER_ASSERT_UNREACHABLE();
0171 #endif
0172 }
0173
0174
0175
0176
0177
0178 template<class F>
0179 KernelParamCalculator::KernelParamCalculator(std::string_view name,
0180 F* kernel_func_ptr)
0181 {
0182 auto attrs = make_kernel_attributes(kernel_func_ptr);
0183 CELER_ASSERT(attrs.threads_per_block > 0);
0184 block_size_ = attrs.threads_per_block;
0185 this->register_kernel(name, std::move(attrs));
0186 }
0187
0188
0189
0190
0191
0192
0193
0194
0195 template<class F>
0196 KernelParamCalculator::KernelParamCalculator(std::string_view name,
0197 F* kernel_func_ptr,
0198 dim_type threads_per_block)
0199 : block_size_(threads_per_block)
0200 {
0201 CELER_EXPECT(threads_per_block > 0
0202 && threads_per_block % celeritas::device().threads_per_warp()
0203 == 0);
0204
0205 auto attrs = make_kernel_attributes(kernel_func_ptr, threads_per_block);
0206 CELER_VALIDATE(threads_per_block <= attrs.max_threads_per_block,
0207 << "requested GPU threads per block " << threads_per_block
0208 << " exceeds kernel maximum "
0209 << attrs.max_threads_per_block);
0210 this->register_kernel(name, std::move(attrs));
0211 }
0212
0213
0214
0215
0216
0217 auto KernelParamCalculator::operator()(size_type min_num_threads) const
0218 -> LaunchParams
0219 {
0220 CELER_EXPECT(min_num_threads > 0);
0221
0222
0223 if (profiling_)
0224 {
0225 this->log_launch(min_num_threads);
0226 }
0227
0228
0229 dim_type blocks_per_grid
0230 = celeritas::ceil_div<dim_type>(min_num_threads, this->block_size_);
0231 CELER_ASSERT(blocks_per_grid
0232 < dim_type(celeritas::device().max_blocks_per_grid()));
0233
0234 LaunchParams result;
0235 result.blocks_per_grid.x = blocks_per_grid;
0236 result.threads_per_block.x = this->block_size_;
0237 CELER_ENSURE(result.blocks_per_grid.x * result.threads_per_block.x
0238 >= min_num_threads);
0239 return result;
0240 }
0241
0242
0243 }