File indexing completed on 2025-12-28 09:41:23
0001
0002
0003
0004
0005
0006
0007 #pragma once
0008
0009 #include <cstddef>
0010 #include <string_view>
0011
0012 #include "corecel/DeviceRuntimeApi.hh"
0013
0014 #include "corecel/Assert.hh"
0015 #include "corecel/Macros.hh"
0016 #include "corecel/OpaqueId.hh"
0017 #include "corecel/Types.hh"
0018 #include "corecel/math/Algorithms.hh"
0019
0020 #include "Device.hh"
0021 #include "KernelAttributes.hh"
0022 #include "ThreadId.hh" // IWYU pragma: export
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032 #define CELER_LAUNCH_KERNEL(NAME, THREADS, STREAM, ...) \
0033 do \
0034 { \
0035 static const ::celeritas::KernelParamCalculator calc_launch_params_( \
0036 #NAME, NAME##_kernel); \
0037 auto grid_ = calc_launch_params_(THREADS); \
0038 \
0039 CELER_LAUNCH_KERNEL_IMPL(NAME##_kernel, \
0040 grid_.blocks_per_grid, \
0041 grid_.threads_per_block, \
0042 0, \
0043 STREAM, \
0044 __VA_ARGS__); \
0045 CELER_DEVICE_API_CALL(PeekAtLastError()); \
0046 } while (0)
0047
0048
0049
0050
0051
0052
0053
0054
0055 #define CELER_LAUNCH_KERNEL_TEMPLATE_1(NAME, T1, THREADS, STREAM, ...) \
0056 do \
0057 { \
0058 static const ::celeritas::KernelParamCalculator calc_launch_params_( \
0059 #NAME, NAME##_kernel<T1>); \
0060 auto grid_ = calc_launch_params_(THREADS); \
0061 \
0062 CELER_LAUNCH_KERNEL_IMPL(NAME##_kernel<T1>, \
0063 grid_.blocks_per_grid, \
0064 grid_.threads_per_block, \
0065 0, \
0066 STREAM, \
0067 __VA_ARGS__); \
0068 CELER_DEVICE_API_CALL(PeekAtLastError()); \
0069 } while (0)
0070
0071 #if CELERITAS_USE_CUDA
0072 # define CELER_LAUNCH_KERNEL_IMPL(KERNEL, GRID, BLOCK, SHARED, STREAM, ...) \
0073 KERNEL<<<GRID, BLOCK, SHARED, STREAM>>>(__VA_ARGS__)
0074 #elif CELERITAS_USE_HIP
0075 # define CELER_LAUNCH_KERNEL_IMPL(KERNEL, GRID, BLOCK, SHARED, STREAM, ...) \
0076 hipLaunchKernelGGL(KERNEL, GRID, BLOCK, SHARED, STREAM, __VA_ARGS__)
0077 #else
0078 # define CELER_LAUNCH_KERNEL_IMPL(KERNEL, GRID, BLOCK, SHARED, STREAM, ...) \
0079 CELER_NOT_CONFIGURED("CUDA or HIP"); \
0080 CELER_DISCARD(GRID) \
0081 CELER_DISCARD(KERNEL) \
0082 CELER_DISCARD(__VA_ARGS__);
0083 #endif
0084
0085 namespace celeritas
0086 {
0087
0088 struct KernelProfiling;
0089
0090
0091
0092
0093
0094
0095
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106
0107
0108
0109
0110
0111 class KernelParamCalculator
0112 {
0113 public:
0114
0115
0116 using dim_type = unsigned int;
0117
0118
0119
0120 struct LaunchParams
0121 {
0122 dim3 blocks_per_grid;
0123 dim3 threads_per_block;
0124 };
0125
0126 public:
0127
0128 inline CELER_FUNCTION static ThreadId thread_id();
0129
0130
0131
0132
0133 template<class F>
0134 inline KernelParamCalculator(std::string_view name, F* kernel_func_ptr);
0135
0136
0137 template<class F>
0138 inline KernelParamCalculator(std::string_view name,
0139 F* kernel_func_ptr,
0140 dim_type threads_per_block);
0141
0142
0143 inline LaunchParams operator()(size_type min_num_threads) const;
0144
0145 private:
0146
0147 dim_type block_size_;
0148
0149 KernelProfiling* profiling_{nullptr};
0150
0151
0152
0153 void register_kernel(std::string_view name, KernelAttributes&& attributes);
0154 void log_launch(size_type min_num_threads) const;
0155 };
0156
0157
0158
0159
0160
0161
0162
0163 CELER_FUNCTION auto KernelParamCalculator::thread_id() -> ThreadId
0164 {
0165 #if CELER_DEVICE_COMPILE
0166 return ThreadId{blockIdx.x * blockDim.x + threadIdx.x};
0167 #else
0168
0169 CELER_ASSERT_UNREACHABLE();
0170 #endif
0171 }
0172
0173
0174
0175
0176
0177 template<class F>
0178 KernelParamCalculator::KernelParamCalculator(std::string_view name,
0179 F* kernel_func_ptr)
0180 {
0181 auto attrs = make_kernel_attributes(kernel_func_ptr);
0182 CELER_ASSERT(attrs.threads_per_block > 0);
0183 block_size_ = attrs.threads_per_block;
0184 this->register_kernel(name, std::move(attrs));
0185 }
0186
0187
0188
0189
0190
0191
0192
0193
0194 template<class F>
0195 KernelParamCalculator::KernelParamCalculator(std::string_view name,
0196 F* kernel_func_ptr,
0197 dim_type threads_per_block)
0198 : block_size_(threads_per_block)
0199 {
0200 CELER_EXPECT(threads_per_block > 0
0201 && threads_per_block % celeritas::device().threads_per_warp()
0202 == 0);
0203
0204 auto attrs = make_kernel_attributes(kernel_func_ptr, threads_per_block);
0205 CELER_VALIDATE(threads_per_block <= attrs.max_threads_per_block,
0206 << "requested GPU threads per block " << threads_per_block
0207 << " exceeds kernel maximum "
0208 << attrs.max_threads_per_block);
0209 this->register_kernel(name, std::move(attrs));
0210 }
0211
0212
0213
0214
0215
0216 auto KernelParamCalculator::operator()(size_type min_num_threads) const
0217 -> LaunchParams
0218 {
0219 CELER_EXPECT(min_num_threads > 0);
0220
0221
0222 if (profiling_)
0223 {
0224 this->log_launch(min_num_threads);
0225 }
0226
0227
0228 dim_type blocks_per_grid
0229 = celeritas::ceil_div<dim_type>(min_num_threads, this->block_size_);
0230 CELER_ASSERT(blocks_per_grid
0231 < dim_type(celeritas::device().max_blocks_per_grid()));
0232
0233 LaunchParams result;
0234 result.blocks_per_grid.x = blocks_per_grid;
0235 result.threads_per_block.x = this->block_size_;
0236 CELER_ENSURE(result.blocks_per_grid.x * result.threads_per_block.x
0237 >= min_num_threads);
0238 return result;
0239 }
0240
0241
0242 }