corecel/sys/KernelParamCalculator.device.hh

0001 //------------------------------ -*- C++ -*- -------------------------------//
0002 // Copyright Celeritas contributors: see top-level COPYRIGHT file for details
0003 // SPDX-License-Identifier: (Apache-2.0 OR MIT)
0004 //---------------------------------------------------------------------------//
0005 //! \file corecel/sys/KernelParamCalculator.device.hh
0006 //---------------------------------------------------------------------------//
0007 #pragma once
0008
0009 #include <cstddef>
0010 #include <string_view>
0011
0012 #include "corecel/DeviceRuntimeApi.hh"
0013
0014 #include "corecel/Assert.hh"
0015 #include "corecel/Macros.hh"
0016 #include "corecel/OpaqueId.hh"
0017 #include "corecel/Types.hh"
0018 #include "corecel/math/Algorithms.hh"
0019
0020 #include "Device.hh"
0021 #include "KernelAttributes.hh"
0022 #include "ThreadId.hh"  // IWYU pragma: export
0023
0024 //---------------------------------------------------------------------------//
0025 /*!
0026  * \def CELER_LAUNCH_KERNEL
0027  *
0028  * Create a kernel param calculator with the given kernel, assuming the
0029  * function itself has a \c _kernel suffix, and launch with the given
0030  * block/thread sizes and arguments list.
0031  */
0032 #define CELER_LAUNCH_KERNEL(NAME, THREADS, STREAM, ...)                      \
0033     do                                                                       \
0034     {                                                                        \
0035         static const ::celeritas::KernelParamCalculator calc_launch_params_( \
0036             #NAME, NAME##_kernel);                                           \
0037         auto grid_ = calc_launch_params_(THREADS);                           \
0038                                                                              \
0039         CELER_LAUNCH_KERNEL_IMPL(NAME##_kernel,                              \
0040                                  grid_.blocks_per_grid,                      \
0041                                  grid_.threads_per_block,                    \
0042                                  0,                                          \
0043                                  STREAM,                                     \
0044                                  __VA_ARGS__);                               \
0045         CELER_DEVICE_API_CALL(PeekAtLastError());                            \
0046     } while (0)
0047
0048 /*!
0049  * \def CELER_LAUNCH_KERNEL_TEMPLATE_1
0050  *
0051  * Create a kernel param calculator with the given kernel with
0052  * one template parameter, assuming the unction itself has a \c _kernel
0053  * suffix, and launch with the given block/thread sizes and arguments list.
0054  */
0055 #define CELER_LAUNCH_KERNEL_TEMPLATE_1(NAME, T1, THREADS, STREAM, ...)       \
0056     do                                                                       \
0057     {                                                                        \
0058         static const ::celeritas::KernelParamCalculator calc_launch_params_( \
0059             #NAME, NAME##_kernel<T1>);                                       \
0060         auto grid_ = calc_launch_params_(THREADS);                           \
0061                                                                              \
0062         CELER_LAUNCH_KERNEL_IMPL(NAME##_kernel<T1>,                          \
0063                                  grid_.blocks_per_grid,                      \
0064                                  grid_.threads_per_block,                    \
0065                                  0,                                          \
0066                                  STREAM,                                     \
0067                                  __VA_ARGS__);                               \
0068         CELER_DEVICE_API_CALL(PeekAtLastError());                            \
0069     } while (0)
0070
0071 #if CELERITAS_USE_CUDA
0072 #    define CELER_LAUNCH_KERNEL_IMPL(KERNEL, GRID, BLOCK, SHARED, STREAM, ...) \
0073         KERNEL<<<GRID, BLOCK, SHARED, STREAM>>>(__VA_ARGS__)
0074 #elif CELERITAS_USE_HIP
0075 #    define CELER_LAUNCH_KERNEL_IMPL(KERNEL, GRID, BLOCK, SHARED, STREAM, ...) \
0076         hipLaunchKernelGGL(KERNEL, GRID, BLOCK, SHARED, STREAM, __VA_ARGS__)
0077 #else
0078 #    define CELER_LAUNCH_KERNEL_IMPL(KERNEL, GRID, BLOCK, SHARED, STREAM, ...) \
0079         CELER_NOT_CONFIGURED("CUDA or HIP");                                   \
0080         CELER_DISCARD(GRID)                                                    \
0081         CELER_DISCARD(KERNEL)                                                  \
0082         CELER_DISCARD(__VA_ARGS__);
0083 #endif
0084
0085 namespace celeritas
0086 {
0087 //---------------------------------------------------------------------------//
0088 struct KernelProfiling;
0089
0090 //---------------------------------------------------------------------------//
0091 /*!
0092  * Kernel management helper functions.
0093  *
0094  * We assume that all our kernel launches use 1-D thread indexing to make
0095  * things easy. The \c dim_type alias should be the same size as the type of a
0096  * single \c dim3 member (x/y/z).
0097  *
0098  * Constructing the param calculator registers kernel attributes with \c
0099  * kernel_registry as an implementation detail in the .cc file that hides
0100  * inclusion of that interface from CUDA code. If kernel diagnostic profiling
0101  * is enabled, the registry will return a pointer that this class uses to
0102  * increment thread launch counters over the lifetime of the program.
0103  *
0104  * \code
0105     static KernelParamCalculator calc_params("my", &my_kernel);
0106     auto params = calc_params(states.size());
0107     my_kernel<<<params.blocks_per_grid,
0108  params.threads_per_block>>>(kernel_args...);
0109  * \endcode
0110  */
0111 class KernelParamCalculator
0112 {
0113   public:
0114     //!@{
0115     //! \name Type aliases
0116     using dim_type = unsigned int;
0117     //!@}
0118
0119     //! Parameters needed for a CUDA launch call
0120     struct LaunchParams
0121     {
0122         dim3 blocks_per_grid;  //!< Number of blocks for kernel grid
0123         dim3 threads_per_block;  //!< Number of threads per block
0124     };
0125
0126   public:
0127     // Get the thread ID for a kernel initialized with this class
0128     inline CELER_FUNCTION static ThreadId thread_id();
0129
0130     //// CLASS INTERFACE ////
0131
0132     // Construct with the default block size
0133     template<class F>
0134     inline KernelParamCalculator(std::string_view name, F* kernel_func_ptr);
0135
0136     // Construct with an explicit number of threads per block
0137     template<class F>
0138     inline KernelParamCalculator(std::string_view name,
0139                                  F* kernel_func_ptr,
0140                                  dim_type threads_per_block);
0141
0142     // Get launch parameters
0143     inline LaunchParams operator()(size_type min_num_threads) const;
0144
0145   private:
0146     //! Threads per block
0147     dim_type block_size_;
0148     //! Optional profiling data owned by the kernel registry
0149     KernelProfiling* profiling_{nullptr};
0150
0151     //// HELPER FUNCTIONS ////
0152
0153     void register_kernel(std::string_view name, KernelAttributes&& attributes);
0154     void log_launch(size_type min_num_threads) const;
0155 };
0156
0157 //---------------------------------------------------------------------------//
0158 // INLINE DEFINITIONS
0159 //---------------------------------------------------------------------------//
0160 /*!
0161  * Get the linear thread ID.
0162  */
0163 CELER_FUNCTION auto KernelParamCalculator::thread_id() -> ThreadId
0164 {
0165 #if CELER_DEVICE_COMPILE
0166     return ThreadId{blockIdx.x * blockDim.x + threadIdx.x};
0167 #else
0168     // blockIdx/threadIdx not available: shouldn't be called by host code
0169     CELER_ASSERT_UNREACHABLE();
0170 #endif
0171 }
0172
0173 //---------------------------------------------------------------------------//
0174 /*!
0175  * Construct with the maximum threads per block for a given kernel.
0176  */
0177 template<class F>
0178 KernelParamCalculator::KernelParamCalculator(std::string_view name,
0179                                              F* kernel_func_ptr)
0180 {
0181     auto attrs = make_kernel_attributes(kernel_func_ptr);
0182     CELER_ASSERT(attrs.threads_per_block > 0);
0183     block_size_ = attrs.threads_per_block;
0184     this->register_kernel(name, std::move(attrs));
0185 }
0186
0187 //---------------------------------------------------------------------------//
0188 /*!
0189  * Construct for the given global kernel F.
0190  *
0191  * This registers the kernel with \c celeritas::kernel_registry() and saves a
0192  * pointer to the profiling data if profiling is to be used.
0193  */
0194 template<class F>
0195 KernelParamCalculator::KernelParamCalculator(std::string_view name,
0196                                              F* kernel_func_ptr,
0197                                              dim_type threads_per_block)
0198     : block_size_(threads_per_block)
0199 {
0200     CELER_EXPECT(threads_per_block > 0
0201                  && threads_per_block % celeritas::device().threads_per_warp()
0202                         == 0);
0203
0204     auto attrs = make_kernel_attributes(kernel_func_ptr, threads_per_block);
0205     CELER_VALIDATE(threads_per_block <= attrs.max_threads_per_block,
0206                    << "requested GPU threads per block " << threads_per_block
0207                    << " exceeds kernel maximum "
0208                    << attrs.max_threads_per_block);
0209     this->register_kernel(name, std::move(attrs));
0210 }
0211
0212 //---------------------------------------------------------------------------//
0213 /*!
0214  * Calculate launch params given the number of threads.
0215  */
0216 auto KernelParamCalculator::operator()(size_type min_num_threads) const
0217     -> LaunchParams
0218 {
0219     CELER_EXPECT(min_num_threads > 0);
0220
0221     // Update diagnostics for the kernel
0222     if (profiling_)
0223     {
0224         this->log_launch(min_num_threads);
0225     }
0226
0227     // Ceiling integer division
0228     dim_type blocks_per_grid
0229         = celeritas::ceil_div<dim_type>(min_num_threads, this->block_size_);
0230     CELER_ASSERT(blocks_per_grid
0231                  < dim_type(celeritas::device().max_blocks_per_grid()));
0232
0233     LaunchParams result;
0234     result.blocks_per_grid.x = blocks_per_grid;
0235     result.threads_per_block.x = this->block_size_;
0236     CELER_ENSURE(result.blocks_per_grid.x * result.threads_per_block.x
0237                  >= min_num_threads);
0238     return result;
0239 }
0240
0241 //---------------------------------------------------------------------------//
0242 }  // namespace celeritas