Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:54:49

0001 //---------------------------------*-C++-*-----------------------------------//
0002 // Copyright 2020-2024 UT-Battelle, LLC, and other Celeritas developers.
0003 // See the top-level COPYRIGHT file for details.
0004 // SPDX-License-Identifier: (Apache-2.0 OR MIT)
0005 //---------------------------------------------------------------------------//
0006 //! \file corecel/sys/KernelParamCalculator.device.hh
0007 //---------------------------------------------------------------------------//
0008 #pragma once
0009 
0010 #include <cstddef>
0011 #include <string_view>
0012 
0013 #include "corecel/DeviceRuntimeApi.hh"
0014 
0015 #include "corecel/Assert.hh"
0016 #include "corecel/Macros.hh"
0017 #include "corecel/OpaqueId.hh"
0018 #include "corecel/Types.hh"
0019 #include "corecel/math/Algorithms.hh"
0020 
0021 #include "Device.hh"
0022 #include "KernelAttributes.hh"
0023 #include "ThreadId.hh"  // IWYU pragma: export
0024 
0025 //---------------------------------------------------------------------------//
0026 /*!
0027  * \def CELER_LAUNCH_KERNEL
0028  *
0029  * Create a kernel param calculator with the given kernel, assuming the
0030  * function itself has a \c _kernel suffix, and launch with the given
0031  * block/thread sizes and arguments list.
0032  */
0033 #define CELER_LAUNCH_KERNEL(NAME, THREADS, STREAM, ...)                      \
0034     do                                                                       \
0035     {                                                                        \
0036         static const ::celeritas::KernelParamCalculator calc_launch_params_( \
0037             #NAME, NAME##_kernel);                                           \
0038         auto grid_ = calc_launch_params_(THREADS);                           \
0039                                                                              \
0040         CELER_LAUNCH_KERNEL_IMPL(NAME##_kernel,                              \
0041                                  grid_.blocks_per_grid,                      \
0042                                  grid_.threads_per_block,                    \
0043                                  0,                                          \
0044                                  STREAM,                                     \
0045                                  __VA_ARGS__);                               \
0046         CELER_DEVICE_CHECK_ERROR();                                          \
0047     } while (0)
0048 
0049 /*!
0050  * \def CELER_LAUNCH_KERNEL_TEMPLATE_1
0051  *
0052  * Create a kernel param calculator with the given kernel with
0053  * one template parameter, assuming the unction itself has a \c _kernel
0054  * suffix, and launch with the given block/thread sizes and arguments list.
0055  */
0056 #define CELER_LAUNCH_KERNEL_TEMPLATE_1(NAME, T1, THREADS, STREAM, ...)       \
0057     do                                                                       \
0058     {                                                                        \
0059         static const ::celeritas::KernelParamCalculator calc_launch_params_( \
0060             #NAME, NAME##_kernel<T1>);                                       \
0061         auto grid_ = calc_launch_params_(THREADS);                           \
0062                                                                              \
0063         CELER_LAUNCH_KERNEL_IMPL(NAME##_kernel<T1>,                          \
0064                                  grid_.blocks_per_grid,                      \
0065                                  grid_.threads_per_block,                    \
0066                                  0,                                          \
0067                                  STREAM,                                     \
0068                                  __VA_ARGS__);                               \
0069         CELER_DEVICE_CHECK_ERROR();                                          \
0070     } while (0)
0071 
0072 #if CELERITAS_USE_CUDA
0073 #    define CELER_LAUNCH_KERNEL_IMPL(KERNEL, GRID, BLOCK, SHARED, STREAM, ...) \
0074         KERNEL<<<GRID, BLOCK, SHARED, STREAM>>>(__VA_ARGS__)
0075 #elif CELERITAS_USE_HIP
0076 #    define CELER_LAUNCH_KERNEL_IMPL(KERNEL, GRID, BLOCK, SHARED, STREAM, ...) \
0077         hipLaunchKernelGGL(KERNEL, GRID, BLOCK, SHARED, STREAM, __VA_ARGS__)
0078 #else
0079 #    define CELER_LAUNCH_KERNEL_IMPL(KERNEL, GRID, BLOCK, SHARED, STREAM, ...) \
0080         CELER_NOT_CONFIGURED("CUDA or HIP");                                   \
0081         CELER_DISCARD(GRID)                                                    \
0082         CELER_DISCARD(KERNEL)                                                  \
0083         CELER_DISCARD(__VA_ARGS__);
0084 #endif
0085 
0086 namespace celeritas
0087 {
0088 //---------------------------------------------------------------------------//
0089 struct KernelProfiling;
0090 
0091 //---------------------------------------------------------------------------//
0092 /*!
0093  * Kernel management helper functions.
0094  *
0095  * We assume that all our kernel launches use 1-D thread indexing to make
0096  * things easy. The \c dim_type alias should be the same size as the type of a
0097  * single \c dim3 member (x/y/z).
0098  *
0099  * Constructing the param calculator registers kernel attributes with \c
0100  * kernel_registry as an implementation detail in the .cc file that hides
0101  * inclusion of that interface from CUDA code. If kernel diagnostic profiling
0102  * is enabled, the registry will return a pointer that this class uses to
0103  * increment thread launch counters over the lifetime of the program.
0104  *
0105  * \code
0106     static KernelParamCalculator calc_params("my", &my_kernel);
0107     auto params = calc_params(states.size());
0108     my_kernel<<<params.blocks_per_grid,
0109  params.threads_per_block>>>(kernel_args...);
0110  * \endcode
0111  */
0112 class KernelParamCalculator
0113 {
0114   public:
0115     //!@{
0116     //! \name Type aliases
0117     using dim_type = unsigned int;
0118     //!@}
0119 
0120     //! Parameters needed for a CUDA lauch call
0121     struct LaunchParams
0122     {
0123         dim3 blocks_per_grid;  //!< Number of blocks for kernel grid
0124         dim3 threads_per_block;  //!< Number of threads per block
0125     };
0126 
0127   public:
0128     // Get the thread ID for a kernel initialized with this class
0129     inline CELER_FUNCTION static ThreadId thread_id();
0130 
0131     //// CLASS INTERFACE ////
0132 
0133     // Construct with the default block size
0134     template<class F>
0135     inline KernelParamCalculator(std::string_view name, F* kernel_func_ptr);
0136 
0137     // Construct with an explicit number of threads per block
0138     template<class F>
0139     inline KernelParamCalculator(std::string_view name,
0140                                  F* kernel_func_ptr,
0141                                  dim_type threads_per_block);
0142 
0143     // Get launch parameters
0144     inline LaunchParams operator()(size_type min_num_threads) const;
0145 
0146   private:
0147     //! Threads per block
0148     dim_type block_size_;
0149     //! Optional profiling data owned by the kernel registry
0150     KernelProfiling* profiling_{nullptr};
0151 
0152     //// HELPER FUNCTIONS ////
0153 
0154     void register_kernel(std::string_view name, KernelAttributes&& attributes);
0155     void log_launch(size_type min_num_threads) const;
0156 };
0157 
0158 //---------------------------------------------------------------------------//
0159 // INLINE DEFINITIONS
0160 //---------------------------------------------------------------------------//
0161 /*!
0162  * Get the linear thread ID.
0163  */
0164 CELER_FUNCTION auto KernelParamCalculator::thread_id() -> ThreadId
0165 {
0166 #if CELER_DEVICE_COMPILE
0167     return ThreadId{blockIdx.x * blockDim.x + threadIdx.x};
0168 #else
0169     // blockIdx/threadIdx not available: shouldn't be called by host code
0170     CELER_ASSERT_UNREACHABLE();
0171 #endif
0172 }
0173 
0174 //---------------------------------------------------------------------------//
0175 /*!
0176  * Construct with the maximum threads per block for a given kernel.
0177  */
0178 template<class F>
0179 KernelParamCalculator::KernelParamCalculator(std::string_view name,
0180                                              F* kernel_func_ptr)
0181 {
0182     auto attrs = make_kernel_attributes(kernel_func_ptr);
0183     CELER_ASSERT(attrs.threads_per_block > 0);
0184     block_size_ = attrs.threads_per_block;
0185     this->register_kernel(name, std::move(attrs));
0186 }
0187 
0188 //---------------------------------------------------------------------------//
0189 /*!
0190  * Construct for the given global kernel F.
0191  *
0192  * This registers the kernel with \c celeritas::kernel_registry() and saves a
0193  * pointer to the profiling data if profiling is to be used.
0194  */
0195 template<class F>
0196 KernelParamCalculator::KernelParamCalculator(std::string_view name,
0197                                              F* kernel_func_ptr,
0198                                              dim_type threads_per_block)
0199     : block_size_(threads_per_block)
0200 {
0201     CELER_EXPECT(threads_per_block > 0
0202                  && threads_per_block % celeritas::device().threads_per_warp()
0203                         == 0);
0204 
0205     auto attrs = make_kernel_attributes(kernel_func_ptr, threads_per_block);
0206     CELER_VALIDATE(threads_per_block <= attrs.max_threads_per_block,
0207                    << "requested GPU threads per block " << threads_per_block
0208                    << " exceeds kernel maximum "
0209                    << attrs.max_threads_per_block);
0210     this->register_kernel(name, std::move(attrs));
0211 }
0212 
0213 //---------------------------------------------------------------------------//
0214 /*!
0215  * Calculate launch params given the number of threads.
0216  */
0217 auto KernelParamCalculator::operator()(size_type min_num_threads) const
0218     -> LaunchParams
0219 {
0220     CELER_EXPECT(min_num_threads > 0);
0221 
0222     // Update diagnostics for the kernel
0223     if (profiling_)
0224     {
0225         this->log_launch(min_num_threads);
0226     }
0227 
0228     // Ceiling integer division
0229     dim_type blocks_per_grid
0230         = celeritas::ceil_div<dim_type>(min_num_threads, this->block_size_);
0231     CELER_ASSERT(blocks_per_grid
0232                  < dim_type(celeritas::device().max_blocks_per_grid()));
0233 
0234     LaunchParams result;
0235     result.blocks_per_grid.x = blocks_per_grid;
0236     result.threads_per_block.x = this->block_size_;
0237     CELER_ENSURE(result.blocks_per_grid.x * result.threads_per_block.x
0238                  >= min_num_threads);
0239     return result;
0240 }
0241 
0242 //---------------------------------------------------------------------------//
0243 }  // namespace celeritas