corecel/sys/KernelAttributes.hh

0001 //------------------------------- -*- C++ -*- -------------------------------//
0002 // Copyright Celeritas contributors: see top-level COPYRIGHT file for details
0003 // SPDX-License-Identifier: (Apache-2.0 OR MIT)
0004 //---------------------------------------------------------------------------//
0005 //! \file corecel/sys/KernelAttributes.hh
0006 //---------------------------------------------------------------------------//
0007 #pragma once
0008
0009 #include <cstddef>
0010 #include <type_traits>
0011
0012 #include "corecel/Config.hh"
0013
0014 #include "corecel/Assert.hh"
0015 #include "corecel/Macros.hh"
0016
0017 #include "Device.hh"
0018
0019 #if CELER_DEVICE_SOURCE
0020 #    include "corecel/DeviceRuntimeApi.hh"
0021 #endif
0022
0023 namespace celeritas
0024 {
0025 //---------------------------------------------------------------------------//
0026 /*!
0027  * Immutable attributes of a kernel function.
0028  *
0029  * This is an analog to \c cudaFuncAttributes with some additional helpful
0030  * information. Some quantities are device-specific.
0031  */
0032 struct KernelAttributes
0033 {
0034     unsigned int threads_per_block{0};
0035
0036     int num_regs{0};  //!< Number of 32-bit registers per thread
0037     std::size_t const_mem{0};  //!< Amount of constant memory (per thread) [b]
0038     std::size_t local_mem{0};  //!< Amount of local memory (per thread) [b]
0039
0040     unsigned int max_threads_per_block{0};  //!< Max allowed threads per block
0041     unsigned int max_blocks_per_cu{0};  //!< Occupancy (compute unit)
0042
0043     // Derivative but useful occupancy information
0044     unsigned int max_warps_per_eu{0};  //!< Occupancy (execution unit)
0045     double occupancy{0};  //!< Fractional occupancy (CU)
0046
0047     // Resource limits at first call
0048     std::size_t stack_size{0};  //!< CUDA Dynamic per-thread stack limit [b]
0049     std::size_t heap_size{0};  //!< Dynamic malloc heap size [b]
0050     std::size_t print_buffer_size{0};  //!< FIFO buffer size for printf [b]
0051 };
0052
0053 //---------------------------------------------------------------------------//
0054 /*!
0055  * Build kernel attributes from a __global__ kernel function.
0056  *
0057  * This can only be called from CUDA/HIP code. It assumes that the block size
0058  * is constant across the execution of the program and that the kernel is only
0059  * called by the device that's active at this time.
0060  *
0061  * The special value of zero threads per block causes the kernel attributes to
0062  * default to the *compile-time maximum* number of threads per block as
0063  * specified by launch bounds.
0064  */
0065 template<class F>
0066 KernelAttributes
0067 make_kernel_attributes(F* func, unsigned int threads_per_block = 0)
0068 {
0069     KernelAttributes result;
0070 #ifdef CELER_DEVICE_SOURCE
0071     // Get function attributes
0072     {
0073         CELER_DEVICE_API_SYMBOL(FuncAttributes) attr;
0074         CELER_DEVICE_API_CALL(
0075             FuncGetAttributes(&attr, reinterpret_cast<void const*>(func)));
0076         result.num_regs = attr.numRegs;
0077         result.const_mem = attr.constSizeBytes;
0078         result.local_mem = attr.localSizeBytes;
0079         result.max_threads_per_block = attr.maxThreadsPerBlock;
0080     }
0081
0082     if (threads_per_block == 0)
0083     {
0084         // Use the maximum number of threads instead of having smaller blocks
0085         threads_per_block = result.max_threads_per_block;
0086     }
0087
0088     // Get maximum number of active blocks per SM
0089     std::size_t dynamic_smem_size = 0;
0090     int num_blocks = 0;
0091     CELER_DEVICE_API_CALL(OccupancyMaxActiveBlocksPerMultiprocessor(
0092         &num_blocks, func, threads_per_block, dynamic_smem_size));
0093     result.max_blocks_per_cu = num_blocks;
0094
0095     // Calculate occupancy statistics used for launch bounds
0096     // (threads / block) * (blocks / cu) * (cu / eu) * (warp / thread)
0097     Device const& d = celeritas::device();
0098
0099     result.max_warps_per_eu = (threads_per_block * num_blocks)
0100                               / (d.eu_per_cu() * d.threads_per_warp());
0101     result.occupancy = static_cast<double>(num_blocks * threads_per_block)
0102                        / static_cast<double>(d.max_threads_per_cu());
0103
0104     // Get size limits
0105     if constexpr (CELERITAS_USE_CUDA)
0106     {
0107         // Stack size limit is CUDA-only
0108         CELER_DEVICE_API_CALL(DeviceGetLimit(
0109             &result.stack_size, CELER_DEVICE_API_SYMBOL(LimitStackSize)));
0110         // HIP throws 'limit is not supported on this architecture'
0111         CELER_DEVICE_API_CALL(
0112             DeviceGetLimit(&result.print_buffer_size,
0113                            CELER_DEVICE_API_SYMBOL(LimitPrintfFifoSize)));
0114     }
0115     CELER_DEVICE_API_CALL(DeviceGetLimit(
0116         &result.heap_size, CELER_DEVICE_API_SYMBOL(LimitMallocHeapSize)));
0117 #else
0118     CELER_DISCARD(func);
0119     CELER_ASSERT_UNREACHABLE();
0120 #endif
0121     result.threads_per_block = threads_per_block;
0122     return result;
0123 }
0124
0125 //---------------------------------------------------------------------------//
0126 }  // namespace celeritas