Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:54:49

0001 //----------------------------------*-C++-*----------------------------------//
0002 // Copyright 2022-2024 UT-Battelle, LLC, and other Celeritas developers.
0003 // See the top-level COPYRIGHT file for details.
0004 // SPDX-License-Identifier: (Apache-2.0 OR MIT)
0005 //---------------------------------------------------------------------------//
0006 //! \file corecel/sys/KernelAttributes.hh
0007 //---------------------------------------------------------------------------//
0008 #pragma once
0009 
0010 #include <cstddef>
0011 #include <type_traits>
0012 
0013 #include "corecel/Config.hh"
0014 
0015 #include "corecel/Assert.hh"
0016 #include "corecel/Macros.hh"
0017 
0018 #include "Device.hh"
0019 
0020 #if CELER_DEVICE_SOURCE
0021 #    include "corecel/DeviceRuntimeApi.hh"
0022 #endif
0023 
0024 namespace celeritas
0025 {
0026 //---------------------------------------------------------------------------//
0027 /*!
0028  * Immutable attributes of a kernel function.
0029  *
0030  * This is an analog to \c cudaFuncAttributes with some additional helpful
0031  * information. Some quantities are device-specific.
0032  */
0033 struct KernelAttributes
0034 {
0035     unsigned int threads_per_block{0};
0036 
0037     int num_regs{0};  //!< Number of 32-bit registers per thread
0038     std::size_t const_mem{0};  //!< Amount of constant memory (per thread) [b]
0039     std::size_t local_mem{0};  //!< Amount of local memory (per thread) [b]
0040 
0041     unsigned int max_threads_per_block{0};  //!< Max allowed threads per block
0042     unsigned int max_blocks_per_cu{0};  //!< Occupancy (compute unit)
0043 
0044     // Derivative but useful occupancy information
0045     unsigned int max_warps_per_eu{0};  //!< Occupancy (execution unit)
0046     double occupancy{0};  //!< Fractional occupancy (CU)
0047 
0048     // Resource limits at first call
0049     std::size_t stack_size{0};  //!< CUDA Dynamic per-thread stack limit [b]
0050     std::size_t heap_size{0};  //!< Dynamic malloc heap size [b]
0051     std::size_t print_buffer_size{0};  //!< FIFO buffer size for printf [b]
0052 };
0053 
0054 //---------------------------------------------------------------------------//
0055 /*!
0056  * Build kernel attributes from a __global__ kernel function.
0057  *
0058  * This can only be called from CUDA/HIP code. It assumes that the block size
0059  * is constant across the execution of the program and that the kernel is only
0060  * called by the device that's active at this time.
0061  *
0062  * The special value of zero threads per block causes the kernel attributes to
0063  * default to the *compile-time maximum* number of threads per block as
0064  * specified by launch bounds.
0065  */
0066 template<class F>
0067 KernelAttributes
0068 make_kernel_attributes(F* func, unsigned int threads_per_block = 0)
0069 {
0070     KernelAttributes result;
0071 #ifdef CELER_DEVICE_SOURCE
0072     // Get function attributes
0073     {
0074         CELER_DEVICE_PREFIX(FuncAttributes) attr;
0075         CELER_DEVICE_CALL_PREFIX(
0076             FuncGetAttributes(&attr, reinterpret_cast<void const*>(func)));
0077         result.num_regs = attr.numRegs;
0078         result.const_mem = attr.constSizeBytes;
0079         result.local_mem = attr.localSizeBytes;
0080         result.max_threads_per_block = attr.maxThreadsPerBlock;
0081     }
0082 
0083     if (threads_per_block == 0)
0084     {
0085         // Use the maximum number of threads instead of having smaller blocks
0086         threads_per_block = result.max_threads_per_block;
0087     }
0088 
0089     // Get maximum number of active blocks per SM
0090     std::size_t dynamic_smem_size = 0;
0091     int num_blocks = 0;
0092     CELER_DEVICE_CALL_PREFIX(OccupancyMaxActiveBlocksPerMultiprocessor(
0093         &num_blocks, func, threads_per_block, dynamic_smem_size));
0094     result.max_blocks_per_cu = num_blocks;
0095 
0096     // Calculate occupancy statistics used for launch bounds
0097     // (threads / block) * (blocks / cu) * (cu / eu) * (warp / thread)
0098     Device const& d = celeritas::device();
0099 
0100     result.max_warps_per_eu = (threads_per_block * num_blocks)
0101                               / (d.eu_per_cu() * d.threads_per_warp());
0102     result.occupancy = static_cast<double>(num_blocks * threads_per_block)
0103                        / static_cast<double>(d.max_threads_per_cu());
0104 
0105     // Get size limits
0106     if constexpr (CELERITAS_USE_CUDA)
0107     {
0108         // Stack size limit is CUDA-only
0109         CELER_CUDA_CALL(
0110             cudaDeviceGetLimit(&result.stack_size, cudaLimitStackSize));
0111         // HIP throws 'limit is not supported on this architecture'
0112         CELER_CUDA_CALL(cudaDeviceGetLimit(&result.print_buffer_size,
0113                                            cudaLimitPrintfFifoSize));
0114     }
0115     CELER_DEVICE_CALL_PREFIX(DeviceGetLimit(
0116         &result.heap_size, CELER_DEVICE_PREFIX(LimitMallocHeapSize)));
0117 #else
0118     CELER_DISCARD(func);
0119     CELER_ASSERT_UNREACHABLE();
0120 #endif
0121     result.threads_per_block = threads_per_block;
0122     return result;
0123 }
0124 
0125 //---------------------------------------------------------------------------//
0126 }  // namespace celeritas