File indexing completed on 2025-09-16 08:52:44
0001
0002
0003
0004
0005
0006
0007 #pragma once
0008
0009 #include <cstddef>
0010 #include <type_traits>
0011
0012 #include "corecel/Config.hh"
0013
0014 #include "corecel/Assert.hh"
0015 #include "corecel/Macros.hh"
0016
0017 #include "Device.hh"
0018
0019 #if CELER_DEVICE_SOURCE
0020 # include "corecel/DeviceRuntimeApi.hh"
0021 #endif
0022
0023 namespace celeritas
0024 {
0025
0026
0027
0028
0029
0030
0031
0032 struct KernelAttributes
0033 {
0034 unsigned int threads_per_block{0};
0035
0036 int num_regs{0};
0037 std::size_t const_mem{0};
0038 std::size_t local_mem{0};
0039
0040 unsigned int max_threads_per_block{0};
0041 unsigned int max_blocks_per_cu{0};
0042
0043
0044 unsigned int max_warps_per_eu{0};
0045 double occupancy{0};
0046
0047
0048 std::size_t stack_size{0};
0049 std::size_t heap_size{0};
0050 std::size_t print_buffer_size{0};
0051 };
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065 template<class F>
0066 KernelAttributes
0067 make_kernel_attributes(F* func, unsigned int threads_per_block = 0)
0068 {
0069 KernelAttributes result;
0070 #ifdef CELER_DEVICE_SOURCE
0071
0072 {
0073 CELER_DEVICE_API_SYMBOL(FuncAttributes) attr;
0074 CELER_DEVICE_API_CALL(
0075 FuncGetAttributes(&attr, reinterpret_cast<void const*>(func)));
0076 result.num_regs = attr.numRegs;
0077 result.const_mem = attr.constSizeBytes;
0078 result.local_mem = attr.localSizeBytes;
0079 result.max_threads_per_block = attr.maxThreadsPerBlock;
0080 }
0081
0082 if (threads_per_block == 0)
0083 {
0084
0085 threads_per_block = result.max_threads_per_block;
0086 }
0087
0088
0089 std::size_t dynamic_smem_size = 0;
0090 int num_blocks = 0;
0091 CELER_DEVICE_API_CALL(OccupancyMaxActiveBlocksPerMultiprocessor(
0092 &num_blocks, func, threads_per_block, dynamic_smem_size));
0093 result.max_blocks_per_cu = num_blocks;
0094
0095
0096
0097 Device const& d = celeritas::device();
0098
0099 result.max_warps_per_eu = (threads_per_block * num_blocks)
0100 / (d.eu_per_cu() * d.threads_per_warp());
0101 result.occupancy = static_cast<double>(num_blocks * threads_per_block)
0102 / static_cast<double>(d.max_threads_per_cu());
0103
0104
0105 if constexpr (CELERITAS_USE_CUDA)
0106 {
0107
0108 CELER_DEVICE_API_CALL(DeviceGetLimit(
0109 &result.stack_size, CELER_DEVICE_API_SYMBOL(LimitStackSize)));
0110
0111 CELER_DEVICE_API_CALL(
0112 DeviceGetLimit(&result.print_buffer_size,
0113 CELER_DEVICE_API_SYMBOL(LimitPrintfFifoSize)));
0114 }
0115 CELER_DEVICE_API_CALL(DeviceGetLimit(
0116 &result.heap_size, CELER_DEVICE_API_SYMBOL(LimitMallocHeapSize)));
0117 #else
0118 CELER_DISCARD(func);
0119 CELER_ASSERT_UNREACHABLE();
0120 #endif
0121 result.threads_per_block = threads_per_block;
0122 return result;
0123 }
0124
0125
0126 }