File indexing completed on 2025-01-18 09:54:49
0001
0002
0003
0004
0005
0006
0007
0008 #pragma once
0009
0010 #include <cstddef>
0011 #include <type_traits>
0012
0013 #include "corecel/Config.hh"
0014
0015 #include "corecel/Assert.hh"
0016 #include "corecel/Macros.hh"
0017
0018 #include "Device.hh"
0019
0020 #if CELER_DEVICE_SOURCE
0021 # include "corecel/DeviceRuntimeApi.hh"
0022 #endif
0023
0024 namespace celeritas
0025 {
0026
0027
0028
0029
0030
0031
0032
/*!
 * Per-kernel attributes, occupancy figures, and device limits.
 *
 * All members start at zero. The \c make_kernel_attributes function in this
 * file fills them in from the device runtime.
 */
struct KernelAttributes
{
    //! Block size that the occupancy values below were computed for
    unsigned int threads_per_block = 0;

    //! Value of the runtime function attribute \c numRegs
    int num_regs = 0;
    //! Value of the runtime function attribute \c constSizeBytes
    std::size_t const_mem = 0;
    //! Value of the runtime function attribute \c localSizeBytes
    std::size_t local_mem = 0;

    //! Value of the runtime function attribute \c maxThreadsPerBlock
    unsigned int max_threads_per_block = 0;
    //! Active blocks per multiprocessor reported by the occupancy query
    unsigned int max_blocks_per_cu = 0;

    // Quantities derived from the occupancy query and the device properties

    //! (threads per block * blocks per CU) / (EU per CU * threads per warp)
    unsigned int max_warps_per_eu = 0;
    //! (blocks per CU * threads per block) / (device max threads per CU)
    double occupancy = 0;

    // Device limits read when the attributes are built

    //! \c cudaLimitStackSize device limit (queried only in CUDA builds)
    std::size_t stack_size = 0;
    //! \c LimitMallocHeapSize device limit
    std::size_t heap_size = 0;
    //! \c cudaLimitPrintfFifoSize device limit (queried only in CUDA builds)
    std::size_t print_buffer_size = 0;
};
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066 template<class F>
0067 KernelAttributes
0068 make_kernel_attributes(F* func, unsigned int threads_per_block = 0)
0069 {
0070 KernelAttributes result;
0071 #ifdef CELER_DEVICE_SOURCE
0072
0073 {
0074 CELER_DEVICE_PREFIX(FuncAttributes) attr;
0075 CELER_DEVICE_CALL_PREFIX(
0076 FuncGetAttributes(&attr, reinterpret_cast<void const*>(func)));
0077 result.num_regs = attr.numRegs;
0078 result.const_mem = attr.constSizeBytes;
0079 result.local_mem = attr.localSizeBytes;
0080 result.max_threads_per_block = attr.maxThreadsPerBlock;
0081 }
0082
0083 if (threads_per_block == 0)
0084 {
0085
0086 threads_per_block = result.max_threads_per_block;
0087 }
0088
0089
0090 std::size_t dynamic_smem_size = 0;
0091 int num_blocks = 0;
0092 CELER_DEVICE_CALL_PREFIX(OccupancyMaxActiveBlocksPerMultiprocessor(
0093 &num_blocks, func, threads_per_block, dynamic_smem_size));
0094 result.max_blocks_per_cu = num_blocks;
0095
0096
0097
0098 Device const& d = celeritas::device();
0099
0100 result.max_warps_per_eu = (threads_per_block * num_blocks)
0101 / (d.eu_per_cu() * d.threads_per_warp());
0102 result.occupancy = static_cast<double>(num_blocks * threads_per_block)
0103 / static_cast<double>(d.max_threads_per_cu());
0104
0105
0106 if constexpr (CELERITAS_USE_CUDA)
0107 {
0108
0109 CELER_CUDA_CALL(
0110 cudaDeviceGetLimit(&result.stack_size, cudaLimitStackSize));
0111
0112 CELER_CUDA_CALL(cudaDeviceGetLimit(&result.print_buffer_size,
0113 cudaLimitPrintfFifoSize));
0114 }
0115 CELER_DEVICE_CALL_PREFIX(DeviceGetLimit(
0116 &result.heap_size, CELER_DEVICE_PREFIX(LimitMallocHeapSize)));
0117 #else
0118 CELER_DISCARD(func);
0119 CELER_ASSERT_UNREACHABLE();
0120 #endif
0121 result.threads_per_block = threads_per_block;
0122 return result;
0123 }
0124
0125
0126 }