File indexing completed on 2025-01-18 09:54:49
0001
0002
0003
0004
0005
0006
0007
0008 #pragma once
0009
0010 #include "corecel/Config.hh"
0011 #include "corecel/DeviceRuntimeApi.hh"
0012
0013 #include "corecel/Macros.hh"
0014 #include "corecel/cont/Range.hh"
0015 #include "corecel/sys/KernelParamCalculator.device.hh"
0016 #include "corecel/sys/KernelTraits.hh"
0017 #include "corecel/sys/ThreadId.hh"
0018
0019 namespace celeritas
0020 {
0021 namespace detail
0022 {
0023 namespace
0024 {
0025
0026
0027
0028
0029 template<class F>
0030 __device__ CELER_FORCEINLINE void
0031 launch_kernel_impl(Range<ThreadId> const& thread_range, F& execute_thread)
0032 {
0033 auto tid = celeritas::KernelParamCalculator::thread_id();
0034 if (!(tid < thread_range.size()))
0035 return;
0036 execute_thread(*(thread_range.cbegin() + tid.get()));
0037 }
0038
0039
0040
0041
0042
0043
0044 template<class F, std::enable_if_t<!has_applier_v<F>, bool> = true>
0045 __global__ void __launch_bounds__(CELERITAS_MAX_BLOCK_SIZE)
0046 launch_action_impl(Range<ThreadId> const thread_range, F execute_thread)
0047 {
0048 launch_kernel_impl(thread_range, execute_thread);
0049 }
0050
0051
0052 template<class F,
0053 std::enable_if_t<kernel_no_bound<typename F::Applier>, bool> = true>
0054 __global__ void __launch_bounds__(CELERITAS_MAX_BLOCK_SIZE)
0055 launch_action_impl(Range<ThreadId> const thread_range, F execute_thread)
0056 {
0057 launch_kernel_impl(thread_range, execute_thread);
0058 }
0059
0060
0061 template<class F,
0062 class A_ = typename F::Applier,
0063 std::enable_if_t<kernel_max_blocks<A_>, bool> = true>
0064 __global__ void __launch_bounds__(A_::max_block_size)
0065 launch_action_impl(Range<ThreadId> const thread_range, F execute_thread)
0066 {
0067 launch_kernel_impl(thread_range, execute_thread);
0068 }
0069
0070
0071 template<class F,
0072 class A_ = typename F::Applier,
0073 std::enable_if_t<kernel_max_blocks_min_warps<A_>, bool> = true>
0074 __global__ void
0075 #if CELERITAS_USE_CUDA
0076 __launch_bounds__(A_::max_block_size,
0077 (A_::min_warps_per_eu * 32) / A_::max_block_size)
0078 #elif CELERITAS_USE_HIP
0079 __launch_bounds__(A_::max_block_size, A_::min_warps_per_eu)
0080 #endif
0081 launch_action_impl(Range<ThreadId> const thread_range, F execute_thread)
0082 {
0083 launch_kernel_impl(thread_range, execute_thread);
0084 }
0085
0086
0087 }
0088 }
0089 }
0090