sys/detail/KernelLauncherImpl.device.hh

0001 //---------------------------------*-CUDA-*----------------------------------//
0002 // Copyright 2023-2024 UT-Battelle, LLC, and other Celeritas developers.
0003 // See the top-level COPYRIGHT file for details.
0004 // SPDX-License-Identifier: (Apache-2.0 OR MIT)
0005 //---------------------------------------------------------------------------//
0006 //! \file corecel/sys/detail/KernelLauncherImpl.device.hh
0007 //---------------------------------------------------------------------------//
0008 #pragma once
0009
0010 #include "corecel/Config.hh"
0011 #include "corecel/DeviceRuntimeApi.hh"
0012
0013 #include "corecel/Macros.hh"
0014 #include "corecel/cont/Range.hh"
0015
0016 #include "../KernelParamCalculator.device.hh"
0017 #include "../KernelTraits.hh"
0018 #include "../ThreadId.hh"
0019
0020 namespace celeritas
0021 {
0022 namespace detail
0023 {
0024 namespace
0025 {
0026 //---------------------------------------------------------------------------//
0027 /*!
0028  * Celeritas executor kernel implementation.
0029  */
0030 template<class F>
0031 __device__ CELER_FORCEINLINE void
0032 launch_kernel_impl(Range<ThreadId> const& thread_range, F& execute_thread)
0033 {
0034     auto tid = celeritas::KernelParamCalculator::thread_id();
0035     if (!(tid < thread_range.size()))
0036         return;
0037     execute_thread(*(thread_range.cbegin() + tid.get()));
0038 }
0039
0040 //---------------------------------------------------------------------------//
0041 //!@{
0042 //! Launch the given executor using thread ids in the thread_range.
0043
0044 // Instantiated if F doesn't define a member type F::Applier
0045 template<class F, std::enable_if_t<!has_applier_v<F>, bool> = true>
0046 __global__ void __launch_bounds__(CELERITAS_MAX_BLOCK_SIZE)
0047     launch_action_impl(Range<ThreadId> const thread_range, F execute_thread)
0048 {
0049     launch_kernel_impl(thread_range, execute_thread);
0050 }
0051
0052 // Instantiated if F::Applier has no manual launch bounds
0053 template<class F,
0054          std::enable_if_t<kernel_no_bound<typename F::Applier>, bool> = true>
0055 __global__ void __launch_bounds__(CELERITAS_MAX_BLOCK_SIZE)
0056     launch_action_impl(Range<ThreadId> const thread_range, F execute_thread)
0057 {
0058     launch_kernel_impl(thread_range, execute_thread);
0059 }
0060
0061 // Instantiated if F::Applier defines the first launch bounds argument
0062 template<class F,
0063          class A_ = typename F::Applier,
0064          std::enable_if_t<kernel_max_blocks<A_>, bool> = true>
0065 __global__ void __launch_bounds__(A_::max_block_size)
0066     launch_action_impl(Range<ThreadId> const thread_range, F execute_thread)
0067 {
0068     launch_kernel_impl(thread_range, execute_thread);
0069 }
0070
0071 // Instantiated if F::Applier defines two arguments for launch bounds
0072 template<class F,
0073          class A_ = typename F::Applier,
0074          std::enable_if_t<kernel_max_blocks_min_warps<A_>, bool> = true>
0075 __global__ void
0076 #if CELERITAS_USE_CUDA
0077 __launch_bounds__(A_::max_block_size,
0078                   (A_::min_warps_per_eu * 32) / A_::max_block_size)
0079 #elif CELERITAS_USE_HIP
0080 __launch_bounds__(A_::max_block_size, A_::min_warps_per_eu)
0081 #endif
0082     launch_action_impl(Range<ThreadId> const thread_range, F execute_thread)
0083 {
0084     launch_kernel_impl(thread_range, execute_thread);
0085 }
0086
0087 //---------------------------------------------------------------------------//
0088 }  // namespace
0089 }  // namespace detail
0090 }  // namespace celeritas
0091 // vim: set ft=cuda :