corecel/sys/KernelLauncher.device.hh

0001 //------------------------------ -*- C++ -*- -------------------------------//
0002 // Copyright Celeritas contributors: see top-level COPYRIGHT file for details
0003 // SPDX-License-Identifier: (Apache-2.0 OR MIT)
0004 //---------------------------------------------------------------------------//
0005 //! \file corecel/sys/KernelLauncher.device.hh
0006 //---------------------------------------------------------------------------//
0007 #pragma once
0008
0009 #include <string_view>
0010 #include <type_traits>
0011
0012 #include "corecel/DeviceRuntimeApi.hh"
0013
0014 #include "corecel/Assert.hh"
0015 #include "corecel/Macros.hh"
0016 #include "corecel/Types.hh"
0017 #include "corecel/cont/Range.hh"
0018
0019 #include "Device.hh"
0020 #include "KernelParamCalculator.device.hh"
0021 #include "Stream.hh"
0022 #include "ThreadId.hh"
0023
0024 #include "detail/KernelLauncherImpl.device.hh"
0025
0026 namespace celeritas
0027 {
0028 //---------------------------------------------------------------------------//
0029 /*!
0030  * Profile and launch Celeritas kernels.
0031  *
0032  * The template argument \c F may define a member type named \c Applier.
0033  * \c F::Applier should have up to two static constexpr int variables named
0034  * \c max_block_size and/or \c min_warps_per_eu.
0035  * If present, the kernel will use appropriate \c __launch_bounds__.
0036  * If \c F::Applier::min_warps_per_eu exists then \c F::Applier::max_block_size
0037  * must also be present or we get a compile error.
0038  *
0039  * The semantics of the second \c __launch_bounds__ argument differs between
0040  * CUDA and HIP.  \c KernelLauncher expects HIP semantics. If Celeritas is
0041  * built targeting CUDA, it will automatically convert that argument to match
0042  * CUDA semantics.
0043  *
0044  * The CUDA-specific 3rd argument \c maxBlocksPerCluster is not supported.
0045  *
0046  * Example:
0047  * \code
0048  void launch_kernel(DeviceParams const& params, size_type count) const
0049  {
0050     auto execute_thread = BlahExecutor{params};
0051     static KernelLauncher<decltype(execute_thread)> const launch("blah");
0052     launch_kernel(count, StreamId{}, execute_thread);
0053  }
0054  * \endcode
0055  */
0056 template<class F>
0057 class KernelLauncher
0058 {
0059     static_assert(
0060         (std::is_trivially_copyable_v<F> || CELERITAS_USE_HIP
0061          || CELER_COMPILER == CELER_COMPILER_CLANG)
0062             && !std::is_pointer_v<F> && !std::is_reference_v<F>,
0063         R"(Launched action must be a trivially copyable function object)");
0064
0065   public:
0066     // Create a launcher from a label
0067     explicit inline KernelLauncher(std::string_view name);
0068
0069     // Launch a kernel for a thread range
0070     inline void operator()(Range<ThreadId> threads,
0071                            StreamId stream_id,
0072                            F const& execute_thread) const;
0073
0074     // Launch a kernel with a custom number of threads
0075     inline void operator()(size_type num_threads,
0076                            StreamId stream_id,
0077                            F const& execute_thread) const;
0078
0079   private:
0080     KernelParamCalculator calc_launch_params_;
0081 };
0082
0083 //---------------------------------------------------------------------------//
0084 // INLINE FUNCTIONS
0085 //---------------------------------------------------------------------------//
0086 /*!
0087  * Create a launcher from a label.
0088  */
0089 template<class F>
0090 KernelLauncher<F>::KernelLauncher(std::string_view name)
0091     : calc_launch_params_{name, &detail::launch_action_impl<F>}
0092 {
0093 }
0094
0095 //---------------------------------------------------------------------------//
0096 /*!
0097  * Launch a kernel for a thread range.
0098  */
0099 template<class F>
0100 void KernelLauncher<F>::operator()(Range<ThreadId> threads,
0101                                    StreamId stream_id,
0102                                    F const& execute_thread) const
0103 {
0104     if (!threads.empty())
0105     {
0106         using StreamT = CELER_DEVICE_API_SYMBOL(Stream_t);
0107         StreamT stream = stream_id
0108                              ? celeritas::device().stream(stream_id).get()
0109                              : nullptr;
0110         auto config = calc_launch_params_(threads.size());
0111         detail::launch_action_impl<F>
0112             <<<config.blocks_per_grid, config.threads_per_block, 0, stream>>>(
0113                 threads, execute_thread);
0114     }
0115 }
0116
0117 //---------------------------------------------------------------------------//
0118 /*!
0119  * Launch a kernel with a custom number of threads.
0120  *
0121  * The launch arguments have the same ordering as CUDA/HIP kernel launch
0122  * arguments.
0123  *
0124  * \param num_threads Total number of active consecutive threads
0125  * \param stream_id Execute the kernel on this device stream
0126  * \param execute_thread Call the given functor with the thread ID
0127  */
0128 template<class F>
0129 void KernelLauncher<F>::operator()(size_type num_threads,
0130                                    StreamId stream_id,
0131                                    F const& execute_thread) const
0132 {
0133     (*this)(range(ThreadId{num_threads}), stream_id, execute_thread);
0134 }
0135
0136 //---------------------------------------------------------------------------//
0137 }  // namespace celeritas