//------------------------------- -*- C++ -*- -------------------------------//
// Copyright Celeritas contributors: see top-level COPYRIGHT file for details
// SPDX-License-Identifier: (Apache-2.0 OR MIT)
//---------------------------------------------------------------------------//
//! \file corecel/sys/Device.hh
//---------------------------------------------------------------------------//
#pragma once

#include <cstddef>
#include <iosfwd>  // IWYU pragma: keep
#include <map>
#include <string>
#include <vector>

#include "corecel/Assert.hh"
#include "corecel/Macros.hh"

#include "Stream.hh"
#include "ThreadId.hh"

namespace celeritas
{
//---------------------------------------------------------------------------//
class MpiCommunicator;
class Stream;

//---------------------------------------------------------------------------//
/*!
 * Manage attributes of the GPU.
 *
 * CUDA/HIP translation table:
 *
 * CUDA/NVIDIA    | HIP/AMD        | Description
 * -------------- | -------------- | -----------------
 * thread         | work item      | individual local work element
 * warp           | wavefront      | "vectorized thread" operating in lockstep
 * block          | workgroup      | group of threads able to sync
 * multiprocessor | compute unit   | hardware executing one or more blocks
 * multiprocessor | execution unit | hardware executing one or more warps
 *
 * Each block/workgroup operates on the same hardware (compute unit) until
 * completion. Similarly, a warp/wavefront is tied to a single execution
 * unit. Each compute unit can execute one or more blocks: the higher the
 * number of blocks resident, the more latency can be hidden.
 *
 * \warning The current multithreading/multiprocess model is intended to have
 * one GPU serving multiple CPU threads simultaneously, and one MPI process per
 * GPU. The active CUDA device is a static thread-local property but \c
 * global_device is global. CUDA needs to be activated using \c activate_device
 * or \c activate_device_local on every thread, using the same device ID.
 *
 * \todo Const correctness for streams is wrong; we should
 * probably make the global device non-const (and thread-local?) and then
 * activate it on "move".
 */
class Device
{
  public:
    //!@{
    //! \name Type aliases
    using MapStrInt = std::map<std::string, int>;
    //!@}

  public:
    // Number of devices available on the local compute node (0 if disabled)
    static int num_devices();

    // Whether verbose messages and error checking are enabled
    static bool debug();

    // Whether asynchronous stream operations are supported
    static bool async();

    //// CONSTRUCTORS ////

    // Construct an inactive device (disable celeritas CUDA calls)
    Device() = default;

    // Construct from device ID
    explicit Device(int id);

    //// ACCESSORS ////

    // Get the device ID
    inline int device_id() const;

    //! True if device is initialized
    explicit operator bool() const { return id_ >= 0; }

    //! Device name
    std::string name() const { return name_; }

    //! Total memory capacity (bytes)
    std::size_t total_global_mem() const { return total_global_mem_; }

    //! Maximum number of threads per block (for launch limits)
    int max_threads_per_block() const { return max_threads_per_block_; }

    //! Maximum number of blocks per grid (for launch limits)
    int max_blocks_per_grid() const { return max_blocks_per_grid_; }

    //! Maximum number of concurrent threads per compute unit (for occupancy)
    int max_threads_per_cu() const { return max_threads_per_cu_; }

    //! Number of threads per warp
    unsigned int threads_per_warp() const { return threads_per_warp_; }

    //! Whether the device supports mapped pinned memory
    bool can_map_host_memory() const { return can_map_host_memory_; }

    //! Number of execution units per compute unit (1 for NVIDIA, 4 for AMD)
    unsigned int eu_per_cu() const { return eu_per_cu_; }

    //! CUDA/HIP capability: major * 10 + minor
    unsigned int capability() const { return capability_; }

    //! Additional potentially interesting diagnostics
    MapStrInt const& extra() const { return extra_; }

    // Number of streams allocated
    StreamId::size_type num_streams() const;

    // Allocate the given number of streams
    void create_streams(unsigned int num_streams) const;

    // Destroy all streams before shutting down CUDA
    void destroy_streams() const;

    // Access a stream
    inline Stream& stream(StreamId) const;

  private:
    //// DATA ////

    // Required values for default constructor
    int id_{-1};
    std::string name_{"<DISABLED>"};

    // Default values overridden in device-ID constructor
    std::size_t total_global_mem_{};
    int max_threads_per_block_{};
    int max_blocks_per_grid_{};
    int max_threads_per_cu_{};
    unsigned int threads_per_warp_{};
    bool can_map_host_memory_{};
    unsigned int capability_{0};
    unsigned int eu_per_cu_{};
    MapStrInt extra_;
    std::vector<Stream> streams_;
};

//---------------------------------------------------------------------------//
// CELERITAS SHARED DEVICE
//---------------------------------------------------------------------------//
// Global active device (default is inactive/false)
Device const& device();

// Set and initialize the active GPU
void activate_device(Device&& device);

// Initialize the first device if available using celeritas::comm_world
void activate_device();

// Initialize a device in a round-robin fashion from a communicator
void activate_device(MpiCommunicator const&);
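//
// (Illustrative sketch of the round-robin overload: `comm` below stands for
// an assumed MpiCommunicator owned by the calling process.)
//
//   celeritas::activate_device(comm);     // one GPU chosen per rank
//   celeritas::activate_device_local();   // repeat on each worker thread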

// Call cudaSetDevice using the existing device, for thread-local safety
void activate_device_local();

//---------------------------------------------------------------------------//
// FREE FUNCTIONS
//---------------------------------------------------------------------------//
// Print device info
std::ostream& operator<<(std::ostream&, Device const&);

// Increase CUDA stack size
void set_cuda_stack_size(int limit);

// Increase CUDA heap size
void set_cuda_heap_size(int limit);

//---------------------------------------------------------------------------//
// INLINE DEFINITIONS
//---------------------------------------------------------------------------//
/*!
 * Get the CUDA device ID, if active.
 */
int Device::device_id() const
{
    CELER_EXPECT(*this);
    return id_;
}

//---------------------------------------------------------------------------//
/*!
 * Access a stream after it has been created.
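 *
 * For example (a sketch assuming streams have already been allocated via
 * \c create_streams and \c sid is a valid \c StreamId for the calling thread):
 * \code
 * Stream& s = celeritas::device().stream(sid);
 * \endcode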
 */
Stream& Device::stream(StreamId id) const
{
    CELER_EXPECT(id < streams_.size());
    return const_cast<Stream&>(streams_[id.get()]);
}

//---------------------------------------------------------------------------//
}  // namespace celeritas