Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 09:54:49

0001 //----------------------------------*-C++-*----------------------------------//
0002 // Copyright 2020-2024 UT-Battelle, LLC, and other Celeritas developers.
0003 // See the top-level COPYRIGHT file for details.
0004 // SPDX-License-Identifier: (Apache-2.0 OR MIT)
0005 //---------------------------------------------------------------------------//
0006 //! \file corecel/sys/Device.hh
0007 //---------------------------------------------------------------------------//
0008 #pragma once
0009 
0010 #include <cstddef>
0011 #include <iosfwd>  // IWYU pragma: keep
0012 #include <map>
0013 #include <memory>
0014 #include <string>
0015 
0016 #include "corecel/Assert.hh"
0017 
0018 #include "ThreadId.hh"
0019 
0020 namespace celeritas
0021 {
0022 class MpiCommunicator;
0023 class Stream;
0024 namespace detail
0025 {
0026 class StreamStorage;
0027 }
0028 
0029 //---------------------------------------------------------------------------//
0030 /*!
0031  * Manage attributes of the GPU.
0032  *
0033  * CUDA/HIP translation table:
0034  *
0035  * CUDA/NVIDIA    | HIP/AMD        | Description
0036  * -------------- | -------------- | -----------------
0037  * thread         | work item      | individual local work element
0038  * warp           | wavefront      | "vectorized thread" operating in lockstep
0039  * block          | workgroup      | group of threads able to sync
0040  * multiprocessor | compute unit   | hardware executing one or more blocks
0041  * multiprocessor | execution unit | hardware executing one or more warps
0042  *
0043  * Each block/workgroup operates on the same hardware (compute unit) until
0044  * completion. Similarly, a warp/wavefront is tied to a single execution
0045  * unit. Each compute unit can execute one or more blocks: the higher the
0046  * number of blocks resident, the more latency can be hidden.
0047  *
0048  * \warning The current multithreading/multiprocess model is intended to have
0049  * one GPU serving multiple CPU threads simultaneously, and one MPI process per
0050  * GPU. The active CUDA device is a static thread-local property but \c
0051  * global_device is global. CUDA needs to be activated using \c activate_device
0052  * or \c activate_device_local on every thread, using the same device ID.
0053  */
0054 class Device
0055 {
0056   public:
0057     //!@{
0058     //! \name Type aliases
0059     using MapStrInt = std::map<std::string, int>;
0060     //!@}
0061 
0062   public:
0063     // Number of devices available on the local compute node (0 if disabled)
0064     static int num_devices();
0065 
0066     // Whether verbose messages and error checking are enabled
0067     static bool debug();
0068 
0069     //// CONSTRUCTORS ////
0070 
0071     // Construct an inactive device (disable celeritas CUDA calls)
0072     Device() = default;
0073 
0074     // Construct from device ID
0075     explicit Device(int id);
0076 
0077     //// ACCESSORS ////
0078 
0079     // Get the device ID (inline definition below expects an initialized device)
0080     inline int device_id() const;
0081 
0082     //! True if device is initialized (i.e., the stored ID is nonnegative)
0083     explicit operator bool() const { return id_ >= 0; }
0084 
0085     //! Device name, returned by value ("<DISABLED>" if default-constructed)
0086     std::string name() const { return name_; }
0087 
0088     //! Total memory capacity (bytes)
0089     std::size_t total_global_mem() const { return total_global_mem_; }
0090 
0091     //! Maximum number of threads per block (for launch limits)
0092     int max_threads_per_block() const { return max_threads_per_block_; }
0093 
0094     //! Maximum number of blocks per grid (for launch limits)
0095     int max_blocks_per_grid() const { return max_blocks_per_grid_; }
0096 
0097     //! Maximum number of concurrent threads per compute unit (for occupancy)
0098     int max_threads_per_cu() const { return max_threads_per_cu_; }
0099 
0100     //! Number of threads per warp
0101     unsigned int threads_per_warp() const { return threads_per_warp_; }
0102 
0103     //! Whether the device supports mapped pinned memory
0104     bool can_map_host_memory() const { return can_map_host_memory_; }
0105 
0106     //! Number of execution units per compute unit (1 for NVIDIA, 4 for AMD)
0107     unsigned int eu_per_cu() const { return eu_per_cu_; }
0108 
0109     //! Additional potentially interesting diagnostics (name -> integer value)
0110     MapStrInt const& extra() const { return extra_; }
0111 
0112     // Number of streams allocated
0113     StreamId::size_type num_streams() const;
0114 
0115     // Allocate the given number of streams
0116     void create_streams(unsigned int num_streams) const;
0117 
0118     // Access a stream by its ID
0119     Stream& stream(StreamId) const;
0120 
0121   private:
0122     struct StreamStorageDeleter  // deleter for the forward-declared detail::StreamStorage
0123     {
0124         void operator()(detail::StreamStorage*) noexcept;
0125     };
0126 
0127     using UPStreamStorage
0128         = std::unique_ptr<detail::StreamStorage, StreamStorageDeleter>;
0129 
0130     //// DATA ////
0131 
0132     // Required values for default constructor (negative ID marks "inactive")
0133     int id_{-1};
0134     std::string name_{"<DISABLED>"};
0135 
0136     // Default values overridden in device-ID constructor
0137     std::size_t total_global_mem_{};
0138     int max_threads_per_block_{};
0139     int max_blocks_per_grid_{};
0140     int max_threads_per_cu_{};
0141     unsigned int threads_per_warp_{};
0142     bool can_map_host_memory_{};
0143     unsigned int eu_per_cu_{};
0144     MapStrInt extra_;
0145     UPStreamStorage streams_;
0146 };
0147 
0148 //---------------------------------------------------------------------------//
0149 // CELERITAS SHARED DEVICE
0150 //---------------------------------------------------------------------------//
0151 // Access the global active device (default is inactive/false)
0152 Device const& device();
0153 
0154 // Set and initialize the active GPU from a device passed by rvalue reference
0155 void activate_device(Device&& device);
0156 
0157 // Initialize the first device if available using celeritas::comm_world
0158 void activate_device();
0159 
0160 // Initialize a device in a round-robin fashion from the given communicator
0161 void activate_device(MpiCommunicator const&);
0162 
0163 // Call cudaSetDevice using the existing device, for thread-local safety
0164 void activate_device_local();
0165 
0166 //---------------------------------------------------------------------------//
0167 // FREE FUNCTIONS
0168 //---------------------------------------------------------------------------//
0169 // Print device info to the given output stream
0170 std::ostream& operator<<(std::ostream&, Device const&);
0171 
0172 // Increase CUDA stack size to the given limit
0173 void set_cuda_stack_size(int limit);
0174 
0175 // Increase CUDA heap size to the given limit
0176 void set_cuda_heap_size(int limit);
0177 
0178 //---------------------------------------------------------------------------//
0179 // INLINE DEFINITIONS
0180 //---------------------------------------------------------------------------//
0181 /*!
0182  * Get the device ID; expects (via CELER_EXPECT) an initialized device.
0183  */
0184 int Device::device_id() const
0185 {
0186     CELER_EXPECT(*this);  // precondition: operator bool, i.e. id_ >= 0
0187     return id_;
0188 }
0189 
0190 //---------------------------------------------------------------------------//
0191 }  // namespace celeritas