![]() |
|
|||
File indexing completed on 2025-09-15 08:55:07
0001 //------------------------------- -*- C++ -*- -------------------------------// 0002 // Copyright Celeritas contributors: see top-level COPYRIGHT file for details 0003 // SPDX-License-Identifier: (Apache-2.0 OR MIT) 0004 //---------------------------------------------------------------------------// 0005 //! \file corecel/sys/Device.hh 0006 //---------------------------------------------------------------------------// 0007 #pragma once 0008 0009 #include <cstddef> 0010 #include <iosfwd> // IWYU pragma: keep 0011 #include <map> 0012 #include <string> 0013 #include <vector> 0014 0015 #include "corecel/Assert.hh" 0016 #include "corecel/Macros.hh" 0017 0018 #include "Stream.hh" 0019 #include "ThreadId.hh" 0020 0021 namespace celeritas 0022 { 0023 //---------------------------------------------------------------------------// 0024 class MpiCommunicator; 0025 class Stream; 0026 0027 //---------------------------------------------------------------------------// 0028 /*! 0029 * Manage attributes of the GPU. 0030 * 0031 * CUDA/HIP translation table: 0032 * 0033 * CUDA/NVIDIA | HIP/AMD | Description 0034 * -------------- | -------------- | ----------------- 0035 * thread | work item | individual local work element 0036 * warp | wavefront | "vectorized thread" operating in lockstep 0037 * block | workgroup | group of threads able to sync 0038 * multiprocessor | compute unit | hardware executing one or more blocks 0039 * multiprocessor | execution unit | hardware executing one or more warps 0040 * 0041 * Each block/workgroup operates on the same hardware (compute unit) until 0042 * completion. Similarly, a warp/wavefront is tied to a single execution 0043 * unit. Each compute unit can execute one or more blocks: the higher the 0044 * number of blocks resident, the more latency can be hidden. 0045 * 0046 * \warning The current multithreading/multiprocess model is intended to have 0047 * one GPU serving multiple CPU threads simultaneously, and one MPI process per 0048 * GPU. The active CUDA device is a static thread-local property but \c 0049 * global_device is global. CUDA needs to be activated using \c activate_device 0050 * or \c activate_device_local on every thread, using the same device ID. 0051 * 0052 * \todo Const correctness for streams is wrong; we should 0053 * probably make the global device non-const (and thread-local?) and then 0054 * activate it on "move". 0055 */ 0056 class Device 0057 { 0058 public: 0059 //!@{ 0060 //! \name Type aliases 0061 using MapStrInt = std::map<std::string, int>; 0062 //!@} 0063 0064 public: 0065 // Number of devices available on the local compute node (0 if disabled) 0066 static int num_devices(); 0067 0068 // Whether verbose messages and error checking are enabled 0069 static bool debug(); 0070 0071 // Whether asynchronous stream operations are supported 0072 static bool async(); 0073 0074 //// CONSTRUCTORS //// 0075 0076 // Construct an inactive device (disable celeritas CUDA calls) 0077 Device() = default; 0078 0079 // Construct from device ID 0080 explicit Device(int id); 0081 0082 //// ACCESSORS //// 0083 0084 // Get the device ID 0085 inline int device_id() const; 0086 0087 //! True if device is initialized 0088 explicit operator bool() const { return id_ >= 0; } 0089 0090 //! Device name 0091 std::string name() const { return name_; } 0092 0093 //! Total memory capacity (bytes) 0094 std::size_t total_global_mem() const { return total_global_mem_; } 0095 0096 //! Maximum number of threads per block (for launch limits) 0097 int max_threads_per_block() const { return max_threads_per_block_; } 0098 0099 //! Maximum number of threads per block (for launch limits) 0100 int max_blocks_per_grid() const { return max_blocks_per_grid_; } 0101 0102 //! Maximum number of concurrent threads per compute unit (for occupancy) 0103 int max_threads_per_cu() const { return max_threads_per_cu_; } 0104 0105 //! Number of threads per warp 0106 unsigned int threads_per_warp() const { return threads_per_warp_; } 0107 0108 //! Whether the device supports mapped pinned memory 0109 bool can_map_host_memory() const { return can_map_host_memory_; } 0110 0111 //! Number of execution units per compute unit (1 for NVIDIA, 4 for AMD) 0112 unsigned int eu_per_cu() const { return eu_per_cu_; } 0113 0114 //! CUDA/HIP capability: major * 10 + minor 0115 unsigned int capability() const { return capability_; } 0116 0117 //! Additional potentially interesting diagnostics 0118 MapStrInt const& extra() const { return extra_; } 0119 0120 // Number of streams allocated 0121 StreamId::size_type num_streams() const; 0122 0123 // Allocate the given number of streams 0124 void create_streams(unsigned int num_streams) const; 0125 0126 // Destroy all streams before shutting down CUDA 0127 void destroy_streams() const; 0128 0129 // Access a stream 0130 inline Stream& stream(StreamId) const; 0131 0132 private: 0133 //// DATA //// 0134 0135 // Required values for default constructor 0136 int id_{-1}; 0137 std::string name_{"<DISABLED>"}; 0138 0139 // Default values overridden in device-ID constructor 0140 std::size_t total_global_mem_{}; 0141 int max_threads_per_block_{}; 0142 int max_blocks_per_grid_{}; 0143 int max_threads_per_cu_{}; 0144 unsigned int threads_per_warp_{}; 0145 bool can_map_host_memory_{}; 0146 unsigned int capability_{0}; 0147 unsigned int eu_per_cu_{}; 0148 MapStrInt extra_; 0149 std::vector<Stream> streams_; 0150 }; 0151 0152 //---------------------------------------------------------------------------// 0153 // CELERITAS SHARED DEVICE 0154 //---------------------------------------------------------------------------// 0155 // Global active device (default is inactive/false) 0156 Device const& device(); 0157 0158 // Set and initialize the active GPU 0159 void activate_device(Device&& device); 0160 0161 // Initialize the first device if available using celeritas::comm_world 0162 void activate_device(); 0163 0164 // Initialize a device in a round-robin fashion from a communicator. 0165 void activate_device(MpiCommunicator const&); 0166 0167 // Call cudaSetDevice using the existing device, for thread-local safety 0168 void activate_device_local(); 0169 0170 //---------------------------------------------------------------------------// 0171 // FREE FUNCTIONS 0172 //---------------------------------------------------------------------------// 0173 // Print device info 0174 std::ostream& operator<<(std::ostream&, Device const&); 0175 0176 // Increase CUDA stack size 0177 void set_cuda_stack_size(int limit); 0178 0179 // Increase CUDA HEAP size 0180 void set_cuda_heap_size(int limit); 0181 0182 //---------------------------------------------------------------------------// 0183 // INLINE DEFINITIONS 0184 //---------------------------------------------------------------------------// 0185 /*! 0186 * Get the CUDA device ID, if active. 0187 */ 0188 int Device::device_id() const 0189 { 0190 CELER_EXPECT(*this); 0191 return id_; 0192 } 0193 0194 //---------------------------------------------------------------------------// 0195 /*! 0196 * Access a stream after creating. 0197 */ 0198 Stream& Device::stream(StreamId id) const 0199 { 0200 CELER_EXPECT(id < streams_.size()); 0201 return const_cast<Stream&>(streams_[id.get()]); 0202 } 0203 0204 //---------------------------------------------------------------------------// 0205 } // namespace celeritas
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |