|
||||
File indexing completed on 2025-01-18 09:54:49
//----------------------------------*-C++-*----------------------------------//
// Copyright 2020-2024 UT-Battelle, LLC, and other Celeritas developers.
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: (Apache-2.0 OR MIT)
//---------------------------------------------------------------------------//
//! \file corecel/sys/Device.hh
//---------------------------------------------------------------------------//
#pragma once

#include <cstddef>
#include <iosfwd>  // IWYU pragma: keep
#include <map>
#include <memory>
#include <string>

#include "corecel/Assert.hh"

#include "ThreadId.hh"

namespace celeritas
{
class MpiCommunicator;
class Stream;
namespace detail
{
class StreamStorage;
}

//---------------------------------------------------------------------------//
/*!
 * Manage attributes of the GPU.
 *
 * CUDA/HIP translation table:
 *
 * CUDA/NVIDIA    | HIP/AMD        | Description
 * -------------- | -------------- | -----------------
 * thread         | work item      | individual local work element
 * warp           | wavefront      | "vectorized thread" operating in lockstep
 * block          | workgroup      | group of threads able to sync
 * multiprocessor | compute unit   | hardware executing one or more blocks
 * multiprocessor | execution unit | hardware executing one or more warps
 *
 * Each block/workgroup operates on the same hardware (compute unit) until
 * completion. Similarly, a warp/wavefront is tied to a single execution
 * unit. Each compute unit can execute one or more blocks: the higher the
 * number of blocks resident, the more latency can be hidden.
 *
 * \warning The current multithreading/multiprocess model is intended to have
 * one GPU serving multiple CPU threads simultaneously, and one MPI process per
 * GPU. The active CUDA device is a static thread-local property but \c
 * global_device is global. CUDA needs to be activated using \c activate_device
 * or \c activate_device_local on every thread, using the same device ID.
 */
class Device
{
  public:
    //!@{
    //! \name Type aliases
    using MapStrInt = std::map<std::string, int>;
    //!@}

  public:
    // Number of devices available on the local compute node (0 if disabled)
    static int num_devices();

    // Whether verbose messages and error checking are enabled
    static bool debug();

    //// CONSTRUCTORS ////

    // Construct an inactive device (disable celeritas CUDA calls)
    Device() = default;

    // Construct from device ID
    explicit Device(int id);

    //// ACCESSORS ////

    // Get the device ID (requires the device to be initialized)
    inline int device_id() const;

    //! True if device is initialized
    explicit operator bool() const { return id_ >= 0; }

    //! Device name
    std::string name() const { return name_; }

    //! Total memory capacity (bytes)
    std::size_t total_global_mem() const { return total_global_mem_; }

    //! Maximum number of threads per block (for launch limits)
    int max_threads_per_block() const { return max_threads_per_block_; }

    //! Maximum number of blocks per grid (for launch limits)
    int max_blocks_per_grid() const { return max_blocks_per_grid_; }

    //! Maximum number of concurrent threads per compute unit (for occupancy)
    int max_threads_per_cu() const { return max_threads_per_cu_; }

    //! Number of threads per warp
    unsigned int threads_per_warp() const { return threads_per_warp_; }

    //! Whether the device supports mapped pinned memory
    bool can_map_host_memory() const { return can_map_host_memory_; }

    //! Number of execution units per compute unit (1 for NVIDIA, 4 for AMD)
    unsigned int eu_per_cu() const { return eu_per_cu_; }

    //! Additional potentially interesting diagnostics
    MapStrInt const& extra() const { return extra_; }

    // Number of streams allocated
    StreamId::size_type num_streams() const;

    // Allocate the given number of streams
    void create_streams(unsigned int num_streams) const;

    // Access a stream
    Stream& stream(StreamId) const;

  private:
    // Custom deleter so the unique_ptr below can hold an incomplete type
    // (detail::StreamStorage is only forward-declared in this header)
    struct StreamStorageDeleter
    {
        void operator()(detail::StreamStorage*) noexcept;
    };

    using UPStreamStorage
        = std::unique_ptr<detail::StreamStorage, StreamStorageDeleter>;

    //// DATA ////

    // Required values for default constructor
    int id_{-1};
    std::string name_{"<DISABLED>"};

    // Default values overridden in device-ID constructor
    std::size_t total_global_mem_{};
    int max_threads_per_block_{};
    int max_blocks_per_grid_{};
    int max_threads_per_cu_{};
    unsigned int threads_per_warp_{};
    bool can_map_host_memory_{};
    unsigned int eu_per_cu_{};
    MapStrInt extra_;
    UPStreamStorage streams_;
};

//---------------------------------------------------------------------------//
// CELERITAS SHARED DEVICE
//---------------------------------------------------------------------------//
// Global active device (default is inactive/false)
Device const& device();

// Set and initialize the active GPU
void activate_device(Device&& device);

// Initialize the first device if available using celeritas::comm_world
void activate_device();

// Initialize a device in a round-robin fashion from a communicator
void activate_device(MpiCommunicator const&);

// Call cudaSetDevice using the existing device, for thread-local safety
void activate_device_local();

//---------------------------------------------------------------------------//
// FREE FUNCTIONS
//---------------------------------------------------------------------------//
// Print device info
std::ostream& operator<<(std::ostream&, Device const&);

// Increase CUDA stack size
void set_cuda_stack_size(int limit);

// Increase CUDA heap size
void set_cuda_heap_size(int limit);

//---------------------------------------------------------------------------//
// INLINE DEFINITIONS
//---------------------------------------------------------------------------//
/*!
 * Get the CUDA device ID, if active.
 *
 * It is an error (precondition failure) to call this on an uninitialized
 * device: check with \c operator bool first.
 */
int Device::device_id() const
{
    CELER_EXPECT(*this);
    return id_;
}

//---------------------------------------------------------------------------//
}  // namespace celeritas
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |