File indexing completed on 2026-04-09 07:49:40
0001
0002
0003
0004
0005
0006
0007
0008 #include <cstdio>
0009 #include <chrono>
0010 #include <thread>
0011 #include <string>
0012 #include <sstream>
0013 #include <iomanip>
0014 #include <cstring>
0015 #include <unistd.h>
0016 #include <iostream>
0017 #include <cstdlib>
0018 #include <csignal>
0019 #include <vector>
0020
0021 #include <nvml.h>
0022 #include "NVML_CHECK.h"
0023 #include "NPX.h"
0024
/**
smon
-----

One monitoring record : a device memory snapshot paired with one compute
process seen on that device. smonitor::check fills one record per process
and smonitor::save hands the collected records to NPX::ArrayFromVec with
uint64_t as the element type, so every field is kept as uint64_t and the
field order is significant.
**/
struct smon
{
    // -- sample identity and device level memory (from nvmlMemory_t) --
    uint64_t stamp ;          // smonitor::Stamp() value when the record was filled
    uint64_t device ;         // device index used with nvmlDeviceGetHandleByIndex_v2
    uint64_t free ;           // nvmlMemory_t::free
    uint64_t total ;          // nvmlMemory_t::total

    // -- remaining device figure and per-process values (from nvmlProcessInfo_t) --
    uint64_t used ;           // nvmlMemory_t::used
    uint64_t pid ;            // nvmlProcessInfo_t::pid
    uint64_t usedGpuMemory ;  // nvmlProcessInfo_t::usedGpuMemory
    uint64_t proc_count ;     // process count reported for the device in this sample
};
0037
0038 struct smonitor
0039 {
0040 static constexpr const char* SLEEP_US = "smonitor__SLEEP_US" ;
0041 static constexpr const bool VERBOSE = false ;
0042 static smonitor* INSTANCE ;
0043 static uint64_t Stamp();
0044 static void SleepMicroseconds(int us);
0045 static void signal_callback_handler(int signum);
0046
0047 int sleep_us ;
0048 unsigned device_count ;
0049 std::vector<smon> mon ;
0050
0051 smonitor();
0052
0053 void runloop();
0054 void check();
0055 void save();
0056 };
0057
0058
0059 smonitor* smonitor::INSTANCE = nullptr ;
0060
0061 inline uint64_t smonitor::Stamp()
0062 {
0063 using Clock = std::chrono::system_clock;
0064 using Unit = std::chrono::microseconds ;
0065 std::chrono::time_point<Clock> t0 = Clock::now();
0066 return std::chrono::duration_cast<Unit>(t0.time_since_epoch()).count() ;
0067 }
0068
0069 inline void smonitor::SleepMicroseconds(int us)
0070 {
0071 std::chrono::microseconds dura(us);
0072 std::this_thread::sleep_for( dura );
0073 }
0074
0075 inline smonitor::smonitor()
0076 :
0077 sleep_us(U::GetEnvInt(SLEEP_US, 1000000))
0078 {
0079 INSTANCE = this ;
0080 NVML_CHECK( nvmlInit() );
0081 NVML_CHECK( nvmlDeviceGetCount(&device_count) );
0082 printf("device_count: %u %s=%d \n", device_count, SLEEP_US, sleep_us );
0083
0084 signal(SIGINT, signal_callback_handler);
0085 }
0086
0087 void smonitor::signal_callback_handler(int signum)
0088 {
0089 std::cout << "Caught signal " << signum << std::endl;
0090 smonitor::INSTANCE->save();
0091 exit(signum);
0092 }
0093
0094 inline void smonitor::save()
0095 {
0096 std::cout << "smonitor::save mon.size " << mon.size() << std::endl ;
0097
0098 NP* a = NPX::ArrayFromVec<uint64_t, smon>(mon) ;
0099 a->save("smonitor.npy");
0100 }
0101
0102
0103
0104 inline void smonitor::runloop()
0105 {
0106 while(true)
0107 {
0108 check();
0109 SleepMicroseconds(sleep_us);
0110 }
0111 }
0112
0113 inline void smonitor::check()
0114 {
0115 for(unsigned index=0 ; index < device_count ; index++)
0116 {
0117 nvmlDevice_t device ;
0118 NVML_CHECK( nvmlDeviceGetHandleByIndex_v2( index, &device ));
0119
0120
0121
0122
0123
0124
0125 nvmlMemory_t memory ;
0126 NVML_CHECK( nvmlDeviceGetMemoryInfo(device, &memory ) );
0127
0128 if(VERBOSE) printf(" memory.free %llu memory.total %llu memory.used %llu \n",
0129 memory.free, memory.total, memory.used );
0130
0131 unsigned proc_count(0) ;
0132 nvmlReturn_t rc = nvmlDeviceGetComputeRunningProcesses_v3(device, &proc_count, nullptr );
0133
0134 if( rc == NVML_ERROR_INSUFFICIENT_SIZE )
0135 {
0136 if(VERBOSE) printf("proc_count %d \n", proc_count );
0137
0138 unsigned proc_alloc = proc_count + 3 ;
0139 nvmlProcessInfo_t* procs = new nvmlProcessInfo_t[proc_alloc] ;
0140
0141 NVML_CHECK( nvmlDeviceGetComputeRunningProcesses_v3(device, &proc_alloc, procs ) );
0142
0143 for(unsigned p=0 ; p < proc_alloc ; p++)
0144 {
0145 int num_mon = mon.size();
0146 const nvmlProcessInfo_t& proc = procs[p] ;
0147
0148
0149
0150
0151 printf(" num_mon %5d proc_count %2d proc.pid %u proc.usedGpuMemory %llu [%10.3f GB] \n",
0152 num_mon, proc_count, proc.pid, proc.usedGpuMemory, float(proc.usedGpuMemory)/1e9 );
0153
0154 smon m ;
0155
0156 m.stamp = Stamp();
0157 m.device = index ;
0158 m.free = memory.free ;
0159 m.total = memory.total ;
0160
0161 m.used = memory.used ;
0162 m.pid = proc.pid ;
0163 m.usedGpuMemory = proc.usedGpuMemory ;
0164 m.proc_count = proc_count ;
0165
0166 mon.push_back(m);
0167 }
0168 }
0169 }
0170 }
0171
0172 int main()
0173 {
0174 smonitor sm ;
0175 sm.runloop();
0176 return 0 ;
0177 }
0178
0179