Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-09 07:49:40

0001 /**
0002 smonitor.cc : monitor GPU process memory usage saving into .npy array 
0003 ========================================================================
0004 
0005 **/
0006 
0007 
0008 #include <cstdio>
0009 #include <chrono>
0010 #include <thread>
0011 #include <string>
0012 #include <sstream>
0013 #include <iomanip>
0014 #include <cstring>
0015 #include <unistd.h>
0016 #include <iostream>
0017 #include <cstdlib>
0018 #include <csignal>
0019 #include <vector>
0020 
0021 #include <nvml.h>
0022 #include "NVML_CHECK.h"
0023 #include "NPX.h"
0024 
/**
smon
-----

One monitoring sample, as appended to smonitor::mon by smonitor::check :
one record per (device, compute process) pair per sampling pass.

The record is deliberately nothing but eight uint64_t because smonitor::save
hands the vector of records to NPX::ArrayFromVec<uint64_t, smon>, which treats
each record as a run of uint64_t values. The static_assert below guards that
layout against a field of a different width being added by mistake.

**/
struct smon
{
    uint64_t stamp ;          // smonitor::Stamp() when the record was made : microseconds since the system_clock epoch
    uint64_t device ;         // index of the NVML device that was queried
    uint64_t free ;           // nvmlMemory_t::free reported for that device
    uint64_t total ;          // nvmlMemory_t::total reported for that device

    uint64_t used ;           // nvmlMemory_t::used reported for that device
    uint64_t pid ;            // nvmlProcessInfo_t::pid of the compute process
    uint64_t usedGpuMemory ;  // nvmlProcessInfo_t::usedGpuMemory of the compute process
    uint64_t proc_count ;     // process count returned by the initial size query on the device
};

static_assert( sizeof(smon) == 8*sizeof(uint64_t),
               "smon must remain a padding-free record of 8 uint64_t : it is saved as a flat uint64_t array" );
0037 
0038 struct smonitor
0039 {
0040     static constexpr const char* SLEEP_US = "smonitor__SLEEP_US" ; 
0041     static constexpr const bool VERBOSE = false ; 
0042     static smonitor* INSTANCE ; 
0043     static uint64_t Stamp(); 
0044     static void SleepMicroseconds(int us); 
0045     static void signal_callback_handler(int signum); 
0046 
0047     int       sleep_us ; 
0048     unsigned  device_count ;
0049     std::vector<smon> mon ; 
0050 
0051     smonitor(); 
0052 
0053     void runloop(); 
0054     void check(); 
0055     void save(); 
0056 };
0057 
0058 
0059 smonitor* smonitor::INSTANCE = nullptr ; 
0060 
0061 inline uint64_t smonitor::Stamp()
0062 {
0063     using Clock = std::chrono::system_clock;
0064     using Unit  = std::chrono::microseconds ;
0065     std::chrono::time_point<Clock> t0 = Clock::now();
0066     return std::chrono::duration_cast<Unit>(t0.time_since_epoch()).count() ;   
0067 }
0068 
0069 inline void smonitor::SleepMicroseconds(int us) 
0070 {
0071     std::chrono::microseconds dura(us);
0072     std::this_thread::sleep_for( dura );
0073 }
0074 
0075 inline smonitor::smonitor()
0076     :
0077     sleep_us(U::GetEnvInt(SLEEP_US, 1000000))
0078 {
0079     INSTANCE = this ; 
0080     NVML_CHECK( nvmlInit() );
0081     NVML_CHECK( nvmlDeviceGetCount(&device_count) ); 
0082     printf("device_count: %u  %s=%d \n", device_count, SLEEP_US, sleep_us  );
0083 
0084     signal(SIGINT, signal_callback_handler);
0085 }
0086 
0087 void smonitor::signal_callback_handler(int signum) 
0088 {
0089     std::cout << "Caught signal " << signum << std::endl;
0090     smonitor::INSTANCE->save();  
0091     exit(signum);
0092 }
0093 
0094 inline void smonitor::save()
0095 {
0096     std::cout << "smonitor::save mon.size " << mon.size() << std::endl ; 
0097 
0098     NP* a = NPX::ArrayFromVec<uint64_t, smon>(mon) ;  
0099     a->save("smonitor.npy"); 
0100 }
0101 
0102 
0103 
0104 inline void smonitor::runloop()
0105 {
0106     while(true)
0107     {
0108         check(); 
0109         SleepMicroseconds(sleep_us);
0110     }
0111 }
0112 
0113 inline void smonitor::check()
0114 {
0115     for(unsigned index=0 ; index < device_count ; index++) 
0116     {
0117         nvmlDevice_t device ; 
0118         NVML_CHECK( nvmlDeviceGetHandleByIndex_v2( index, &device )); 
0119          
0120         //const int maxchar = 32 ;           
0121         //char name[maxchar] ; 
0122         //NVML_CHECK( nvmlDeviceGetName(device, name, maxchar ) ); 
0123         //printf("device %d name %s \n" , index, name );  
0124 
0125         nvmlMemory_t memory ; 
0126         NVML_CHECK( nvmlDeviceGetMemoryInfo(device, &memory ) ); 
0127 
0128         if(VERBOSE) printf(" memory.free %llu memory.total %llu memory.used %llu \n", 
0129                              memory.free,     memory.total,     memory.used ); 
0130 
0131         unsigned proc_count(0) ; 
0132         nvmlReturn_t rc = nvmlDeviceGetComputeRunningProcesses_v3(device, &proc_count, nullptr );
0133 
0134         if( rc == NVML_ERROR_INSUFFICIENT_SIZE )  // documented that get this 
0135         {
0136             if(VERBOSE) printf("proc_count %d \n", proc_count ); 
0137 
0138             unsigned proc_alloc = proc_count + 3 ; 
0139             nvmlProcessInfo_t* procs = new nvmlProcessInfo_t[proc_alloc] ; 
0140 
0141             NVML_CHECK( nvmlDeviceGetComputeRunningProcesses_v3(device, &proc_alloc, procs ) );
0142 
0143             for(unsigned p=0 ; p <  proc_alloc ; p++)
0144             {
0145                 int num_mon = mon.size(); 
0146                 const nvmlProcessInfo_t& proc = procs[p] ; 
0147 
0148                 //printf(" proc.computeInstanceId  %u  proc.gpuInstanceId %u  proc.pid %u  proc.usedGpuMemory %llu \n", 
0149                 //         proc.computeInstanceId, proc.gpuInstanceId, proc.pid, proc.usedGpuMemory ); 
0150 
0151                 printf(" num_mon %5d proc_count %2d proc.pid %u  proc.usedGpuMemory %llu [%10.3f GB] \n", 
0152                         num_mon, proc_count, proc.pid, proc.usedGpuMemory, float(proc.usedGpuMemory)/1e9 ); 
0153 
0154                 smon m ; 
0155 
0156                 m.stamp = Stamp(); 
0157                 m.device = index ; 
0158                 m.free = memory.free ; 
0159                 m.total = memory.total ; 
0160 
0161                 m.used = memory.used ;
0162                 m.pid = proc.pid ; 
0163                 m.usedGpuMemory = proc.usedGpuMemory ; 
0164                 m.proc_count = proc_count ; 
0165 
0166                 mon.push_back(m); 
0167             }
0168         }
0169     }
0170 }
0171 
0172 int main()
0173 {
0174     smonitor sm ; 
0175     sm.runloop(); 
0176     return 0 ; 
0177 }
0178 
0179