Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-09 07:49:13

0001 /**
0002 curand_uniform_test.cc
0003 =======================
0004 
0005 ::
0006 
0007     ~/o/sysrap/tests/curand_uniform_test.sh
0008 
0009     // https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/
0010 
0011 **/
0012 
0013 
0014 #include <cstdlib>
0015 #include <array>
0016 #include <chrono>
0017 
0018 #include "NP.hh"
0019 #include "SCurandState.h"
0020 // HUH: nvcc ignoring "pragma once" from NP.hh NPU.hh with this combo, but macro guards OK
0021 
0022 #include "scuda.h"
0023 #include "sstr.h"
0024 #include "srng.h"
0025 #include "sstamp.h"
0026 
0027 #include "curand_kernel.h"
0028 #include "curandlite/curandStatePhilox4_32_10_OpticksLite.h"
0029 
0030 
0031 using RNG0 = curandStateXORWOW ; 
0032 using RNG1 = curandStateXORWOW ; 
0033 using RNG2 = curandStatePhilox4_32_10 ; 
0034 using RNG3 = curandStatePhilox4_32_10_OpticksLite ; 
0035 
0036 
0037 /**
0038 _test_curand_uniform
0039 -----------------------
0040 
0041 **/
0042 
0043 
0044 
0045 struct KernelInfo
0046 {
0047     dim3 numBlocks ; 
0048     dim3 threadsPerBlock ; 
0049 
0050     int ni ; 
0051     int nj ; 
0052     float ms ;   // milliseconds (1e-3 s)
0053 
0054     int64_t dt0 ;  // us from start of process 
0055     int64_t t0 ; 
0056     int64_t t1 ; 
0057     int64_t et ; 
0058 
0059     double dt ;  // microseconds (1e-6 s)
0060     const char* name ; 
0061     void*    states ; 
0062     float*   dd ; 
0063 
0064     bool four_by_four ; 
0065     bool download ; 
0066 
0067     std::string desc() const ; 
0068 };
0069 
0070 std::string KernelInfo::desc() const
0071 {
0072     std::stringstream ss ; 
0073     ss 
0074         << " dt0 " << dt0 
0075         << " ms " << std::fixed << std::setw(10) << std::setprecision(6) << ms 
0076        // << " t0 " << sstamp::Format(t0) 
0077        // << " t1 " << sstamp::Format(t1) 
0078         << " [t1-t0;us] " << std::setw(8) << ( t1 - t0 )
0079         << " states " << ( states ? "YES" : "NO " ) 
0080         << " download " << ( download ? "YES" : "NO " ) 
0081         << " four_by_four " << ( four_by_four ? "YES" : "NO " ) 
0082         << " name " << ( name ? name : "-" ) 
0083         ; 
0084 
0085     std::string str = ss.str() ;
0086     return str ;  
0087 }
0088 
0089 
0090 
0091 
0092 template<typename T>
0093 __global__ void _test_curand_uniform(float* ff, int ni, int nj, T* states, bool four_by_four)
0094 {
0095     unsigned ix = blockIdx.x * blockDim.x + threadIdx.x;
0096 
0097     unsigned long long subsequence = ix ;    // follow approach of ~/o/qudarap/QCurandState.cu 
0098     unsigned long long seed = 0ull ; 
0099     unsigned long long offset = 0ull ; 
0100 
0101     T rng ; 
0102 
0103     if( states == nullptr )
0104     {
0105         curand_init( seed, subsequence, offset, &rng ); 
0106     }
0107     else
0108     {
0109         rng = states[subsequence] ;  
0110     }
0111 
0112     if(four_by_four)
0113     {
0114         int nk = nj/4 ;  
0115         for(int k=0 ; k < nk ; k++) 
0116         {
0117             float4 ans = curand_uniform4(&rng); 
0118             ff[4*(ix*nk+k)+0] = ans.x ;  
0119             ff[4*(ix*nk+k)+1] = ans.y ; 
0120             ff[4*(ix*nk+k)+2] = ans.z ; 
0121             ff[4*(ix*nk+k)+3] = ans.w ; 
0122         }
0123     }
0124     else
0125     {
0126         for(int j=0 ; j < nj ; j++) 
0127         {
0128             ff[ix*nj+j] = curand_uniform(&rng);  
0129         }
0130 
0131     }
0132 
0133 }
0134 
0135 void ConfigureLaunch(dim3& numBlocks, dim3& threadsPerBlock, unsigned width )
0136 { 
0137     threadsPerBlock.x = 1024 ; 
0138     threadsPerBlock.y = 1 ; 
0139     threadsPerBlock.z = 1 ; 
0140 
0141     numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x ; 
0142     numBlocks.y = 1 ; 
0143     numBlocks.z = 1 ; 
0144 }
0145 
0146 
0147 template<typename T>
0148 void test_curand_uniform(KernelInfo& ki )
0149 {
0150     ki.t0 = sstamp::Now(); 
0151 
0152     cudaEvent_t start, stop;
0153     cudaEventCreate(&start);
0154     cudaEventCreate(&stop);
0155 
0156     cudaEventRecord(start);
0157     _test_curand_uniform<T><<<ki.numBlocks,ki.threadsPerBlock>>>(ki.dd, ki.ni, ki.nj, (T*)ki.states, ki.four_by_four );  
0158     cudaEventRecord(stop);
0159     cudaEventSynchronize(stop);  // blocks CPU execution until the specified event is recorded
0160 
0161     ki.t1 = sstamp::Now(); 
0162 
0163     cudaEventElapsedTime(&ki.ms, start, stop);
0164 }
0165 
0166 
0167 int main()
0168 {
0169     int NI = U::GetEnvInt("NI", 1000); 
0170     int NJ = U::GetEnvInt("NJ", 16 ); 
0171 
0172     int64_t t0 = sstamp::Now(); 
0173 
0174     NP* h = NP::Make<float>( NI, NJ ) ; 
0175     float* hh = h->values<float>(); 
0176 
0177 
0178     NP::INT nv = h->num_values(); 
0179     float* dd = SCU_::device_alloc<float>( nv, "randoms") ; 
0180 
0181     int64_t t1 = sstamp::Now(); 
0182     std::cout << " t1 - t0 : output allocations [us] " << ( t1 - t0 ) << "\n" ; 
0183     
0184     SCurandState cs ; 
0185     //std::cout << cs.desc() << "\n" ; 
0186     RNG0* d0 = cs.loadAndUpload<RNG0>(NI) ; 
0187 
0188     int64_t t2 = sstamp::Now(); 
0189     std::cout << " t2 - t1 : loadAndUpload [us] " << ( t2 - t1 ) << "\n" ; 
0190   
0191     std::array<KernelInfo,16> kis; 
0192 
0193 
0194     for(int m=0 ; m < kis.size() ; m++ )
0195     {
0196         int m4 = m % 4 ; 
0197         int g4 = m / 4 ; 
0198 
0199         KernelInfo& ki = kis[m] ; 
0200         ConfigureLaunch(ki.numBlocks, ki.threadsPerBlock, NI ); 
0201         int64_t t2 = sstamp::Now();  
0202 
0203 
0204         ki.dt0 = t2 - t1 ; 
0205         ki.ni = NI ; 
0206         ki.nj = NJ ; 
0207         ki.states = m4 == 1 ? d0 : nullptr ; 
0208         //ki.download = g4 == 1 ? true : false ;  
0209         ki.download = false ;  
0210         ki.dd = dd ; 
0211         ki.four_by_four = g4 % 2 == 1 ;  
0212      
0213        
0214         switch(m4)
0215         {
0216            case 0: test_curand_uniform<RNG0>(ki); ki.name = srng<RNG0>::NAME ; break ; 
0217            case 1: test_curand_uniform<RNG1>(ki); ki.name = srng<RNG1>::NAME ; break ; 
0218            case 2: test_curand_uniform<RNG2>(ki); ki.name = srng<RNG2>::NAME ; break ; 
0219            case 3: test_curand_uniform<RNG2>(ki); ki.name = srng<RNG3>::NAME ; break ; 
0220         }
0221 
0222         if(m4 == 0 ) std::cout << "\n" ;  
0223         std::cout << ki.desc() << "\n" ;
0224 
0225         if(ki.download)
0226         {
0227             cudaMemcpy( hh, dd, h->arr_bytes(), cudaMemcpyDeviceToHost ) ; 
0228             cudaDeviceSynchronize();
0229 
0230             std::string path = sstr::Format_("$FOLD/RNG%d.npy", m ); 
0231             h->save(path.c_str()); 
0232         }
0233     }
0234 
0235     return 0 ; 
0236 }
0237 
0238