sysrap/tests/curand_uniform_test.cu

0001 /**
0002 curand_uniform_test.cc
0003 =======================
0004
0005 ::
0006
0007     ~/o/sysrap/tests/curand_uniform_test.sh
0008
0009     // https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/
0010
0011 **/
0012
0013
0014 #include <cstdlib>
0015 #include <array>
0016 #include <chrono>
0017
0018 #include "NP.hh"
0019 #include "SCurandState.h"
0020 // HUH: nvcc ignoring "pragma once" from NP.hh NPU.hh with this combo, but macro guards OK
0021
0022 #include "scuda.h"
0023 #include "sstr.h"
0024 #include "srng.h"
0025 #include "sstamp.h"
0026
0027 #include "curand_kernel.h"
0028 #include "curandlite/curandStatePhilox4_32_10_OpticksLite.h"
0029
0030
0031 using RNG0 = curandStateXORWOW ;
0032 using RNG1 = curandStateXORWOW ;
0033 using RNG2 = curandStatePhilox4_32_10 ;
0034 using RNG3 = curandStatePhilox4_32_10_OpticksLite ;
0035
0036
0037 /**
0038 _test_curand_uniform
0039 -----------------------
0040
0041 **/
0042
0043
0044
0045 struct KernelInfo
0046 {
0047     dim3 numBlocks ;
0048     dim3 threadsPerBlock ;
0049
0050     int ni ;
0051     int nj ;
0052     float ms ;   // milliseconds (1e-3 s)
0053
0054     int64_t dt0 ;  // us from start of process
0055     int64_t t0 ;
0056     int64_t t1 ;
0057     int64_t et ;
0058
0059     double dt ;  // microseconds (1e-6 s)
0060     const char* name ;
0061     void*    states ;
0062     float*   dd ;
0063
0064     bool four_by_four ;
0065     bool download ;
0066
0067     std::string desc() const ;
0068 };
0069
0070 std::string KernelInfo::desc() const
0071 {
0072     std::stringstream ss ;
0073     ss
0074         << " dt0 " << dt0
0075         << " ms " << std::fixed << std::setw(10) << std::setprecision(6) << ms
0076        // << " t0 " << sstamp::Format(t0)
0077        // << " t1 " << sstamp::Format(t1)
0078         << " [t1-t0;us] " << std::setw(8) << ( t1 - t0 )
0079         << " states " << ( states ? "YES" : "NO " )
0080         << " download " << ( download ? "YES" : "NO " )
0081         << " four_by_four " << ( four_by_four ? "YES" : "NO " )
0082         << " name " << ( name ? name : "-" )
0083         ;
0084
0085     std::string str = ss.str() ;
0086     return str ;
0087 }
0088
0089
0090
0091
0092 template<typename T>
0093 __global__ void _test_curand_uniform(float* ff, int ni, int nj, T* states, bool four_by_four)
0094 {
0095     unsigned ix = blockIdx.x * blockDim.x + threadIdx.x;
0096
0097     unsigned long long subsequence = ix ;    // follow approach of ~/o/qudarap/QCurandState.cu
0098     unsigned long long seed = 0ull ;
0099     unsigned long long offset = 0ull ;
0100
0101     T rng ;
0102
0103     if( states == nullptr )
0104     {
0105         curand_init( seed, subsequence, offset, &rng );
0106     }
0107     else
0108     {
0109         rng = states[subsequence] ;
0110     }
0111
0112     if(four_by_four)
0113     {
0114         int nk = nj/4 ;
0115         for(int k=0 ; k < nk ; k++)
0116         {
0117             float4 ans = curand_uniform4(&rng);
0118             ff[4*(ix*nk+k)+0] = ans.x ;
0119             ff[4*(ix*nk+k)+1] = ans.y ;
0120             ff[4*(ix*nk+k)+2] = ans.z ;
0121             ff[4*(ix*nk+k)+3] = ans.w ;
0122         }
0123     }
0124     else
0125     {
0126         for(int j=0 ; j < nj ; j++)
0127         {
0128             ff[ix*nj+j] = curand_uniform(&rng);
0129         }
0130
0131     }
0132
0133 }
0134
0135 void ConfigureLaunch(dim3& numBlocks, dim3& threadsPerBlock, unsigned width )
0136 {
0137     threadsPerBlock.x = 1024 ;
0138     threadsPerBlock.y = 1 ;
0139     threadsPerBlock.z = 1 ;
0140
0141     numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x ;
0142     numBlocks.y = 1 ;
0143     numBlocks.z = 1 ;
0144 }
0145
0146
0147 template<typename T>
0148 void test_curand_uniform(KernelInfo& ki )
0149 {
0150     ki.t0 = sstamp::Now();
0151
0152     cudaEvent_t start, stop;
0153     cudaEventCreate(&start);
0154     cudaEventCreate(&stop);
0155
0156     cudaEventRecord(start);
0157     _test_curand_uniform<T><<<ki.numBlocks,ki.threadsPerBlock>>>(ki.dd, ki.ni, ki.nj, (T*)ki.states, ki.four_by_four );
0158     cudaEventRecord(stop);
0159     cudaEventSynchronize(stop);  // blocks CPU execution until the specified event is recorded
0160
0161     ki.t1 = sstamp::Now();
0162
0163     cudaEventElapsedTime(&ki.ms, start, stop);
0164 }
0165
0166
0167 int main()
0168 {
0169     int NI = U::GetEnvInt("NI", 1000);
0170     int NJ = U::GetEnvInt("NJ", 16 );
0171
0172     int64_t t0 = sstamp::Now();
0173
0174     NP* h = NP::Make<float>( NI, NJ ) ;
0175     float* hh = h->values<float>();
0176
0177
0178     NP::INT nv = h->num_values();
0179     float* dd = SCU_::device_alloc<float>( nv, "randoms") ;
0180
0181     int64_t t1 = sstamp::Now();
0182     std::cout << " t1 - t0 : output allocations [us] " << ( t1 - t0 ) << "\n" ;
0183
0184     SCurandState cs ;
0185     //std::cout << cs.desc() << "\n" ;
0186     RNG0* d0 = cs.loadAndUpload<RNG0>(NI) ;
0187
0188     int64_t t2 = sstamp::Now();
0189     std::cout << " t2 - t1 : loadAndUpload [us] " << ( t2 - t1 ) << "\n" ;
0190
0191     std::array<KernelInfo,16> kis;
0192
0193
0194     for(int m=0 ; m < kis.size() ; m++ )
0195     {
0196         int m4 = m % 4 ;
0197         int g4 = m / 4 ;
0198
0199         KernelInfo& ki = kis[m] ;
0200         ConfigureLaunch(ki.numBlocks, ki.threadsPerBlock, NI );
0201         int64_t t2 = sstamp::Now();
0202
0203
0204         ki.dt0 = t2 - t1 ;
0205         ki.ni = NI ;
0206         ki.nj = NJ ;
0207         ki.states = m4 == 1 ? d0 : nullptr ;
0208         //ki.download = g4 == 1 ? true : false ;
0209         ki.download = false ;
0210         ki.dd = dd ;
0211         ki.four_by_four = g4 % 2 == 1 ;
0212
0213
0214         switch(m4)
0215         {
0216            case 0: test_curand_uniform<RNG0>(ki); ki.name = srng<RNG0>::NAME ; break ;
0217            case 1: test_curand_uniform<RNG1>(ki); ki.name = srng<RNG1>::NAME ; break ;
0218            case 2: test_curand_uniform<RNG2>(ki); ki.name = srng<RNG2>::NAME ; break ;
0219            case 3: test_curand_uniform<RNG2>(ki); ki.name = srng<RNG3>::NAME ; break ;
0220         }
0221
0222         if(m4 == 0 ) std::cout << "\n" ;
0223         std::cout << ki.desc() << "\n" ;
0224
0225         if(ki.download)
0226         {
0227             cudaMemcpy( hh, dd, h->arr_bytes(), cudaMemcpyDeviceToHost ) ;
0228             cudaDeviceSynchronize();
0229
0230             std::string path = sstr::Format_("$FOLD/RNG%d.npy", m );
0231             h->save(path.c_str());
0232         }
0233     }
0234
0235     return 0 ;
0236 }
0237
0238