File indexing completed on 2026-04-09 07:49:13
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014 #include <cstdlib>
0015 #include <array>
0016 #include <chrono>
0017
0018 #include "NP.hh"
0019 #include "SCurandState.h"
0020
0021
0022 #include "scuda.h"
0023 #include "sstr.h"
0024 #include "srng.h"
0025 #include "sstamp.h"
0026
0027 #include "curand_kernel.h"
0028 #include "curandlite/curandStatePhilox4_32_10_OpticksLite.h"
0029
0030
0031 using RNG0 = curandStateXORWOW ;
0032 using RNG1 = curandStateXORWOW ;
0033 using RNG2 = curandStatePhilox4_32_10 ;
0034 using RNG3 = curandStatePhilox4_32_10_OpticksLite ;
0035
0036
0037
0038
0039
0040
0041
0042
0043
0044
0045 struct KernelInfo
0046 {
0047 dim3 numBlocks ;
0048 dim3 threadsPerBlock ;
0049
0050 int ni ;
0051 int nj ;
0052 float ms ;
0053
0054 int64_t dt0 ;
0055 int64_t t0 ;
0056 int64_t t1 ;
0057 int64_t et ;
0058
0059 double dt ;
0060 const char* name ;
0061 void* states ;
0062 float* dd ;
0063
0064 bool four_by_four ;
0065 bool download ;
0066
0067 std::string desc() const ;
0068 };
0069
0070 std::string KernelInfo::desc() const
0071 {
0072 std::stringstream ss ;
0073 ss
0074 << " dt0 " << dt0
0075 << " ms " << std::fixed << std::setw(10) << std::setprecision(6) << ms
0076
0077
0078 << " [t1-t0;us] " << std::setw(8) << ( t1 - t0 )
0079 << " states " << ( states ? "YES" : "NO " )
0080 << " download " << ( download ? "YES" : "NO " )
0081 << " four_by_four " << ( four_by_four ? "YES" : "NO " )
0082 << " name " << ( name ? name : "-" )
0083 ;
0084
0085 std::string str = ss.str() ;
0086 return str ;
0087 }
0088
0089
0090
0091
0092 template<typename T>
0093 __global__ void _test_curand_uniform(float* ff, int ni, int nj, T* states, bool four_by_four)
0094 {
0095 unsigned ix = blockIdx.x * blockDim.x + threadIdx.x;
0096
0097 unsigned long long subsequence = ix ;
0098 unsigned long long seed = 0ull ;
0099 unsigned long long offset = 0ull ;
0100
0101 T rng ;
0102
0103 if( states == nullptr )
0104 {
0105 curand_init( seed, subsequence, offset, &rng );
0106 }
0107 else
0108 {
0109 rng = states[subsequence] ;
0110 }
0111
0112 if(four_by_four)
0113 {
0114 int nk = nj/4 ;
0115 for(int k=0 ; k < nk ; k++)
0116 {
0117 float4 ans = curand_uniform4(&rng);
0118 ff[4*(ix*nk+k)+0] = ans.x ;
0119 ff[4*(ix*nk+k)+1] = ans.y ;
0120 ff[4*(ix*nk+k)+2] = ans.z ;
0121 ff[4*(ix*nk+k)+3] = ans.w ;
0122 }
0123 }
0124 else
0125 {
0126 for(int j=0 ; j < nj ; j++)
0127 {
0128 ff[ix*nj+j] = curand_uniform(&rng);
0129 }
0130
0131 }
0132
0133 }
0134
0135 void ConfigureLaunch(dim3& numBlocks, dim3& threadsPerBlock, unsigned width )
0136 {
0137 threadsPerBlock.x = 1024 ;
0138 threadsPerBlock.y = 1 ;
0139 threadsPerBlock.z = 1 ;
0140
0141 numBlocks.x = (width + threadsPerBlock.x - 1) / threadsPerBlock.x ;
0142 numBlocks.y = 1 ;
0143 numBlocks.z = 1 ;
0144 }
0145
0146
0147 template<typename T>
0148 void test_curand_uniform(KernelInfo& ki )
0149 {
0150 ki.t0 = sstamp::Now();
0151
0152 cudaEvent_t start, stop;
0153 cudaEventCreate(&start);
0154 cudaEventCreate(&stop);
0155
0156 cudaEventRecord(start);
0157 _test_curand_uniform<T><<<ki.numBlocks,ki.threadsPerBlock>>>(ki.dd, ki.ni, ki.nj, (T*)ki.states, ki.four_by_four );
0158 cudaEventRecord(stop);
0159 cudaEventSynchronize(stop);
0160
0161 ki.t1 = sstamp::Now();
0162
0163 cudaEventElapsedTime(&ki.ms, start, stop);
0164 }
0165
0166
0167 int main()
0168 {
0169 int NI = U::GetEnvInt("NI", 1000);
0170 int NJ = U::GetEnvInt("NJ", 16 );
0171
0172 int64_t t0 = sstamp::Now();
0173
0174 NP* h = NP::Make<float>( NI, NJ ) ;
0175 float* hh = h->values<float>();
0176
0177
0178 NP::INT nv = h->num_values();
0179 float* dd = SCU_::device_alloc<float>( nv, "randoms") ;
0180
0181 int64_t t1 = sstamp::Now();
0182 std::cout << " t1 - t0 : output allocations [us] " << ( t1 - t0 ) << "\n" ;
0183
0184 SCurandState cs ;
0185
0186 RNG0* d0 = cs.loadAndUpload<RNG0>(NI) ;
0187
0188 int64_t t2 = sstamp::Now();
0189 std::cout << " t2 - t1 : loadAndUpload [us] " << ( t2 - t1 ) << "\n" ;
0190
0191 std::array<KernelInfo,16> kis;
0192
0193
0194 for(int m=0 ; m < kis.size() ; m++ )
0195 {
0196 int m4 = m % 4 ;
0197 int g4 = m / 4 ;
0198
0199 KernelInfo& ki = kis[m] ;
0200 ConfigureLaunch(ki.numBlocks, ki.threadsPerBlock, NI );
0201 int64_t t2 = sstamp::Now();
0202
0203
0204 ki.dt0 = t2 - t1 ;
0205 ki.ni = NI ;
0206 ki.nj = NJ ;
0207 ki.states = m4 == 1 ? d0 : nullptr ;
0208
0209 ki.download = false ;
0210 ki.dd = dd ;
0211 ki.four_by_four = g4 % 2 == 1 ;
0212
0213
0214 switch(m4)
0215 {
0216 case 0: test_curand_uniform<RNG0>(ki); ki.name = srng<RNG0>::NAME ; break ;
0217 case 1: test_curand_uniform<RNG1>(ki); ki.name = srng<RNG1>::NAME ; break ;
0218 case 2: test_curand_uniform<RNG2>(ki); ki.name = srng<RNG2>::NAME ; break ;
0219 case 3: test_curand_uniform<RNG2>(ki); ki.name = srng<RNG3>::NAME ; break ;
0220 }
0221
0222 if(m4 == 0 ) std::cout << "\n" ;
0223 std::cout << ki.desc() << "\n" ;
0224
0225 if(ki.download)
0226 {
0227 cudaMemcpy( hh, dd, h->arr_bytes(), cudaMemcpyDeviceToHost ) ;
0228 cudaDeviceSynchronize();
0229
0230 std::string path = sstr::Format_("$FOLD/RNG%d.npy", m );
0231 h->save(path.c_str());
0232 }
0233 }
0234
0235 return 0 ;
0236 }
0237
0238