File indexing completed on 2026-04-10 07:49:45
0001
0002
0003
0004
0005
0006
0007
0008
0009 #include <cstdlib>
0010 #include <array>
0011 #include "NP.hh"
0012 #include "scuda.h"
0013
0014 #include "curand-done-right/curanddr.hxx"
0015
0016
0017
/**
 * _test_curanddr_uniform
 * -----------------------
 * Fills a device float buffer using curanddr::uniforms_into_buffer<4>.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per row index ix in [0, ni).
 * The grid is normally rounded up to a whole number of blocks, so threads with
 * ix >= ni exit without writing.
 *
 * Each participating thread makes nk = nj/4 calls, each targeting 4 consecutive
 * floats. Thread ix writes the range [4*nk*ix, 4*nk*(ix+1)), so rows are packed
 * with a stride of 4*nk floats. That equals nj only when nj is a multiple of 4;
 * otherwise the trailing floats of the ni*nj buffer are left unwritten.
 *
 * The generator call receives uint4{k,0,ix,0} and a third argument of 0.
 * NOTE(review): presumably these are the counter and key of the underlying
 * generator, making the output depend only on (ix, k) - confirm against
 * curanddr.hxx.
 *
 * @param ff  device buffer with room for at least ni*nj floats
 * @param ni  number of rows (threads that generate values)
 * @param nj  number of floats per row, consumed in groups of 4
 */
__global__ void _test_curanddr_uniform(float* ff, int ni, int nj)
{
    // Nothing to generate for an empty or degenerate shape.
    if( ni <= 0 || nj < 4 ) return ;

    const unsigned ix = blockIdx.x * blockDim.x + threadIdx.x ;

    // Guard the grid tail: surplus threads of the last block must not write
    // beyond the end of the allocation.
    if( ix >= static_cast<unsigned>(ni) ) return ;

    const unsigned nk = static_cast<unsigned>(nj) / 4u ;

    for(unsigned k = 0 ; k < nk ; k++)
    {
        // Widen before multiplying so the float offset cannot wrap in 32 bits.
        const size_t offset = 4u * ( static_cast<size_t>(ix) * nk + k ) ;
        float* ffk = ff + offset ;
        curanddr::uniforms_into_buffer<4>( ffk, uint4{k,0,ix,0}, 0 );
    }
}
0028
/**
 * ConfigureLaunch
 * ---------------
 * Sets up a 1D launch geometry with 512 threads per block and as many blocks
 * as are needed for the grid to span at least `width` threads (ceiling
 * division). The y and z extents of both outputs are set to 1.
 *
 * @param numBlocks        output : grid dimensions
 * @param threadsPerBlock  output : block dimensions
 * @param width            number of threads the launch must cover
 */
void ConfigureLaunch(dim3& numBlocks, dim3& threadsPerBlock, unsigned width )
{
    const unsigned blockSize = 512u ;
    const unsigned blockCount = ( width + blockSize - 1u ) / blockSize ;

    threadsPerBlock = dim3( blockSize, 1u, 1u ) ;
    numBlocks = dim3( blockCount, 1u, 1u ) ;
}
0039
/**
 * test_curanddr_check
 * --------------------
 * Reports a failed CUDA runtime call on stderr and terminates the test with a
 * failure exit status. Does nothing when err is cudaSuccess.
 */
static void test_curanddr_check( cudaError_t err, const char* what )
{
    if( err == cudaSuccess ) return ;
    fprintf( stderr, "//test_curanddr_uniform : %s failed : %s\n", what, cudaGetErrorString(err) );
    std::exit( EXIT_FAILURE );
}

/**
 * test_curanddr_uniform
 * ----------------------
 * Creates a (ni, nj) = (1000, 16) float array on the host, fills a device
 * buffer of the same byte size with the _test_curanddr_uniform kernel, copies
 * the result back into the host array and saves it to
 * $FOLD/curanddr_uniform_test.npy.
 *
 * Every CUDA runtime call is checked; any failure is reported and ends the
 * process with a non-zero exit status instead of saving unverified data.
 */
void test_curanddr_uniform()
{
    int ni = 1000 ;
    int nj = 16 ;

    dim3 numBlocks ;
    dim3 threadsPerBlock ;
    ConfigureLaunch(numBlocks, threadsPerBlock, ni );

    printf("//test_curanddr_uniform \n" );
    NP* h = NP::Make<float>( ni, nj ) ;
    int arr_bytes = h->arr_bytes() ;
    float* hh = h->values<float>();

    float* dd = nullptr ;
    test_curanddr_check( cudaMalloc(reinterpret_cast<void**>( &dd ), arr_bytes ), "cudaMalloc" );

    _test_curanddr_uniform<<<numBlocks,threadsPerBlock>>>(dd, ni, nj );

    // A kernel launch returns nothing: configuration errors are picked up with
    // cudaGetLastError, execution faults at the following synchronization.
    test_curanddr_check( cudaGetLastError(), "kernel launch" );
    test_curanddr_check( cudaDeviceSynchronize(), "cudaDeviceSynchronize" );

    test_curanddr_check( cudaMemcpy( hh, dd, arr_bytes, cudaMemcpyDeviceToHost ), "cudaMemcpy" );

    // The device buffer is no longer needed once the host copy is complete.
    test_curanddr_check( cudaFree( dd ), "cudaFree" );

    h->save("$FOLD/curanddr_uniform_test.npy");
}
/**
 * Program entry point : runs the single curanddr uniform test and reports
 * success to the caller.
 */
int main()
{
    test_curanddr_uniform() ;
    return EXIT_SUCCESS ;
}
0069
0070