File indexing completed on 2026-04-09 07:49:13
0001
0002
0003
#include <array>
#include <cstdio>
#include <cstdlib>
#include "NP.hh"
#include "scuda.h"
0008
// Directory handed to NP::save : the FOLD environment variable when it is set, "/tmp" otherwise.
const char* FOLD = []() -> const char* { const char* dir = getenv("FOLD") ; return dir ? dir : "/tmp" ; }() ;
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
/**
_test_erfcinvf
---------------

Fills a row-major (ni, nj) float array with samples of the inverse
complementary error function, one row per thread.

Row ix holds:

* column 0 : u2 = 2*ix/(ni-1), spanning [0, 2] as ix runs over 0..ni-1
* column 1 : v = -sqrt(2)*erfcinvf(u2), which is the standard normal
  quantile evaluated at u2/2

Launch layout: 1D grid of 1D blocks, only the x dimensions are used.
The grid may hold more threads than ni (it is normally rounded up to a
whole number of blocks); surplus threads return without touching memory.

@param ff  device buffer with room for at least ni*nj floats
@param ni  number of rows; ni == 1 makes u2 evaluate 0/0 (NaN)
@param nj  row stride in floats; columns 0 and 1 are written, so nj >= 2 is needed
**/
__global__ void _test_erfcinvf(float* ff, int ni, int nj)
{
    unsigned ix = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: without this the threads beyond ni write past the end of ff.
    if( ni <= 0 || ix >= unsigned(ni) ) return ;

    float u2 = 2.f*float(ix)/float(ni-1) ;
    float v = -M_SQRT2f*erfcinvf(u2) ;

    ff[ix*nj+0] = u2 ;
    ff[ix*nj+1] = v ;
}
0035
/**
ConfigureLaunch
----------------

Sets up a 1D launch configuration covering *width* work items:
512 threads per block and enough blocks (rounded up) for every item
to get a thread. The y and z extents of both outputs are set to 1.

@param numBlocks        output grid dimensions
@param threadsPerBlock  output block dimensions
@param width            number of work items to cover
**/
void ConfigureLaunch(dim3& numBlocks, dim3& threadsPerBlock, unsigned width )
{
    const unsigned block_size = 512 ;

    threadsPerBlock = dim3( block_size, 1, 1 ) ;

    // ceiling division : a partially filled final block still needs launching
    numBlocks = dim3( (width + block_size - 1) / block_size, 1, 1 ) ;
}
0046
// Reports a failed CUDA runtime call on stderr; returns true when rc is cudaSuccess.
static bool test_erfcinvf_cuda_ok(cudaError_t rc, const char* what)
{
    if( rc == cudaSuccess ) return true ;
    fprintf(stderr, "//test_erfcinvf %s failed : %s\n", what, cudaGetErrorString(rc));
    return false ;
}

/**
test_erfcinvf
--------------

Runs the _test_erfcinvf kernel over ni=1000 rows of nj=2 floats, copies
the result back into a host NP array and saves it as
"erfcinvf_Test_cu.npy" under FOLD.

Every CUDA runtime call is checked. When any of them fails the error is
reported on stderr, the device buffer is still released and nothing is
saved, so a stale or garbage array is never written to disk.

NOTE(review): the NP array returned by NP::Make is not released here,
as its ownership rules are not visible from this file - confirm.
**/
void test_erfcinvf()
{
    int ni = 1000 ;
    int nj = 2 ;

    dim3 numBlocks ;
    dim3 threadsPerBlock ;
    ConfigureLaunch(numBlocks, threadsPerBlock, ni );

    printf("//test_erfcinvf \n");
    NP* h = NP::Make<float>( ni, nj ) ;
    int arr_bytes = h->arr_bytes() ;
    float* hh = h->values<float>();

    float* dd = nullptr ;
    if( !test_erfcinvf_cuda_ok( cudaMalloc(reinterpret_cast<void**>( &dd ), arr_bytes ), "cudaMalloc" )) return ;

    _test_erfcinvf<<<numBlocks,threadsPerBlock>>>(dd, ni, nj );

    // A kernel launch returns nothing itself : configuration errors only show up via cudaGetLastError,
    // execution errors at the following synchronizing calls.
    bool ok = test_erfcinvf_cuda_ok( cudaGetLastError(), "kernel launch" )
           && test_erfcinvf_cuda_ok( cudaMemcpy( hh, dd, arr_bytes, cudaMemcpyDeviceToHost ), "cudaMemcpy" )
           && test_erfcinvf_cuda_ok( cudaDeviceSynchronize(), "cudaDeviceSynchronize" ) ;

    // Release the device buffer on both the success and the failure path.
    test_erfcinvf_cuda_ok( cudaFree(dd), "cudaFree" );

    if( ok ) h->save(FOLD,"erfcinvf_Test_cu.npy");
}
// Program entry point : runs the single erfcinvf test and reports success to the caller.
int main()
{
    test_erfcinvf() ;

    return EXIT_SUCCESS ;
}
0076
0077