sysrap/tests/SIMGStandaloneTest.cu

0001 // ~/o/sysrap/tests/SIMGStandaloneTest.sh
0002
0003 #include <vector_types.h>
0004 #include <vector_functions.h>
0005 #include <cuda_runtime.h>
0006
0007 #include <iostream>
0008 #define SIMG_IMPLEMENTATION 1
0009 #include "SIMG.h"
0010
0011 // https://stackoverflow.com/questions/14901491/cudamemcpytoarray/14929827#14929827
0012
0013 #include <stdio.h>
// cudaCheckErrors(msg)
//
// Fetches (and thereby clears) the most recent CUDA runtime error with
// cudaGetLastError(). When that is anything other than cudaSuccess the macro
// prints `msg`, the CUDA error string and the source location of the macro
// invocation to stderr, then terminates the process with exit status 1.
//
// Intended use is directly after a CUDA runtime call or a kernel launch:
//
//     cudaMallocArray(...);
//     cudaCheckErrors("cudaMallocArray");
//
// `msg` must be usable as a "%s" argument (a C string).
// The do { } while (0) wrapper makes the expansion behave as a single statement.
// The local is deliberately NOT spelled with a leading double underscore:
// such identifiers are reserved for the implementation.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                (msg), cudaGetErrorString(cuda_check_err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
0025
0026
0027
// colorKernel : debugging kernel that paints every pixel opaque red.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel, with
//   x = blockIdx.x * blockDim.x + threadIdx.x   (column)
//   y = blockIdx.y * blockDim.y + threadIdx.y   (row)
// The grid may overshoot the image (ceil-division on the host); threads that
// fall outside width x height return without writing.
//
// output : device buffer of at least width*height uchar4, indexed row-major
// texObj : unused here, accepted so the argument list matches transformKernel
// width  : image width in pixels
// height : image height in pixels
// theta  : unused here, accepted so the argument list matches transformKernel
__global__ void colorKernel(uchar4* output, cudaTextureObject_t texObj, int width, int height, float theta) {
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: without this, threads beyond the right/bottom edge
    // would overwrite other pixels or write past the end of the buffer.
    if (x >= (unsigned int)width || y >= (unsigned int)height) return;

    //if( x % 1000 == 0 ) printf("x %d y %d \n", x, y );
    //if( x == 1000 ) printf("x %d y %d \n", x, y );

    // size_t index avoids 32-bit wrap-around for very large images
    output[(size_t)y * width + x] = make_uchar4( 255u, 0u, 0u, 255u );
}
0037
0038
0039
// transformKernel : writes a copy of the texture rotated by theta about the image center.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel, with
//   x = blockIdx.x * blockDim.x + threadIdx.x   (column)
//   y = blockIdx.y * blockDim.y + threadIdx.y   (row)
// The grid may overshoot the image (ceil-division on the host); threads that
// fall outside width x height return without reading or writing.
//
// output : device buffer of at least width*height uchar4, indexed row-major
// texObj : texture object fetched as uchar4 with (u,v) coordinates built from
//          x/width and y/height, so it must use normalized coordinates
// width  : image width in pixels
// height : image height in pixels
// theta  : rotation angle in radians (passed to cosf/sinf)
//
// The alpha channel of every written pixel is forced to 255.
__global__ void transformKernel(uchar4* output, cudaTextureObject_t texObj, int width, int height, float theta) {
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the grid tail: without this, threads beyond the right/bottom edge
    // would overwrite other pixels or write past the end of the buffer.
    if (x >= (unsigned int)width || y >= (unsigned int)height) return;

    float u = x / (float) width;  // 0. -> 1.
    float v = y / (float) height;

    // shift origin to center of image
    u -= 0.5f;                   //  -0.5 -> 0.5
    v -= 0.5f;

    // rotate around the center; evaluate the trig functions once each
    const float cosTheta = cosf(theta);
    const float sinTheta = sinf(theta);
    const float tu = u * cosTheta - v * sinTheta ;
    const float tv = v * cosTheta + u * sinTheta ;

    // read from the texture, shifting the origin back to the corner
    uchar4 c = tex2D<uchar4>(texObj, tu+0.5f, tv+0.5f);

    //if( c.x != 0 ) printf(" c ( %d %d %d %d ) \n",c.x, c.y, c.z, c.w );
    //c.x = 255u ;
    c.w = 255u ;   // force fully opaque output

    // size_t index avoids 32-bit wrap-around for very large images
    output[(size_t)y * width + x] = c ;
}
0064
// Standalone test: load an RGBA image, upload it into a CUDA array backed
// texture, run transformKernel to write a rotated copy, download the result
// and save it as a PNG.
//
//   argv[1] : input image path   (default /tmp/i.png)
//   argv[2] : output PNG path    (default /tmp/o.png)
//
// Returns 0 on success and 1 when the input image is unusable. Any CUDA
// runtime failure is reported by cudaCheckErrors, which exits with status 1.
int main(int argc, char** argv)
{
    const char* ipath = argc > 1 ? argv[1] : "/tmp/i.png" ;
    const char* opath = argc > 2 ? argv[2] : "/tmp/o.png" ;

    SIMG img(ipath);
    std::cout << img.desc() << std::endl ;

    // Explicit runtime checks rather than assert: an assert vanishes in NDEBUG
    // builds and the uchar4 upload below would then read beyond the host buffer.
    // NOTE(review): SIMG may already reject unreadable files itself - confirm.
    if( img.data == NULL || img.width <= 0 || img.height <= 0 )
    {
        std::cerr << "Fatal error: no image data loaded from " << ipath << std::endl ;
        return 1 ;
    }
    if( img.channels != 4 )
    {
        std::cerr << "Fatal error: expecting 4 channels, got " << img.channels << " from " << ipath << std::endl ;
        return 1 ;
    }

    // Do all size arithmetic in size_t so large images cannot overflow int
    const size_t width     = img.width ;
    const size_t height    = img.height ;
    const size_t rowBytes  = width * sizeof(uchar4) ;
    const size_t numPixels = width * height ;

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar4>();

    cudaArray *cuArray = NULL ;
    cudaMallocArray(&cuArray, &channelDesc, width, height );
    cudaCheckErrors("cudaMallocArray");

    // cudaMemcpyToArray is deprecated; cudaMemcpy2DToArray is the replacement.
    // The host rows are treated as tightly packed (source pitch == rowBytes),
    // the same layout the former single contiguous copy relied on.
    cudaMemcpy2DToArray(cuArray, 0, 0, img.data, rowBytes, rowBytes, height, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy2DToArray");

    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;

    // https://docs.nvidia.com/cuda/cuda-runtime-api/structcudaTextureDesc.html
    struct cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = cudaAddressModeWrap;
    texDesc.addressMode[1] = cudaAddressModeWrap;

    //texDesc.filterMode = cudaFilterModeLinear;
    texDesc.filterMode = cudaFilterModePoint;    // switch off interpolation, as that gives error with non-float texture
    // With cudaFilterModeLinear the run previously failed with:
    // Fatal error: cudaDeviceSynchronize (linear filtering not supported for non-float type at SIMGStandaloneTest.cu:123)

    texDesc.readMode = cudaReadModeElementType;  // return data of the type of the underlying buffer
    texDesc.normalizedCoords = 1 ;            // addressing into the texture with floats in range 0:1

    // Create texture object
    cudaTextureObject_t texObj = 0;
    cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
    cudaCheckErrors("cudaCreateTextureObject");

    // Allocate result of transformation in device memory
    uchar4* d_output = NULL ;
    cudaMalloc(&d_output, numPixels * sizeof(uchar4));
    cudaCheckErrors("cudaMalloc");

    // One thread per pixel; grid dimensions are rounded up to whole blocks
    dim3 dimBlock(16, 16);
    dim3 dimGrid((img.width + dimBlock.x - 1) / dimBlock.x, (img.height + dimBlock.y - 1) / dimBlock.y);

    float theta = 1.f ;   // rotation angle in radians

    //colorKernel<<<dimGrid, dimBlock>>>(d_output, texObj, img.width, img.height, theta );
    transformKernel<<<dimGrid, dimBlock>>>(d_output, texObj, img.width, img.height, theta );
    cudaCheckErrors("transformKernel launch");   // launch configuration errors
    cudaDeviceSynchronize();
    cudaCheckErrors("cudaDeviceSynchronize");    // errors raised while the kernel ran

    // Blocking copy of the result back to the host
    uchar4* output = new uchar4[numPixels] ;
    cudaMemcpy(output, d_output, numPixels * sizeof(uchar4), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy");

    std::cout << "writing to " << opath << std::endl ;

    {
        // Scoped so that img2 never outlives the pixel buffer it is handed
        SIMG img2(img.width, img.height, img.channels, (unsigned char*)output );
        img2.writePNG(opath);
    }

    cudaDestroyTextureObject(texObj);
    cudaFreeArray(cuArray);

    delete[] output ;
    cudaFree(d_output);

    return 0;
}
0138