Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-09 07:49:33

0001 #pragma once
0002 /**
0003 SCUDA_OutputBuffer.h : Allows an OpenGL PBO buffer to be accessed from CUDA
0004 ============================================================================
0005 
0006 Adapted from SDK/CUDAOutputBuffer.h
0007 Include this after OpenGL headers.
0008 
0009 **/
0010 
0011 #include <cuda_runtime.h>
0012 #include <cuda_gl_interop.h>
0013 
0014 #include "CUDA_CHECK.h"
0015 
0016 #include <iostream>
0017 #include <vector>
0018 
0019 
0020 enum class SCUDA_OutputBufferType
0021 {
0022     CUDA_DEVICE = 0, // not preferred, typically slower than ZERO_COPY
0023     GL_INTEROP  = 1, // single device only, preferred for single device
0024     ZERO_COPY   = 2, // general case, preferred for multi-gpu if not fully nvlink connected
0025     CUDA_P2P    = 3  // fully connected only, preferred for fully nvlink connected
0026 };
0027 
0028 typedef cudaStream_t CUstream ; // why I need to do this ? missing some header ? mixing APIs ?
0029 
0030 template <typename PIXEL_FORMAT>
0031 class SCUDA_OutputBuffer
0032 {
0033 public:
0034     SCUDA_OutputBuffer( SCUDA_OutputBufferType type, int32_t width, int32_t height );
0035     ~SCUDA_OutputBuffer();
0036 
0037     void setDevice( int32_t device_idx ) { m_device_idx = device_idx; }
0038     void setStream( CUstream stream    ) { m_stream     = stream;     }
0039 
0040     void resize( int32_t width, int32_t height );
0041 
0042     // Allocate or update device pointer as necessary for CUDA access
0043     PIXEL_FORMAT* map();
0044     void unmap();
0045     std::string desc() const ;
0046 
0047     int32_t        width() const  { return m_width;  }
0048     int32_t        height() const { return m_height; }
0049 
0050 
0051     // Get output buffer
0052     GLuint         getPBO();
0053     void           deletePBO();
0054     PIXEL_FORMAT*  getHostPointer();
0055 
0056 private:
0057     void makeCurrent() { CUDA_CHECK( cudaSetDevice( m_device_idx ) ); }
0058 
0059     SCUDA_OutputBufferType     m_type;
0060     int32_t                    m_width             = 0u;
0061     int32_t                    m_height            = 0u;
0062 
0063     cudaGraphicsResource*      m_cuda_gfx_resource = nullptr;
0064     GLuint                     m_pbo               = 0u;
0065     PIXEL_FORMAT*              m_device_pixels     = nullptr;
0066     PIXEL_FORMAT*              m_host_zcopy_pixels = nullptr;
0067     std::vector<PIXEL_FORMAT>  m_host_pixels;
0068 
0069     CUstream                   m_stream            = 0u;
0070     int32_t                    m_device_idx        = 0;
0071 };
0072 
0073 
0074 template <typename PIXEL_FORMAT>
0075 inline SCUDA_OutputBuffer<PIXEL_FORMAT>::SCUDA_OutputBuffer( SCUDA_OutputBufferType type, int32_t width, int32_t height )
0076     :
0077     m_type( type )
0078 {
0079     // If using GL Interop, expect that the active device is also the display device.
0080     if( m_type == SCUDA_OutputBufferType::GL_INTEROP )
0081     {
0082         int current_device, is_display_device;
0083         CUDA_CHECK( cudaGetDevice( &current_device ) );
0084 
0085         CUDA_CHECK( cudaDeviceGetAttribute( &is_display_device, cudaDevAttrKernelExecTimeout, current_device ) );
0086 
0087         const char* xdg_session_type = getenv("XDG_SESSION_TYPE");
0088         if( xdg_session_type != nullptr && strcmp(xdg_session_type, "wayland") == 0)
0089         {
0090             is_display_device = 1;   // determining is_display_device from timeout seems not to work with Xwayland
0091         }
0092 
0093         if( !is_display_device ) std::cerr
0094               << "SCUDA_OutputBuffer::SCUDA_OutputBuffer GL interop is only available on display device \n"
0095               << "  xdg_session_type [" << ( xdg_session_type ? xdg_session_type : "-'" ) << "\n" ;
0096         assert( is_display_device );
0097     }
0098     resize( width, height );
0099 }
0100 
0101 
0102 template <typename PIXEL_FORMAT>
0103 SCUDA_OutputBuffer<PIXEL_FORMAT>::~SCUDA_OutputBuffer()
0104 {
0105     try
0106     {
0107         makeCurrent();
0108         if( m_type == SCUDA_OutputBufferType::CUDA_DEVICE || m_type == SCUDA_OutputBufferType::CUDA_P2P )
0109         {
0110             CUDA_CHECK( cudaFree( reinterpret_cast<void*>( m_device_pixels ) ) );
0111         }
0112         else if( m_type == SCUDA_OutputBufferType::ZERO_COPY )
0113         {
0114             CUDA_CHECK( cudaFreeHost( reinterpret_cast<void*>( m_host_zcopy_pixels ) ) );
0115         }
0116         else if( m_type == SCUDA_OutputBufferType::GL_INTEROP || m_type == SCUDA_OutputBufferType::CUDA_P2P )
0117         {
0118             CUDA_CHECK( cudaGraphicsUnregisterResource( m_cuda_gfx_resource ) );
0119         }
0120 
0121         if( m_pbo != 0u )
0122         {
0123             GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, 0 ) );
0124             GL_CHECK( glDeleteBuffers( 1, &m_pbo ) );
0125         }
0126     }
0127     catch(std::exception& e )
0128     {
0129         std::cerr << "SCUDA_OutputBuffer destructor caught exception: " << e.what() << std::endl;
0130     }
0131 }
0132 
0133 /**
0134 SCUDA_OutputBuffer::resize
0135 --------------------------
0136 
0137 In interop mode sets:
0138 
0139 1. m_pbo : reference to "PBO" GL_ARRAY_BUFFER
0140 2. m_cuda_gfx_resource : reference resulting from registering as CUDA graphics resource
0141 
0142 **/
0143 
0144 
0145 template <typename PIXEL_FORMAT>
0146 void SCUDA_OutputBuffer<PIXEL_FORMAT>::resize( int32_t width, int32_t height )
0147 {
0148     if( width < 1 ) width = 1 ;
0149     if( height < 1 ) height = 1 ;
0150 
0151     if( m_width == width && m_height == height )
0152         return;
0153 
0154     m_width  = width;
0155     m_height = height;
0156 
0157     makeCurrent();
0158 
0159     if( m_type == SCUDA_OutputBufferType::CUDA_DEVICE || m_type == SCUDA_OutputBufferType::CUDA_P2P )
0160     {
0161         CUDA_CHECK( cudaFree( reinterpret_cast<void*>( m_device_pixels ) ) );
0162         CUDA_CHECK( cudaMalloc(
0163                     reinterpret_cast<void**>( &m_device_pixels ),
0164                     m_width*m_height*sizeof(PIXEL_FORMAT)
0165                     ) );
0166 
0167     }
0168 
0169     if( m_type == SCUDA_OutputBufferType::GL_INTEROP || m_type == SCUDA_OutputBufferType::CUDA_P2P )
0170     {
0171         // GL buffer gets resized below
0172         GL_CHECK( glGenBuffers( 1, &m_pbo ) );
0173         GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, m_pbo ) );
0174         GL_CHECK( glBufferData( GL_ARRAY_BUFFER, sizeof(PIXEL_FORMAT)*m_width*m_height, nullptr, GL_STREAM_DRAW ) );
0175         GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, 0u ) );
0176 
0177         CUDA_CHECK( cudaGraphicsGLRegisterBuffer(
0178                     &m_cuda_gfx_resource,
0179                     m_pbo,
0180                     cudaGraphicsMapFlagsWriteDiscard
0181                     ) );
0182     }
0183 
0184     if( m_type == SCUDA_OutputBufferType::ZERO_COPY )
0185     {
0186         CUDA_CHECK( cudaFreeHost( reinterpret_cast<void*>( m_host_zcopy_pixels ) ) );
0187         CUDA_CHECK( cudaHostAlloc(
0188                     reinterpret_cast<void**>( &m_host_zcopy_pixels ),
0189                     m_width*m_height*sizeof(PIXEL_FORMAT),
0190                     cudaHostAllocPortable | cudaHostAllocMapped
0191                     ) );
0192         CUDA_CHECK( cudaHostGetDevicePointer(
0193                     reinterpret_cast<void**>( &m_device_pixels ),
0194                     reinterpret_cast<void*>( m_host_zcopy_pixels ),
0195                     0 /*flags*/
0196                     ) );
0197     }
0198 
0199     if( m_type != SCUDA_OutputBufferType::GL_INTEROP && m_type != SCUDA_OutputBufferType::CUDA_P2P && m_pbo != 0u )
0200     {
0201         GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, m_pbo ) );
0202         GL_CHECK( glBufferData( GL_ARRAY_BUFFER, sizeof(PIXEL_FORMAT)*m_width*m_height, nullptr, GL_STREAM_DRAW ) );
0203         GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, 0u ) );
0204     }
0205 
0206     if( !m_host_pixels.empty() )
0207         m_host_pixels.resize( m_width*m_height );
0208 }
0209 
0210 /**
0211 SCUDA_OutputBuffer::map
0212 -----------------------
0213 
0214 In interop mode sets and returns m_device_pixels pointer
0215 allowing CUDA to write to the underlying graphics "PBO" buffer.
0216 
0217 **/
0218 
0219 template <typename PIXEL_FORMAT>
0220 PIXEL_FORMAT* SCUDA_OutputBuffer<PIXEL_FORMAT>::map()
0221 {
0222     if( m_type == SCUDA_OutputBufferType::CUDA_DEVICE || m_type == SCUDA_OutputBufferType::CUDA_P2P )
0223     {
0224         // nothing needed
0225     }
0226     else if( m_type == SCUDA_OutputBufferType::GL_INTEROP  )
0227     {
0228         makeCurrent();
0229 
0230         size_t buffer_size = 0u;
0231         CUDA_CHECK( cudaGraphicsMapResources ( 1, &m_cuda_gfx_resource, m_stream ) );
0232         CUDA_CHECK( cudaGraphicsResourceGetMappedPointer(
0233                     reinterpret_cast<void**>( &m_device_pixels ),
0234                     &buffer_size,
0235                     m_cuda_gfx_resource
0236                     ) );
0237     }
0238     else // m_type == SCUDA_OutputBufferType::ZERO_COPY
0239     {
0240         // nothing needed
0241     }
0242 
0243     return m_device_pixels;
0244 }
0245 
0246 /**
0247 SCUDA_OutputBuffer::unmap
0248 --------------------------
0249 
0250 Relinquishes CUDA access to the PBO graphics buffer, allowing
0251 subequent rendering with OpenGL.
0252 
0253 **/
0254 
0255 template <typename PIXEL_FORMAT>
0256 void SCUDA_OutputBuffer<PIXEL_FORMAT>::unmap()
0257 {
0258     makeCurrent();
0259 
0260     if( m_type == SCUDA_OutputBufferType::CUDA_DEVICE || m_type == SCUDA_OutputBufferType::CUDA_P2P )
0261     {
0262         CUDA_CHECK( cudaStreamSynchronize( m_stream ) );
0263     }
0264     else if( m_type == SCUDA_OutputBufferType::GL_INTEROP  )
0265     {
0266         CUDA_CHECK( cudaGraphicsUnmapResources ( 1, &m_cuda_gfx_resource,  m_stream ) );
0267     }
0268     else // m_type == SCUDA_OutputBufferType::ZERO_COPY
0269     {
0270         CUDA_CHECK( cudaStreamSynchronize( m_stream ) );
0271     }
0272 }
0273 
0274 
0275 template <typename PIXEL_FORMAT>
0276 std::string SCUDA_OutputBuffer<PIXEL_FORMAT>::desc() const
0277 {
0278     std::stringstream ss ;
0279     ss << "SCUDA_OutputBuffer::desc"
0280        << std::endl
0281        << " int(m_type) " << int(m_type)
0282        << std::endl
0283        << " m_width " << m_width
0284        << std::endl
0285        << " m_height " << m_height
0286        << std::endl
0287        << " m_cuda_gfx_resource " << ( m_cuda_gfx_resource ? "YES" : "NO " )
0288        << std::endl
0289        << " m_pbo " << m_pbo
0290        << std::endl
0291        << " m_device_pixels " << ( m_device_pixels ? "YES" : "NO " )
0292        << std::endl
0293        << " m_host_zcopy_pixels " << ( m_host_zcopy_pixels ? "YES" : "NO " )
0294        << std::endl
0295        << " m_host_pixels.size " << m_host_pixels.size()
0296        << std::endl
0297        << " m_stream " << m_stream
0298        << std::endl
0299        << " m_device_idx " << m_device_idx
0300        << std::endl
0301        ;
0302     std::string str = ss.str();
0303     return str ;
0304 }
0305 
0306 /**
0307 SCUDA_OutputBuffer::getPBO
0308 ---------------------------
0309 
0310 In interop mode just returns m_pbo
0311 
0312 **/
0313 
0314 template <typename PIXEL_FORMAT>
0315 GLuint SCUDA_OutputBuffer<PIXEL_FORMAT>::getPBO()
0316 {
0317     if( m_pbo == 0u )
0318         GL_CHECK( glGenBuffers( 1, &m_pbo ) );
0319 
0320     const size_t buffer_size = m_width*m_height*sizeof(PIXEL_FORMAT);
0321 
0322     if( m_type == SCUDA_OutputBufferType::CUDA_DEVICE )
0323     {
0324         // We need a host buffer to act as a way-station
0325         if( m_host_pixels.empty() )
0326             m_host_pixels.resize( m_width*m_height );
0327 
0328         makeCurrent();
0329         CUDA_CHECK( cudaMemcpy(
0330                     static_cast<void*>( m_host_pixels.data() ),
0331                     m_device_pixels,
0332                     buffer_size,
0333                     cudaMemcpyDeviceToHost
0334                     ) );
0335 
0336         GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, m_pbo ) );
0337         GL_CHECK( glBufferData(
0338                     GL_ARRAY_BUFFER,
0339                     buffer_size,
0340                     static_cast<void*>( m_host_pixels.data() ),
0341                     GL_STREAM_DRAW
0342                     ) );
0343         GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, 0 ) );
0344     }
0345     else if( m_type == SCUDA_OutputBufferType::GL_INTEROP  )
0346     {
0347         // Nothing needed
0348     }
0349     else if ( m_type == SCUDA_OutputBufferType::CUDA_P2P )
0350     {
0351         makeCurrent();
0352         void* pbo_buff = nullptr;
0353         size_t dummy_size = 0;
0354 
0355         CUDA_CHECK( cudaGraphicsMapResources( 1, &m_cuda_gfx_resource, m_stream ) );
0356         CUDA_CHECK( cudaGraphicsResourceGetMappedPointer( &pbo_buff, &dummy_size, m_cuda_gfx_resource ) );
0357         CUDA_CHECK( cudaMemcpy( pbo_buff, m_device_pixels, buffer_size, cudaMemcpyDeviceToDevice ) );
0358         CUDA_CHECK( cudaGraphicsUnmapResources( 1, &m_cuda_gfx_resource, m_stream ) );
0359     }
0360     else // m_type == SCUDA_OutputBufferType::ZERO_COPY
0361     {
0362         GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, m_pbo ) );
0363         GL_CHECK( glBufferData(
0364                     GL_ARRAY_BUFFER,
0365                     buffer_size,
0366                     static_cast<void*>( m_host_zcopy_pixels ),
0367                     GL_STREAM_DRAW
0368                     ) );
0369         GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, 0 ) );
0370     }
0371 
0372     return m_pbo;
0373 }
0374 
0375 template <typename PIXEL_FORMAT>
0376 void SCUDA_OutputBuffer<PIXEL_FORMAT>::deletePBO()
0377 {
0378     GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, 0 ) );
0379     GL_CHECK( glDeleteBuffers( 1, &m_pbo ) );
0380     m_pbo = 0;
0381 }
0382 
0383 /**
0384 SCUDA_OutputBuffer::getHostPointer
0385 ----------------------------------
0386 
0387 In all modes other than ZERO_COPY resizes the m_host_pixels vector
0388 can downloads the mapped graphics device buffer contents to the host vector.
0389 In ZERO_COPY just returns m_host_zcopy_pixels.
0390 
0391 **/
0392 
0393 template <typename PIXEL_FORMAT>
0394 PIXEL_FORMAT* SCUDA_OutputBuffer<PIXEL_FORMAT>::getHostPointer()
0395 {
0396     if( m_type == SCUDA_OutputBufferType::CUDA_DEVICE ||
0397         m_type == SCUDA_OutputBufferType::CUDA_P2P ||
0398         m_type == SCUDA_OutputBufferType::GL_INTEROP  )
0399     {
0400         m_host_pixels.resize( m_width*m_height );
0401 
0402         makeCurrent();
0403         CUDA_CHECK( cudaMemcpy(
0404                     static_cast<void*>( m_host_pixels.data() ),
0405                     map(),
0406                     m_width*m_height*sizeof(PIXEL_FORMAT),
0407                     cudaMemcpyDeviceToHost
0408                     ) );
0409         unmap();
0410 
0411         return m_host_pixels.data();
0412     }
0413     else // m_type == SCUDA_OutputBufferType::ZERO_COPY
0414     {
0415         return m_host_zcopy_pixels;
0416     }
0417 }
0418