File indexing completed on 2026-04-09 07:49:33
0001 #pragma once
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011 #include <cuda_runtime.h>
0012 #include <cuda_gl_interop.h>
0013
0014 #include "CUDA_CHECK.h"
0015
0016 #include <iostream>
0017 #include <vector>
0018
0019
0020 enum class SCUDA_OutputBufferType
0021 {
0022 CUDA_DEVICE = 0,
0023 GL_INTEROP = 1,
0024 ZERO_COPY = 2,
0025 CUDA_P2P = 3
0026 };
0027
0028 typedef cudaStream_t CUstream ;
0029
0030 template <typename PIXEL_FORMAT>
0031 class SCUDA_OutputBuffer
0032 {
0033 public:
0034 SCUDA_OutputBuffer( SCUDA_OutputBufferType type, int32_t width, int32_t height );
0035 ~SCUDA_OutputBuffer();
0036
0037 void setDevice( int32_t device_idx ) { m_device_idx = device_idx; }
0038 void setStream( CUstream stream ) { m_stream = stream; }
0039
0040 void resize( int32_t width, int32_t height );
0041
0042
0043 PIXEL_FORMAT* map();
0044 void unmap();
0045 std::string desc() const ;
0046
0047 int32_t width() const { return m_width; }
0048 int32_t height() const { return m_height; }
0049
0050
0051
0052 GLuint getPBO();
0053 void deletePBO();
0054 PIXEL_FORMAT* getHostPointer();
0055
0056 private:
0057 void makeCurrent() { CUDA_CHECK( cudaSetDevice( m_device_idx ) ); }
0058
0059 SCUDA_OutputBufferType m_type;
0060 int32_t m_width = 0u;
0061 int32_t m_height = 0u;
0062
0063 cudaGraphicsResource* m_cuda_gfx_resource = nullptr;
0064 GLuint m_pbo = 0u;
0065 PIXEL_FORMAT* m_device_pixels = nullptr;
0066 PIXEL_FORMAT* m_host_zcopy_pixels = nullptr;
0067 std::vector<PIXEL_FORMAT> m_host_pixels;
0068
0069 CUstream m_stream = 0u;
0070 int32_t m_device_idx = 0;
0071 };
0072
0073
0074 template <typename PIXEL_FORMAT>
0075 inline SCUDA_OutputBuffer<PIXEL_FORMAT>::SCUDA_OutputBuffer( SCUDA_OutputBufferType type, int32_t width, int32_t height )
0076 :
0077 m_type( type )
0078 {
0079
0080 if( m_type == SCUDA_OutputBufferType::GL_INTEROP )
0081 {
0082 int current_device, is_display_device;
0083 CUDA_CHECK( cudaGetDevice( ¤t_device ) );
0084
0085 CUDA_CHECK( cudaDeviceGetAttribute( &is_display_device, cudaDevAttrKernelExecTimeout, current_device ) );
0086
0087 const char* xdg_session_type = getenv("XDG_SESSION_TYPE");
0088 if( xdg_session_type != nullptr && strcmp(xdg_session_type, "wayland") == 0)
0089 {
0090 is_display_device = 1;
0091 }
0092
0093 if( !is_display_device ) std::cerr
0094 << "SCUDA_OutputBuffer::SCUDA_OutputBuffer GL interop is only available on display device \n"
0095 << " xdg_session_type [" << ( xdg_session_type ? xdg_session_type : "-'" ) << "\n" ;
0096 assert( is_display_device );
0097 }
0098 resize( width, height );
0099 }
0100
0101
0102 template <typename PIXEL_FORMAT>
0103 SCUDA_OutputBuffer<PIXEL_FORMAT>::~SCUDA_OutputBuffer()
0104 {
0105 try
0106 {
0107 makeCurrent();
0108 if( m_type == SCUDA_OutputBufferType::CUDA_DEVICE || m_type == SCUDA_OutputBufferType::CUDA_P2P )
0109 {
0110 CUDA_CHECK( cudaFree( reinterpret_cast<void*>( m_device_pixels ) ) );
0111 }
0112 else if( m_type == SCUDA_OutputBufferType::ZERO_COPY )
0113 {
0114 CUDA_CHECK( cudaFreeHost( reinterpret_cast<void*>( m_host_zcopy_pixels ) ) );
0115 }
0116 else if( m_type == SCUDA_OutputBufferType::GL_INTEROP || m_type == SCUDA_OutputBufferType::CUDA_P2P )
0117 {
0118 CUDA_CHECK( cudaGraphicsUnregisterResource( m_cuda_gfx_resource ) );
0119 }
0120
0121 if( m_pbo != 0u )
0122 {
0123 GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, 0 ) );
0124 GL_CHECK( glDeleteBuffers( 1, &m_pbo ) );
0125 }
0126 }
0127 catch(std::exception& e )
0128 {
0129 std::cerr << "SCUDA_OutputBuffer destructor caught exception: " << e.what() << std::endl;
0130 }
0131 }
0132
0133
0134
0135
0136
0137
0138
0139
0140
0141
0142
0143
0144
0145 template <typename PIXEL_FORMAT>
0146 void SCUDA_OutputBuffer<PIXEL_FORMAT>::resize( int32_t width, int32_t height )
0147 {
0148 if( width < 1 ) width = 1 ;
0149 if( height < 1 ) height = 1 ;
0150
0151 if( m_width == width && m_height == height )
0152 return;
0153
0154 m_width = width;
0155 m_height = height;
0156
0157 makeCurrent();
0158
0159 if( m_type == SCUDA_OutputBufferType::CUDA_DEVICE || m_type == SCUDA_OutputBufferType::CUDA_P2P )
0160 {
0161 CUDA_CHECK( cudaFree( reinterpret_cast<void*>( m_device_pixels ) ) );
0162 CUDA_CHECK( cudaMalloc(
0163 reinterpret_cast<void**>( &m_device_pixels ),
0164 m_width*m_height*sizeof(PIXEL_FORMAT)
0165 ) );
0166
0167 }
0168
0169 if( m_type == SCUDA_OutputBufferType::GL_INTEROP || m_type == SCUDA_OutputBufferType::CUDA_P2P )
0170 {
0171
0172 GL_CHECK( glGenBuffers( 1, &m_pbo ) );
0173 GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, m_pbo ) );
0174 GL_CHECK( glBufferData( GL_ARRAY_BUFFER, sizeof(PIXEL_FORMAT)*m_width*m_height, nullptr, GL_STREAM_DRAW ) );
0175 GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, 0u ) );
0176
0177 CUDA_CHECK( cudaGraphicsGLRegisterBuffer(
0178 &m_cuda_gfx_resource,
0179 m_pbo,
0180 cudaGraphicsMapFlagsWriteDiscard
0181 ) );
0182 }
0183
0184 if( m_type == SCUDA_OutputBufferType::ZERO_COPY )
0185 {
0186 CUDA_CHECK( cudaFreeHost( reinterpret_cast<void*>( m_host_zcopy_pixels ) ) );
0187 CUDA_CHECK( cudaHostAlloc(
0188 reinterpret_cast<void**>( &m_host_zcopy_pixels ),
0189 m_width*m_height*sizeof(PIXEL_FORMAT),
0190 cudaHostAllocPortable | cudaHostAllocMapped
0191 ) );
0192 CUDA_CHECK( cudaHostGetDevicePointer(
0193 reinterpret_cast<void**>( &m_device_pixels ),
0194 reinterpret_cast<void*>( m_host_zcopy_pixels ),
0195 0
0196 ) );
0197 }
0198
0199 if( m_type != SCUDA_OutputBufferType::GL_INTEROP && m_type != SCUDA_OutputBufferType::CUDA_P2P && m_pbo != 0u )
0200 {
0201 GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, m_pbo ) );
0202 GL_CHECK( glBufferData( GL_ARRAY_BUFFER, sizeof(PIXEL_FORMAT)*m_width*m_height, nullptr, GL_STREAM_DRAW ) );
0203 GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, 0u ) );
0204 }
0205
0206 if( !m_host_pixels.empty() )
0207 m_host_pixels.resize( m_width*m_height );
0208 }
0209
0210
0211
0212
0213
0214
0215
0216
0217
0218
0219 template <typename PIXEL_FORMAT>
0220 PIXEL_FORMAT* SCUDA_OutputBuffer<PIXEL_FORMAT>::map()
0221 {
0222 if( m_type == SCUDA_OutputBufferType::CUDA_DEVICE || m_type == SCUDA_OutputBufferType::CUDA_P2P )
0223 {
0224
0225 }
0226 else if( m_type == SCUDA_OutputBufferType::GL_INTEROP )
0227 {
0228 makeCurrent();
0229
0230 size_t buffer_size = 0u;
0231 CUDA_CHECK( cudaGraphicsMapResources ( 1, &m_cuda_gfx_resource, m_stream ) );
0232 CUDA_CHECK( cudaGraphicsResourceGetMappedPointer(
0233 reinterpret_cast<void**>( &m_device_pixels ),
0234 &buffer_size,
0235 m_cuda_gfx_resource
0236 ) );
0237 }
0238 else
0239 {
0240
0241 }
0242
0243 return m_device_pixels;
0244 }
0245
0246
0247
0248
0249
0250
0251
0252
0253
0254
0255 template <typename PIXEL_FORMAT>
0256 void SCUDA_OutputBuffer<PIXEL_FORMAT>::unmap()
0257 {
0258 makeCurrent();
0259
0260 if( m_type == SCUDA_OutputBufferType::CUDA_DEVICE || m_type == SCUDA_OutputBufferType::CUDA_P2P )
0261 {
0262 CUDA_CHECK( cudaStreamSynchronize( m_stream ) );
0263 }
0264 else if( m_type == SCUDA_OutputBufferType::GL_INTEROP )
0265 {
0266 CUDA_CHECK( cudaGraphicsUnmapResources ( 1, &m_cuda_gfx_resource, m_stream ) );
0267 }
0268 else
0269 {
0270 CUDA_CHECK( cudaStreamSynchronize( m_stream ) );
0271 }
0272 }
0273
0274
0275 template <typename PIXEL_FORMAT>
0276 std::string SCUDA_OutputBuffer<PIXEL_FORMAT>::desc() const
0277 {
0278 std::stringstream ss ;
0279 ss << "SCUDA_OutputBuffer::desc"
0280 << std::endl
0281 << " int(m_type) " << int(m_type)
0282 << std::endl
0283 << " m_width " << m_width
0284 << std::endl
0285 << " m_height " << m_height
0286 << std::endl
0287 << " m_cuda_gfx_resource " << ( m_cuda_gfx_resource ? "YES" : "NO " )
0288 << std::endl
0289 << " m_pbo " << m_pbo
0290 << std::endl
0291 << " m_device_pixels " << ( m_device_pixels ? "YES" : "NO " )
0292 << std::endl
0293 << " m_host_zcopy_pixels " << ( m_host_zcopy_pixels ? "YES" : "NO " )
0294 << std::endl
0295 << " m_host_pixels.size " << m_host_pixels.size()
0296 << std::endl
0297 << " m_stream " << m_stream
0298 << std::endl
0299 << " m_device_idx " << m_device_idx
0300 << std::endl
0301 ;
0302 std::string str = ss.str();
0303 return str ;
0304 }
0305
0306
0307
0308
0309
0310
0311
0312
0313
0314 template <typename PIXEL_FORMAT>
0315 GLuint SCUDA_OutputBuffer<PIXEL_FORMAT>::getPBO()
0316 {
0317 if( m_pbo == 0u )
0318 GL_CHECK( glGenBuffers( 1, &m_pbo ) );
0319
0320 const size_t buffer_size = m_width*m_height*sizeof(PIXEL_FORMAT);
0321
0322 if( m_type == SCUDA_OutputBufferType::CUDA_DEVICE )
0323 {
0324
0325 if( m_host_pixels.empty() )
0326 m_host_pixels.resize( m_width*m_height );
0327
0328 makeCurrent();
0329 CUDA_CHECK( cudaMemcpy(
0330 static_cast<void*>( m_host_pixels.data() ),
0331 m_device_pixels,
0332 buffer_size,
0333 cudaMemcpyDeviceToHost
0334 ) );
0335
0336 GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, m_pbo ) );
0337 GL_CHECK( glBufferData(
0338 GL_ARRAY_BUFFER,
0339 buffer_size,
0340 static_cast<void*>( m_host_pixels.data() ),
0341 GL_STREAM_DRAW
0342 ) );
0343 GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, 0 ) );
0344 }
0345 else if( m_type == SCUDA_OutputBufferType::GL_INTEROP )
0346 {
0347
0348 }
0349 else if ( m_type == SCUDA_OutputBufferType::CUDA_P2P )
0350 {
0351 makeCurrent();
0352 void* pbo_buff = nullptr;
0353 size_t dummy_size = 0;
0354
0355 CUDA_CHECK( cudaGraphicsMapResources( 1, &m_cuda_gfx_resource, m_stream ) );
0356 CUDA_CHECK( cudaGraphicsResourceGetMappedPointer( &pbo_buff, &dummy_size, m_cuda_gfx_resource ) );
0357 CUDA_CHECK( cudaMemcpy( pbo_buff, m_device_pixels, buffer_size, cudaMemcpyDeviceToDevice ) );
0358 CUDA_CHECK( cudaGraphicsUnmapResources( 1, &m_cuda_gfx_resource, m_stream ) );
0359 }
0360 else
0361 {
0362 GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, m_pbo ) );
0363 GL_CHECK( glBufferData(
0364 GL_ARRAY_BUFFER,
0365 buffer_size,
0366 static_cast<void*>( m_host_zcopy_pixels ),
0367 GL_STREAM_DRAW
0368 ) );
0369 GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, 0 ) );
0370 }
0371
0372 return m_pbo;
0373 }
0374
0375 template <typename PIXEL_FORMAT>
0376 void SCUDA_OutputBuffer<PIXEL_FORMAT>::deletePBO()
0377 {
0378 GL_CHECK( glBindBuffer( GL_ARRAY_BUFFER, 0 ) );
0379 GL_CHECK( glDeleteBuffers( 1, &m_pbo ) );
0380 m_pbo = 0;
0381 }
0382
0383
0384
0385
0386
0387
0388
0389
0390
0391
0392
0393 template <typename PIXEL_FORMAT>
0394 PIXEL_FORMAT* SCUDA_OutputBuffer<PIXEL_FORMAT>::getHostPointer()
0395 {
0396 if( m_type == SCUDA_OutputBufferType::CUDA_DEVICE ||
0397 m_type == SCUDA_OutputBufferType::CUDA_P2P ||
0398 m_type == SCUDA_OutputBufferType::GL_INTEROP )
0399 {
0400 m_host_pixels.resize( m_width*m_height );
0401
0402 makeCurrent();
0403 CUDA_CHECK( cudaMemcpy(
0404 static_cast<void*>( m_host_pixels.data() ),
0405 map(),
0406 m_width*m_height*sizeof(PIXEL_FORMAT),
0407 cudaMemcpyDeviceToHost
0408 ) );
0409 unmap();
0410
0411 return m_host_pixels.data();
0412 }
0413 else
0414 {
0415 return m_host_zcopy_pixels;
0416 }
0417 }
0418