eic-opticks/sysrap/SLaunchSequence.h

0001 #pragma once
0002 /**
0003 SLaunchSequence
0004 ================
0005
0006 This is an updated version of the old cudarap/LaunchSequence
0007
0008 Old defaults, chosen while using macOS mobile GPU Geforce 750M::
0009
0010     unsigned max_blocks=128
0011     unsigned threads_per_block=256
0012
0013 Example of a CUDA launch using this::
0014
0015     init_rng<<<launch.blocks_per_launch, launch.threads_per_block>>>( launch.threads_per_launch, launch.thread_offset, dev_rng_states_launch, seed, offset );
0016
The launch configuration can be tuned with envvars:

    THREADS_PER_BLOCK
        tightly constrained : even with TITAN V or TITAN RTX it cannot exceed 1024

    MAX_BLOCKS
        effectively unconstrained : the maximum is enormous
0024
0025 **/
0026
0027
0028 #include <vector>
0029 #include <cassert>
0030 #include <cstring>
0031 #include <cstdio>
0032 #include <cstdlib>
0033
0034 #if defined(__CUDACC__) || defined(__CUDABE__)
0035 #else
0036 #include <string>
0037 #include <sstream>
0038 #include <iomanip>
0039 #endif
0040
0041 #include "ssys.h"
0042
0043 #include "SYSRAP_API_EXPORT.hh"
0044
0045 struct SYSRAP_API SLaunch
0046 {
0047    unsigned thread_offset ;
0048    unsigned threads_per_launch ;
0049    unsigned blocks_per_launch ;
0050    unsigned threads_per_block ;
0051    unsigned sequence_index ;
0052    float    kernel_time ;
0053
0054 #if defined(__CUDACC__) || defined(__CUDABE__)
0055 #else
0056    std::string desc() const ;
0057 #endif
0058
0059 };
0060
0061 #if defined(__CUDACC__) || defined(__CUDABE__)
0062 #else
0063 inline std::string SLaunch::desc() const
0064 {
0065    std::stringstream ss ;
0066    ss
0067        << "SLaunch::desc"
0068        << " sequence_index " << std::setw(3) << sequence_index
0069        << " thread_offset " << std::setw(7) << thread_offset
0070        << " threads_per_launch " << std::setw(6) << threads_per_launch
0071        << " blocks_per_launch " << std::setw(6) << blocks_per_launch
0072        << " threads_per_block " << std::setw(6) << threads_per_block
0073        << " kernel_time (ms) " << std::setw(20) << std::fixed << std::setprecision(4) << kernel_time
0074        ;
0075
0076    std::string s = ss.str();
0077    return s ;
0078 }
0079 #endif
0080
0081
0082 struct SYSRAP_API SLaunchSequence
0083 {
0084     unsigned items ;
0085     unsigned threads_per_block ;
0086     unsigned max_blocks ;
0087
0088     std::vector<SLaunch> launches ;
0089
0090     SLaunchSequence(unsigned items);
0091     void init();
0092
0093
0094 #if defined(__CUDACC__) || defined(__CUDABE__)
0095 #else
0096     std::string brief() const ;
0097     std::string desc() const ;
0098     float total_time() const ;
0099     unsigned total_threads() const ;
0100 #endif
0101 };
0102
0103 inline SLaunchSequence::SLaunchSequence(unsigned items_)
0104     :
0105     items(items_),
0106     threads_per_block(ssys::getenvint("THREADS_PER_BLOCK", 512)),
0107     max_blocks(ssys::getenvint("MAX_BLOCKS", 128))
0108 {
0109     init();
0110 }
0111
0112 inline void SLaunchSequence::init()
0113 {
0114     assert( threads_per_block <= 1024 ); // THREADS_PER_BLOCK is highly constrained, unlike MAX_BLOCKS
0115
0116     launches.clear();
0117     unsigned thread_offset = 0 ;
0118     unsigned sequence_index = 0 ;
0119
0120     while( thread_offset < items )
0121     {
0122         unsigned remaining = items - thread_offset ;
0123         unsigned blocks_per_launch = remaining / threads_per_block ;
0124         if(remaining % threads_per_block != 0) blocks_per_launch += 1 ;
0125         if( blocks_per_launch > max_blocks ) blocks_per_launch = max_blocks ;
0126         // blocks_per_launch sticks at max_blocks until the last launch of the sequence
0127
0128         unsigned threads_per_launch = blocks_per_launch * threads_per_block ;
0129         if(threads_per_launch > remaining) threads_per_launch = remaining ;
0130
0131         launches.push_back( { thread_offset, threads_per_launch, blocks_per_launch, threads_per_block, sequence_index, -1.f } );
0132
0133         thread_offset += threads_per_launch ;
0134         sequence_index += 1 ;
0135     }
0136 }
0137
0138
0139 #if defined(__CUDACC__) || defined(__CUDABE__)
0140 #else
0141
0142 inline float SLaunchSequence::total_time() const
0143 {
0144    float total = 0.0f ;
0145    for(unsigned i=0 ; i < launches.size() ; i++ )
0146    {
0147        const SLaunch& launch = launches[i] ;
0148        if(launch.kernel_time > 0.f ) total += launch.kernel_time ;
0149    }
0150    return total ;
0151 }
0152
0153 inline unsigned SLaunchSequence::total_threads() const
0154 {
0155    unsigned total = 0 ;
0156    for(unsigned i=0 ; i < launches.size() ; i++ )
0157    {
0158        const SLaunch& launch = launches[i] ;
0159        total += launch.threads_per_launch ;
0160    }
0161    return total ;
0162 }
0163
0164
0165 inline std::string SLaunchSequence::brief() const
0166 {
0167     std::stringstream ss ;
0168     ss
0169        << " items " << std::setw(7) << items
0170        << " total_threads " << std::setw(7) << total_threads()
0171        << " THREADS_PER_BLOCK " << std::setw(5) << threads_per_block
0172        << " MAX_BLOCKS " << std::setw(6) << max_blocks
0173        << " num_launches " << std::setw(4) << launches.size()
0174        << " total_time " << std::setw(10) << std::fixed << std::setprecision(4) << total_time()
0175        ;
0176     std::string s = ss.str();
0177     return s ;
0178 }
0179
0180 inline std::string SLaunchSequence::desc() const
0181 {
0182     std::stringstream ss ;
0183     ss
0184         << "SLaunchSequence::desc"
0185         << brief()
0186         << std::endl
0187         ;
0188     for(unsigned i=0 ; i < launches.size() ; i++)
0189     {
0190         const SLaunch& launch = launches[i] ;
0191         ss << launch.desc() << std::endl ;
0192     }
0193     std::string s = ss.str();
0194     return s ;
0195 }
0196
0197 #endif
0198