Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-09 07:49:39

0001 #pragma once
0002 /**
0003 SLaunchSequence
0004 ================
0005 
0006 This is an updated version of the old cudarap/LaunchSequence
0007 
0008 Old defaults, chosen while using a macOS mobile GPU (GeForce 750M)::
0009 
0010     unsigned max_blocks=128
0011     unsigned threads_per_block=256  
0012  
0013 Example of a CUDA launch using this::
0014 
0015     init_rng<<<launch.blocks_per_launch, launch.threads_per_block>>>( launch.threads_per_launch, launch.thread_offset, dev_rng_states_launch, seed, offset );
0016 
0017 Can experiment with envvars
0018 
0019     THREADS_PER_BLOCK
0020         tightly constrained: even with TITAN V and TITAN RTX it cannot exceed 1024
0021 
0022     MAX_BLOCKS
0023         not constrained, the maximum is enormous 
0024 
0025 **/
0026 
0027 
0028 #include <vector>
0029 #include <cassert>
0030 #include <cstring>
0031 #include <cstdio>
0032 #include <cstdlib>
0033 
0034 #if defined(__CUDACC__) || defined(__CUDABE__)
0035 #else
0036 #include <string>
0037 #include <sstream>
0038 #include <iomanip>
0039 #endif 
0040 
0041 #include "ssys.h"
0042 
0043 #include "SYSRAP_API_EXPORT.hh"
0044 
0045 struct SYSRAP_API SLaunch 
0046 {
0047    unsigned thread_offset ; 
0048    unsigned threads_per_launch ; 
0049    unsigned blocks_per_launch ; 
0050    unsigned threads_per_block ; 
0051    unsigned sequence_index ; 
0052    float    kernel_time ; 
0053 
0054 #if defined(__CUDACC__) || defined(__CUDABE__)
0055 #else
0056    std::string desc() const ; 
0057 #endif
0058 
0059 }; 
0060 
0061 #if defined(__CUDACC__) || defined(__CUDABE__)
0062 #else
0063 inline std::string SLaunch::desc() const 
0064 {
0065    std::stringstream ss ; 
0066    ss 
0067        << "SLaunch::desc"
0068        << " sequence_index " << std::setw(3) << sequence_index 
0069        << " thread_offset " << std::setw(7) << thread_offset
0070        << " threads_per_launch " << std::setw(6) << threads_per_launch
0071        << " blocks_per_launch " << std::setw(6) << blocks_per_launch
0072        << " threads_per_block " << std::setw(6) << threads_per_block 
0073        << " kernel_time (ms) " << std::setw(20) << std::fixed << std::setprecision(4) << kernel_time 
0074        ;
0075 
0076    std::string s = ss.str(); 
0077    return s ; 
0078 }
0079 #endif
0080 
0081 
0082 struct SYSRAP_API SLaunchSequence 
0083 {
0084     unsigned items ; 
0085     unsigned threads_per_block ;
0086     unsigned max_blocks ;
0087 
0088     std::vector<SLaunch> launches ;
0089 
0090     SLaunchSequence(unsigned items); 
0091     void init(); 
0092 
0093 
0094 #if defined(__CUDACC__) || defined(__CUDABE__)
0095 #else
0096     std::string brief() const ; 
0097     std::string desc() const ; 
0098     float total_time() const ; 
0099     unsigned total_threads() const ; 
0100 #endif
0101 }; 
0102 
0103 inline SLaunchSequence::SLaunchSequence(unsigned items_)
0104     :
0105     items(items_),
0106     threads_per_block(ssys::getenvint("THREADS_PER_BLOCK", 512)),
0107     max_blocks(ssys::getenvint("MAX_BLOCKS", 128))
0108 {
0109     init(); 
0110 }
0111 
0112 inline void SLaunchSequence::init()
0113 {
0114     assert( threads_per_block <= 1024 ); // THREADS_PER_BLOCK is highly constrained, unlike MAX_BLOCKS
0115 
0116     launches.clear();
0117     unsigned thread_offset = 0 ;
0118     unsigned sequence_index = 0 ;
0119 
0120     while( thread_offset < items )
0121     {
0122         unsigned remaining = items - thread_offset ;
0123         unsigned blocks_per_launch = remaining / threads_per_block ;
0124         if(remaining % threads_per_block != 0) blocks_per_launch += 1 ;  
0125         if( blocks_per_launch > max_blocks ) blocks_per_launch = max_blocks ; 
0126         // blocks_per_launch sticks at max_blocks until the last launch of the sequence  
0127 
0128         unsigned threads_per_launch = blocks_per_launch * threads_per_block ; 
0129         if(threads_per_launch > remaining) threads_per_launch = remaining ;
0130 
0131         launches.push_back( { thread_offset, threads_per_launch, blocks_per_launch, threads_per_block, sequence_index, -1.f } );
0132 
0133         thread_offset += threads_per_launch ; 
0134         sequence_index += 1 ;
0135     }
0136 }
0137 
0138 
0139 #if defined(__CUDACC__) || defined(__CUDABE__)
0140 #else
0141 
0142 inline float SLaunchSequence::total_time() const 
0143 {
0144    float total = 0.0f ; 
0145    for(unsigned i=0 ; i < launches.size() ; i++ )
0146    {
0147        const SLaunch& launch = launches[i] ;
0148        if(launch.kernel_time > 0.f ) total += launch.kernel_time ;
0149    }
0150    return total ; 
0151 }
0152 
0153 inline unsigned SLaunchSequence::total_threads() const 
0154 {
0155    unsigned total = 0 ; 
0156    for(unsigned i=0 ; i < launches.size() ; i++ )
0157    {
0158        const SLaunch& launch = launches[i] ;
0159        total += launch.threads_per_launch ;
0160    }
0161    return total ; 
0162 }
0163 
0164 
0165 inline std::string SLaunchSequence::brief() const
0166 {
0167     std::stringstream ss ; 
0168     ss 
0169        << " items " << std::setw(7) << items
0170        << " total_threads " << std::setw(7) << total_threads()
0171        << " THREADS_PER_BLOCK " << std::setw(5) << threads_per_block
0172        << " MAX_BLOCKS " << std::setw(6) << max_blocks
0173        << " num_launches " << std::setw(4) << launches.size() 
0174        << " total_time " << std::setw(10) << std::fixed << std::setprecision(4) << total_time() 
0175        ;
0176     std::string s = ss.str(); 
0177     return s ;  
0178 }
0179 
0180 inline std::string SLaunchSequence::desc() const
0181 {
0182     std::stringstream ss ;
0183     ss
0184         << "SLaunchSequence::desc"
0185         << brief()
0186         << std::endl 
0187         ;
0188     for(unsigned i=0 ; i < launches.size() ; i++)  
0189     {
0190         const SLaunch& launch = launches[i] ; 
0191         ss << launch.desc() << std::endl ; 
0192     } 
0193     std::string s = ss.str(); 
0194     return s ;  
0195 }
0196 
0197 #endif
0198