// File indexing completed on 2026-04-09 07:49:39
0001 #pragma once
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028 #include <vector>
0029 #include <cassert>
0030 #include <cstring>
0031 #include <cstdio>
0032 #include <cstdlib>
0033
0034 #if defined(__CUDACC__) || defined(__CUDABE__)
0035 #else
0036 #include <string>
0037 #include <sstream>
0038 #include <iomanip>
0039 #endif
0040
0041 #include "ssys.h"
0042
0043 #include "SYSRAP_API_EXPORT.hh"
0044
0045 struct SYSRAP_API SLaunch
0046 {
0047 unsigned thread_offset ;
0048 unsigned threads_per_launch ;
0049 unsigned blocks_per_launch ;
0050 unsigned threads_per_block ;
0051 unsigned sequence_index ;
0052 float kernel_time ;
0053
0054 #if defined(__CUDACC__) || defined(__CUDABE__)
0055 #else
0056 std::string desc() const ;
0057 #endif
0058
0059 };
0060
0061 #if defined(__CUDACC__) || defined(__CUDABE__)
0062 #else
0063 inline std::string SLaunch::desc() const
0064 {
0065 std::stringstream ss ;
0066 ss
0067 << "SLaunch::desc"
0068 << " sequence_index " << std::setw(3) << sequence_index
0069 << " thread_offset " << std::setw(7) << thread_offset
0070 << " threads_per_launch " << std::setw(6) << threads_per_launch
0071 << " blocks_per_launch " << std::setw(6) << blocks_per_launch
0072 << " threads_per_block " << std::setw(6) << threads_per_block
0073 << " kernel_time (ms) " << std::setw(20) << std::fixed << std::setprecision(4) << kernel_time
0074 ;
0075
0076 std::string s = ss.str();
0077 return s ;
0078 }
0079 #endif
0080
0081
0082 struct SYSRAP_API SLaunchSequence
0083 {
0084 unsigned items ;
0085 unsigned threads_per_block ;
0086 unsigned max_blocks ;
0087
0088 std::vector<SLaunch> launches ;
0089
0090 SLaunchSequence(unsigned items);
0091 void init();
0092
0093
0094 #if defined(__CUDACC__) || defined(__CUDABE__)
0095 #else
0096 std::string brief() const ;
0097 std::string desc() const ;
0098 float total_time() const ;
0099 unsigned total_threads() const ;
0100 #endif
0101 };
0102
0103 inline SLaunchSequence::SLaunchSequence(unsigned items_)
0104 :
0105 items(items_),
0106 threads_per_block(ssys::getenvint("THREADS_PER_BLOCK", 512)),
0107 max_blocks(ssys::getenvint("MAX_BLOCKS", 128))
0108 {
0109 init();
0110 }
0111
0112 inline void SLaunchSequence::init()
0113 {
0114 assert( threads_per_block <= 1024 );
0115
0116 launches.clear();
0117 unsigned thread_offset = 0 ;
0118 unsigned sequence_index = 0 ;
0119
0120 while( thread_offset < items )
0121 {
0122 unsigned remaining = items - thread_offset ;
0123 unsigned blocks_per_launch = remaining / threads_per_block ;
0124 if(remaining % threads_per_block != 0) blocks_per_launch += 1 ;
0125 if( blocks_per_launch > max_blocks ) blocks_per_launch = max_blocks ;
0126
0127
0128 unsigned threads_per_launch = blocks_per_launch * threads_per_block ;
0129 if(threads_per_launch > remaining) threads_per_launch = remaining ;
0130
0131 launches.push_back( { thread_offset, threads_per_launch, blocks_per_launch, threads_per_block, sequence_index, -1.f } );
0132
0133 thread_offset += threads_per_launch ;
0134 sequence_index += 1 ;
0135 }
0136 }
0137
0138
0139 #if defined(__CUDACC__) || defined(__CUDABE__)
0140 #else
0141
0142 inline float SLaunchSequence::total_time() const
0143 {
0144 float total = 0.0f ;
0145 for(unsigned i=0 ; i < launches.size() ; i++ )
0146 {
0147 const SLaunch& launch = launches[i] ;
0148 if(launch.kernel_time > 0.f ) total += launch.kernel_time ;
0149 }
0150 return total ;
0151 }
0152
0153 inline unsigned SLaunchSequence::total_threads() const
0154 {
0155 unsigned total = 0 ;
0156 for(unsigned i=0 ; i < launches.size() ; i++ )
0157 {
0158 const SLaunch& launch = launches[i] ;
0159 total += launch.threads_per_launch ;
0160 }
0161 return total ;
0162 }
0163
0164
0165 inline std::string SLaunchSequence::brief() const
0166 {
0167 std::stringstream ss ;
0168 ss
0169 << " items " << std::setw(7) << items
0170 << " total_threads " << std::setw(7) << total_threads()
0171 << " THREADS_PER_BLOCK " << std::setw(5) << threads_per_block
0172 << " MAX_BLOCKS " << std::setw(6) << max_blocks
0173 << " num_launches " << std::setw(4) << launches.size()
0174 << " total_time " << std::setw(10) << std::fixed << std::setprecision(4) << total_time()
0175 ;
0176 std::string s = ss.str();
0177 return s ;
0178 }
0179
0180 inline std::string SLaunchSequence::desc() const
0181 {
0182 std::stringstream ss ;
0183 ss
0184 << "SLaunchSequence::desc"
0185 << brief()
0186 << std::endl
0187 ;
0188 for(unsigned i=0 ; i < launches.size() ; i++)
0189 {
0190 const SLaunch& launch = launches[i] ;
0191 ss << launch.desc() << std::endl ;
0192 }
0193 std::string s = ss.str();
0194 return s ;
0195 }
0196
0197 #endif
0198