Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-12-15 10:28:51

0001 // Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025
0002 
0003 /*************************************************************************
0004  * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers.               *
0005  * All rights reserved.                                                  *
0006  *                                                                       *
0007  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0008  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0009  *************************************************************************/
0010 
0011 #ifndef TMVA_RCHUNKCONSTRUCTOR
0012 #define TMVA_RCHUNKCONSTRUCTOR
0013 
#include <cstddef>
#include <numeric>
#include <stdexcept>
#include <utility>
#include <vector>

#include "TMVA/RTensor.hxx"
#include "ROOT/RDataFrame.hxx"
#include "ROOT/RDF/Utils.hxx"
#include "ROOT/RVec.hxx"

#include "ROOT/RLogger.hxx"
0022 
0023 namespace TMVA {
0024 namespace Experimental {
0025 namespace Internal {
0026 
// clang-format off
/**
\class TMVA::Experimental::Internal::RChunkConstructor
\ingroup tmva
\brief The logic for constructing chunks from a dataset.

This struct handles the logic for splitting a dataset into smaller subsets
known as chunks, which are constructed from blocks.

A chunk is the largest portion of the dataset loaded into memory at once,
and each chunk is further divided into batches for machine learning training.

The dataset is split into disjoint chunks based on a user-defined chunk size.
There are two types of chunks:
 - Full chunks: contain exactly the number of entries specified by the chunk size.
 - Leftover chunk: contains any remaining entries that don't make up a full chunk.

Each chunk is constructed from blocks based on a user-defined block size.
There are two types of blocks:
 - Full blocks: contain exactly the number of entries specified by the block size.
 - Leftover block: contains any remaining entries that don't make up a full block.

The blocks are defined by their start and end entries, which correspond to positions within the dataset's
total number of entries. The size of a block is computed as `end - start`.

Order of use:
 1. Construct the object; all sizes and counts are computed by the constructor.
 2. Fill `BlockIntervals` with the intervals grouped by block type, in the order given by
    `NumberOfDifferentBlocks` (full blocks in full chunks, leftover blocks in full chunks,
    full blocks in the leftover chunk, leftover block in the leftover chunk).
 3. Call DistributeBlockIntervals(), then CreateChunksIntervals(), then SizeOfChunks().

Each of the three member functions rebuilds its output from scratch, so calling it again
(e.g. after reshuffling the block intervals) does not accumulate stale entries.
*/

struct RChunkConstructor {
   // clang-format on
   std::size_t fNumEntries{};
   std::size_t fChunkSize{};
   std::size_t fBlockSize{};

   // size of full and leftover chunks
   std::size_t SizeOfFullChunk{};
   std::size_t SizeOfLeftoverChunk{};

   // size of full and leftover blocks in a full and leftover chunk
   std::size_t SizeOfFullBlockInFullChunk{};
   std::size_t SizeOfLeftoverBlockInFullChunk{};
   std::size_t SizeOfFullBlockInLeftoverChunk{};
   std::size_t SizeOfLeftoverBlockInLeftoverChunk{};

   // number of full, leftover and total chunks
   std::size_t FullChunks{};
   std::size_t LeftoverChunks{};
   std::size_t Chunks{};

   // number of full, leftover and total blocks in a full chunk
   std::size_t FullBlocksPerFullChunk{};
   std::size_t LeftoverBlocksPerFullChunk{};
   std::size_t BlockPerFullChunk{};

   // number of full, leftover and total blocks in the leftover chunk
   std::size_t FullBlocksPerLeftoverChunk{};
   std::size_t LeftoverBlocksPerLeftoverChunk{};
   std::size_t BlockPerLeftoverChunk{};

   // total number of full and leftover blocks in the full chunks
   std::size_t FullBlocksInFullChunks{};
   std::size_t LeftoverBlocksInFullChunks{};

   // total number of full and leftover blocks in the leftover chunks
   std::size_t FullBlocksInLeftoverChunks{};
   std::size_t LeftoverBlocksInLeftoverChunks{};

   // sizes of the four block types, in the order:
   // full block in full chunk, leftover block in full chunk,
   // full block in leftover chunk, leftover block in leftover chunk
   std::vector<std::size_t> SizeOfBlocks;

   // number of blocks of each of the four block types (same order as SizeOfBlocks)
   std::vector<std::size_t> NumberOfDifferentBlocks;

   // total number of blocks
   std::size_t NumberOfBlocks{};

   // (start, end) entries of all blocks, grouped by block type in the order of NumberOfDifferentBlocks;
   // to be filled by the user before calling DistributeBlockIntervals()
   std::vector<std::pair<Long_t, Long_t>> BlockIntervals;

   // (start, end) entries per block type, filled by DistributeBlockIntervals()
   std::vector<std::pair<Long_t, Long_t>> FullBlockIntervalsInFullChunks;
   std::vector<std::pair<Long_t, Long_t>> LeftoverBlockIntervalsInFullChunks;

   std::vector<std::pair<Long_t, Long_t>> FullBlockIntervalsInLeftoverChunks;
   std::vector<std::pair<Long_t, Long_t>> LeftoverBlockIntervalsInLeftoverChunks;

   // for every chunk, the block intervals it is made of; filled by CreateChunksIntervals()
   std::vector<std::vector<std::pair<Long_t, Long_t>>> ChunksIntervals;

   // for every chunk, its number of entries; filled by SizeOfChunks()
   std::vector<std::size_t> ChunksSizes;

   //////////////////////////////////////////////////////////////////////////
   /// \brief Compute all chunk and block sizes and counts for a dataset.
   /// \param[in] numEntries Total number of entries in the dataset
   /// \param[in] chunkSize Number of entries in a full chunk, must be greater than zero
   /// \param[in] blockSize Number of entries in a full block, must be greater than zero
   /// \throws std::invalid_argument if chunkSize or blockSize is zero
   RChunkConstructor(const std::size_t numEntries, const std::size_t chunkSize, const std::size_t blockSize)
      : fNumEntries(numEntries), fChunkSize(chunkSize), fBlockSize(blockSize)
   {
      // both values are used as divisors below
      if (chunkSize == 0 || blockSize == 0) {
         throw std::invalid_argument("RChunkConstructor: chunk size and block size must be greater than zero");
      }

      // size of full and leftover chunks
      SizeOfFullChunk = chunkSize;
      SizeOfLeftoverChunk = numEntries % SizeOfFullChunk;

      // size of full and leftover blocks in a full and leftover chunk
      SizeOfFullBlockInFullChunk = blockSize;
      SizeOfLeftoverBlockInFullChunk = SizeOfFullChunk % blockSize;
      SizeOfFullBlockInLeftoverChunk = blockSize;
      SizeOfLeftoverBlockInLeftoverChunk = SizeOfLeftoverChunk % blockSize;

      // number of full, leftover and total chunks
      FullChunks = numEntries / SizeOfFullChunk;
      LeftoverChunks = SizeOfLeftoverChunk == 0 ? 0 : 1;
      Chunks = FullChunks + LeftoverChunks;

      // number of full, leftover and total blocks in a full chunk
      FullBlocksPerFullChunk = SizeOfFullChunk / blockSize;
      LeftoverBlocksPerFullChunk = SizeOfLeftoverBlockInFullChunk == 0 ? 0 : 1;
      BlockPerFullChunk = FullBlocksPerFullChunk + LeftoverBlocksPerFullChunk;

      // number of full, leftover and total blocks in the leftover chunk
      FullBlocksPerLeftoverChunk = SizeOfLeftoverChunk / blockSize;
      LeftoverBlocksPerLeftoverChunk = SizeOfLeftoverBlockInLeftoverChunk == 0 ? 0 : 1;
      BlockPerLeftoverChunk = FullBlocksPerLeftoverChunk + LeftoverBlocksPerLeftoverChunk;

      // total number of full and leftover blocks in the full chunks
      FullBlocksInFullChunks = FullBlocksPerFullChunk * FullChunks;
      LeftoverBlocksInFullChunks = LeftoverBlocksPerFullChunk * FullChunks;

      // total number of full and leftover blocks in the leftover chunks
      FullBlocksInLeftoverChunks = FullBlocksPerLeftoverChunk * LeftoverChunks;
      LeftoverBlocksInLeftoverChunks = LeftoverBlocksPerLeftoverChunk * LeftoverChunks;

      // vector of the different block sizes
      SizeOfBlocks = {SizeOfFullBlockInFullChunk, SizeOfLeftoverBlockInFullChunk, SizeOfFullBlockInLeftoverChunk,
                      SizeOfLeftoverBlockInLeftoverChunk};

      // vector with the number of the different block
      NumberOfDifferentBlocks = {FullBlocksInFullChunks, LeftoverBlocksInFullChunks, FullBlocksInLeftoverChunks,
                                 LeftoverBlocksInLeftoverChunks};

      // total number of blocks; the initial value fixes the accumulator type, so it has to be
      // std::size_t (a plain 0 would sum in int and break for more than INT_MAX blocks)
      NumberOfBlocks =
         std::accumulate(NumberOfDifferentBlocks.begin(), NumberOfDifferentBlocks.end(), std::size_t{0});
   }

   //////////////////////////////////////////////////////////////////////////
   /// \brief Group the blocks based on the block type (full or leftover) based on the size of the block.
   ///
   /// Slices BlockIntervals into the four per-type interval vectors, taking consecutive ranges whose
   /// lengths are given by NumberOfDifferentBlocks. The per-type vectors are overwritten.
   /// \throws std::logic_error if BlockIntervals holds fewer intervals than NumberOfDifferentBlocks requires
   void DistributeBlockIntervals()
   {
      using Intervals_t = std::vector<std::pair<Long_t, Long_t>>;
      using Diff_t = Intervals_t::difference_type;

      Intervals_t *const typesOfBlockIntervals[] = {&FullBlockIntervalsInFullChunks,
                                                    &LeftoverBlockIntervalsInFullChunks,
                                                    &FullBlockIntervalsInLeftoverChunks,
                                                    &LeftoverBlockIntervalsInLeftoverChunks};
      constexpr std::size_t numTypes = sizeof(typesOfBlockIntervals) / sizeof(typesOfBlockIntervals[0]);

      // validate everything up front so that a failure leaves the per-type vectors untouched
      if (NumberOfDifferentBlocks.size() < numTypes) {
         throw std::logic_error("RChunkConstructor::DistributeBlockIntervals: NumberOfDifferentBlocks is incomplete");
      }
      const std::size_t requiredBlocks =
         std::accumulate(NumberOfDifferentBlocks.begin(),
                         NumberOfDifferentBlocks.begin() + static_cast<Diff_t>(numTypes), std::size_t{0});
      if (BlockIntervals.size() < requiredBlocks) {
         throw std::logic_error("RChunkConstructor::DistributeBlockIntervals: BlockIntervals holds fewer "
                                "intervals than the number of blocks");
      }

      auto first = BlockIntervals.cbegin();
      for (std::size_t i = 0; i < numTypes; ++i) {
         const auto last = first + static_cast<Diff_t>(NumberOfDifferentBlocks[i]);
         typesOfBlockIntervals[i]->assign(first, last);
         first = last;
      }
   }

   //////////////////////////////////////////////////////////////////////////
   /// \brief Creates chunks from the dataset consisting of blocks with the begin and end entry.
   ///
   /// Every full chunk receives FullBlocksPerFullChunk full blocks followed by its leftover block (if any);
   /// the leftover chunk (if any) is built the same way from the leftover-chunk block vectors.
   /// ChunksIntervals is rebuilt from scratch.
   /// \throws std::logic_error if the per-type interval vectors hold fewer blocks than the chunks need,
   ///         e.g. because DistributeBlockIntervals() has not been called
   void CreateChunksIntervals()
   {
      using Intervals_t = std::vector<std::pair<Long_t, Long_t>>;
      using Diff_t = Intervals_t::difference_type;

      if (FullBlockIntervalsInFullChunks.size() < FullBlocksPerFullChunk * FullChunks ||
          LeftoverBlockIntervalsInFullChunks.size() < LeftoverBlocksPerFullChunk * FullChunks ||
          FullBlockIntervalsInLeftoverChunks.size() < FullBlocksPerLeftoverChunk * LeftoverChunks ||
          LeftoverBlockIntervalsInLeftoverChunks.size() < LeftoverBlocksPerLeftoverChunk * LeftoverChunks) {
         throw std::logic_error("RChunkConstructor::CreateChunksIntervals: block intervals have not been "
                                "distributed");
      }

      // append `count` blocks of `blocks`, starting at index `first`, to `chunk`
      const auto appendBlocks = [](Intervals_t &chunk, const Intervals_t &blocks, std::size_t first,
                                   std::size_t count) {
         const auto begin = blocks.cbegin() + static_cast<Diff_t>(first);
         chunk.insert(chunk.end(), begin, begin + static_cast<Diff_t>(count));
      };

      // start from empty chunks so that repeated calls do not append to previous results
      ChunksIntervals.clear();
      ChunksIntervals.resize(Chunks);

      for (std::size_t i = 0; i < FullChunks; ++i) {
         Intervals_t &chunk = ChunksIntervals[i];
         chunk.reserve(BlockPerFullChunk);
         appendBlocks(chunk, FullBlockIntervalsInFullChunks, FullBlocksPerFullChunk * i, FullBlocksPerFullChunk);
         appendBlocks(chunk, LeftoverBlockIntervalsInFullChunks, LeftoverBlocksPerFullChunk * i,
                      LeftoverBlocksPerFullChunk);
      }

      for (std::size_t i = 0; i < LeftoverChunks; ++i) {
         // leftover chunks are stored after all full chunks
         Intervals_t &chunk = ChunksIntervals[FullChunks + i];
         chunk.reserve(BlockPerLeftoverChunk);
         appendBlocks(chunk, FullBlockIntervalsInLeftoverChunks, FullBlocksPerLeftoverChunk * i,
                      FullBlocksPerLeftoverChunk);
         appendBlocks(chunk, LeftoverBlockIntervalsInLeftoverChunks, LeftoverBlocksPerLeftoverChunk * i,
                      LeftoverBlocksPerLeftoverChunk);
      }
   }

   //////////////////////////////////////////////////////////////////////////
   /// \brief Fills a vector with the size of every chunk from the dataset
   ///
   /// The size of a chunk is the sum of `end - start` over its block intervals.
   /// ChunksSizes is rebuilt from scratch, so its entries always match the current ChunksIntervals.
   /// \throws std::logic_error if ChunksIntervals holds fewer than Chunks entries,
   ///         e.g. because CreateChunksIntervals() has not been called
   void SizeOfChunks()
   {
      if (ChunksIntervals.size() < Chunks) {
         throw std::logic_error("RChunkConstructor::SizeOfChunks: chunk intervals have not been created");
      }

      ChunksSizes.clear();
      ChunksSizes.reserve(Chunks);

      for (std::size_t i = 0; i < Chunks; ++i) {
         std::size_t chunkSize = 0;
         for (const auto &interval : ChunksIntervals[i]) {
            const std::size_t start = interval.first;
            const std::size_t end = interval.second;
            chunkSize += end - start;
         }

         ChunksSizes.push_back(chunkSize);
      }
   }
};
0241 } // namespace Internal
0242 } // namespace Experimental
0243 } // namespace TMVA
0244 
0245 #endif // TMVA_RCHUNKCONSTRUCTOR