Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-12-15 10:28:51

0001 // Author: Dante Niewenhuis, VU Amsterdam 07/2023
0002 // Author: Kristupas Pranckietis, Vilnius University 05/2024
0003 // Author: Nopphakorn Subsa-Ard, King Mongkut's University of Technology Thonburi (KMUTT) (TH) 08/2024
0004 // Author: Vincenzo Eduardo Padulano, CERN 10/2024
0005 // Author: Martin Føll, University of Oslo (UiO) & CERN 05/2025
0006 
0007 /*************************************************************************
0008  * Copyright (C) 1995-2025, Rene Brun and Fons Rademakers.               *
0009  * All rights reserved.                                                  *
0010  *                                                                       *
0011  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0012  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0013  *************************************************************************/
0014 
0015 #ifndef TMVA_RBATCHLOADER
0016 #define TMVA_RBATCHLOADER
0017 
0018 #include <vector>
0019 #include <memory>
0020 #include <numeric>
0021 
0022 // Imports for threading
0023 #include <queue>
0024 #include <mutex>
0025 #include <condition_variable>
0026 
0027 #include "TMVA/RTensor.hxx"
0028 #include "TMVA/Tools.h"
0029 
0030 namespace TMVA::Experimental::Internal {
0031 
0032 /**
0033 \class TMVA::Experimental::Internal::RBatchLoader
0034 \ingroup tmva
0035 \brief Building and loading the batches from loaded chunks in RChunkLoader
0036 
0037 This class splits the chunks that are loaded into memory (see RChunkLoader) into the batches used in the ML
0038 training and places them in a queue. This is done for the training and validation chunks separately.
0039 */
0040 
0041 class RBatchLoader {
0042 private:
0043    std::size_t fBatchSize;
0044    std::size_t fNumColumns;
0045 
0046    bool fIsActive = false;
0047 
0048    std::mutex fBatchLock;
0049    std::condition_variable fBatchCondition;
0050 
0051    // queuse of tensors of the training and validation batches
0052    std::queue<std::unique_ptr<TMVA::Experimental::RTensor<float>>> fTrainingBatchQueue;
0053    std::queue<std::unique_ptr<TMVA::Experimental::RTensor<float>>> fValidationBatchQueue;
0054 
0055    // number of training and validation batches in the queue
0056    std::size_t fNumTrainingBatchQueue;
0057    std::size_t fNumValidationBatchQueue;
0058 
0059    // current batch that is loaded into memory
0060    std::unique_ptr<TMVA::Experimental::RTensor<float>> fCurrentBatch;
0061 
0062    // primary and secondary batches used to create batches from a chunk
0063    std::unique_ptr<TMVA::Experimental::RTensor<float>> fPrimaryLeftoverTrainingBatch;
0064    std::unique_ptr<TMVA::Experimental::RTensor<float>> fSecondaryLeftoverTrainingBatch;
0065 
0066    std::unique_ptr<TMVA::Experimental::RTensor<float>> fPrimaryLeftoverValidationBatch;
0067    std::unique_ptr<TMVA::Experimental::RTensor<float>> fSecondaryLeftoverValidationBatch;
0068 
0069 public:
0070    RBatchLoader(std::size_t batchSize, std::size_t numColumns) : fBatchSize(batchSize), fNumColumns(numColumns)
0071    {
0072 
0073       fPrimaryLeftoverTrainingBatch =
0074          std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
0075       fSecondaryLeftoverTrainingBatch =
0076          std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
0077 
0078       fPrimaryLeftoverValidationBatch =
0079          std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
0080       fSecondaryLeftoverValidationBatch =
0081          std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
0082 
0083       fNumTrainingBatchQueue = fTrainingBatchQueue.size();
0084       fNumValidationBatchQueue = fValidationBatchQueue.size();
0085    }
0086 
0087 public:
0088    void Activate()
0089    {
0090       {
0091          std::lock_guard<std::mutex> lock(fBatchLock);
0092          fIsActive = true;
0093       }
0094       fBatchCondition.notify_all();
0095    }
0096 
0097    /// \brief DeActivate the batchloader. This means that no more batches are created.
0098    /// Batches can still be returned if they are already loaded
0099    void DeActivate()
0100    {
0101       {
0102          std::lock_guard<std::mutex> lock(fBatchLock);
0103          fIsActive = false;
0104       }
0105       fBatchCondition.notify_all();
0106    }
0107 
0108    /// \brief Return a batch of data as a unique pointer.
0109    /// After the batch has been processed, it should be destroyed.
0110    /// \param[in] chunkTensor RTensor with the data from the chunk
0111    /// \param[in] idxs Index of batch in the chunk
0112    /// \return Training batch
0113    std::unique_ptr<TMVA::Experimental::RTensor<float>>
0114    CreateBatch(TMVA::Experimental::RTensor<float> &chunkTensor, std::size_t idxs)
0115    {
0116       auto batch =
0117          std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>({fBatchSize, fNumColumns}));
0118       std::copy(chunkTensor.GetData() + (idxs * fBatchSize * fNumColumns),
0119                 chunkTensor.GetData() + ((idxs + 1) * fBatchSize * fNumColumns), batch->GetData());
0120 
0121       return batch;
0122    }
0123 
0124    /// \brief Loading the training batch from the queue
0125    /// \return Training batch
0126    TMVA::Experimental::RTensor<float> GetTrainBatch()
0127    {
0128 
0129       if (fTrainingBatchQueue.empty()) {
0130          fCurrentBatch = std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>({0}));
0131          return *fCurrentBatch;
0132       }
0133 
0134       fCurrentBatch = std::move(fTrainingBatchQueue.front());
0135       fTrainingBatchQueue.pop();
0136 
0137       return *fCurrentBatch;
0138    }
0139 
0140    /// \brief Loading the validation batch from the queue
0141    /// \return Training batch
0142    TMVA::Experimental::RTensor<float> GetValidationBatch()
0143    {
0144 
0145       if (fValidationBatchQueue.empty()) {
0146          fCurrentBatch = std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>({0}));
0147          return *fCurrentBatch;
0148       }
0149 
0150       fCurrentBatch = std::move(fValidationBatchQueue.front());
0151       fValidationBatchQueue.pop();
0152 
0153       return *fCurrentBatch;
0154    }
0155 
0156    /// \brief Creating the training batches from a chunk and add them to the queue.
0157    /// \param[in] chunkTensor RTensor with the data from the chunk
0158    /// \param[in] lastbatch Check if the batch in the chunk is the last one
0159    /// \param[in] leftoverBatchSize Size of the leftover batch in the training dataset
0160    /// \param[in] dromRemainder Bool to drop the remainder batch or not
0161    void CreateTrainingBatches(TMVA::Experimental::RTensor<float> &chunkTensor, int lastbatch,
0162                               std::size_t leftoverBatchSize, bool dropRemainder)
0163    {
0164       std::size_t ChunkSize = chunkTensor.GetShape()[0];
0165       std::size_t Batches = ChunkSize / fBatchSize;
0166       std::size_t LeftoverBatchSize = ChunkSize % fBatchSize;
0167 
0168       // create a vector of batches
0169       std::vector<std::unique_ptr<TMVA::Experimental::RTensor<float>>> batches;
0170 
0171       // fill the full batches from the chunk into a vector
0172       for (std::size_t i = 0; i < Batches; i++) {
0173          // Fill a batch
0174          batches.emplace_back(CreateBatch(chunkTensor, i));
0175       }
0176 
0177       // copy the remaining entries from the chunk into a leftover batch
0178       TMVA::Experimental::RTensor<float> LeftoverBatch({LeftoverBatchSize, fNumColumns});
0179       std::copy(chunkTensor.GetData() + (Batches * fBatchSize * fNumColumns),
0180                 chunkTensor.GetData() + (Batches * fBatchSize * fNumColumns + LeftoverBatchSize * fNumColumns),
0181                 LeftoverBatch.GetData());
0182 
0183       // calculate how many empty slots are left in fPrimaryLeftoverTrainingBatch
0184       std::size_t PrimaryLeftoverSize = (*fPrimaryLeftoverTrainingBatch).GetShape()[0];
0185       std::size_t emptySlots = fBatchSize - PrimaryLeftoverSize;
0186 
0187       // copy LeftoverBatch to end of fPrimaryLeftoverTrainingBatch
0188       if (emptySlots >= LeftoverBatchSize) {
0189          (*fPrimaryLeftoverTrainingBatch) =
0190             (*fPrimaryLeftoverTrainingBatch).Resize({PrimaryLeftoverSize + LeftoverBatchSize, fNumColumns});
0191          std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (LeftoverBatchSize * fNumColumns),
0192                    fPrimaryLeftoverTrainingBatch->GetData() + (PrimaryLeftoverSize * fNumColumns));
0193 
0194          // copy LeftoverBatch to end of fPrimaryLeftoverTrainingBatch and add it to the batch vector
0195          if (emptySlots == LeftoverBatchSize) {
0196             auto copy =
0197                std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{fBatchSize, fNumColumns});
0198             std::copy(fPrimaryLeftoverTrainingBatch->GetData(),
0199                       fPrimaryLeftoverTrainingBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
0200             batches.emplace_back(std::move(copy));
0201 
0202             // reset fPrimaryLeftoverTrainingBatch and fSecondaryLeftoverTrainingBatch
0203             *fPrimaryLeftoverTrainingBatch = *fSecondaryLeftoverTrainingBatch;
0204             fSecondaryLeftoverValidationBatch =
0205                std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
0206          }
0207       }
0208 
0209       // copy LeftoverBatch to both fPrimaryLeftoverTrainingBatch and fSecondaryLeftoverTrainingBatch
0210       else if (emptySlots < LeftoverBatchSize) {
0211          // copy the first part of LeftoverBatch to end of fPrimaryLeftoverTrainingBatch
0212          (*fPrimaryLeftoverTrainingBatch) = (*fPrimaryLeftoverTrainingBatch).Resize({fBatchSize, fNumColumns});
0213          std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (emptySlots * fNumColumns),
0214                    fPrimaryLeftoverTrainingBatch->GetData() + (PrimaryLeftoverSize * fNumColumns));
0215 
0216          // copy the last part of LeftoverBatch to the end of fSecondaryLeftoverTrainingBatch
0217          (*fSecondaryLeftoverTrainingBatch) =
0218             (*fSecondaryLeftoverTrainingBatch).Resize({LeftoverBatchSize - emptySlots, fNumColumns});
0219          std::copy(LeftoverBatch.GetData() + (emptySlots * fNumColumns),
0220                    LeftoverBatch.GetData() + (LeftoverBatchSize * fNumColumns),
0221                    fSecondaryLeftoverTrainingBatch->GetData());
0222 
0223          // add fPrimaryLeftoverTrainingBatch to the batch vector
0224          auto copy =
0225             std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{fBatchSize, fNumColumns});
0226          std::copy(fPrimaryLeftoverTrainingBatch->GetData(),
0227                    fPrimaryLeftoverTrainingBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
0228          batches.emplace_back(std::move(copy));
0229 
0230          // exchange fPrimaryLeftoverTrainingBatch and fSecondaryLeftoverValidationBatch
0231          *fPrimaryLeftoverTrainingBatch = *fSecondaryLeftoverTrainingBatch;
0232 
0233          // reset fSecondaryLeftoverValidationBatch
0234          fSecondaryLeftoverValidationBatch =
0235             std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
0236       }
0237 
0238       // copy the content of fPrimaryLeftoverTrainingBatch to the leftover batch from the chunk
0239       if (lastbatch == 1) {
0240 
0241          if (dropRemainder == false && leftoverBatchSize > 0) {
0242             auto copy = std::make_unique<TMVA::Experimental::RTensor<float>>(
0243                std::vector<std::size_t>{leftoverBatchSize, fNumColumns});
0244             std::copy((*fPrimaryLeftoverTrainingBatch).GetData(),
0245                       (*fPrimaryLeftoverTrainingBatch).GetData() + (leftoverBatchSize * fNumColumns), copy->GetData());
0246             batches.emplace_back(std::move(copy));
0247          }
0248 
0249          fPrimaryLeftoverTrainingBatch =
0250             std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
0251          fSecondaryLeftoverTrainingBatch =
0252             std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
0253       }
0254 
0255       // append the batches from the batch vector from the chunk to the training batch queue
0256       for (std::size_t i = 0; i < batches.size(); i++) {
0257          fTrainingBatchQueue.push(std::move(batches[i]));
0258       }
0259    }
0260 
0261    /// \brief Creating the validation batches from a chunk and adding them to the queue
0262    /// \param[in] chunkTensor RTensor with the data from the chunk
0263    /// \param[in] lastbatch Check if the batch in the chunk is the last one
0264    /// \param[in] leftoverBatchSize Size of the leftover batch in the validation dataset
0265    /// \param[in] dromRemainder Bool to drop the remainder batch or not
0266    void CreateValidationBatches(TMVA::Experimental::RTensor<float> &chunkTensor, std::size_t lastbatch,
0267                                 std::size_t leftoverBatchSize, bool dropRemainder)
0268    {
0269       std::size_t ChunkSize = chunkTensor.GetShape()[0];
0270       std::size_t NumCols = chunkTensor.GetShape()[1];
0271       std::size_t Batches = ChunkSize / fBatchSize;
0272       std::size_t LeftoverBatchSize = ChunkSize % fBatchSize;
0273 
0274       std::vector<std::unique_ptr<TMVA::Experimental::RTensor<float>>> batches;
0275 
0276       for (std::size_t i = 0; i < Batches; i++) {
0277          // Fill a batch
0278          batches.emplace_back(CreateBatch(chunkTensor, i));
0279       }
0280 
0281       TMVA::Experimental::RTensor<float> LeftoverBatch({LeftoverBatchSize, NumCols});
0282       std::copy(chunkTensor.GetData() + (Batches * fBatchSize * NumCols),
0283                 chunkTensor.GetData() + (Batches * fBatchSize * NumCols + LeftoverBatchSize * NumCols),
0284                 LeftoverBatch.GetData());
0285 
0286       std::size_t PrimaryLeftoverSize = (*fPrimaryLeftoverValidationBatch).GetShape()[0];
0287       std::size_t emptySlots = fBatchSize - PrimaryLeftoverSize;
0288 
0289       if (emptySlots >= LeftoverBatchSize) {
0290          (*fPrimaryLeftoverValidationBatch) =
0291             (*fPrimaryLeftoverValidationBatch).Resize({PrimaryLeftoverSize + LeftoverBatchSize, NumCols});
0292          std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (LeftoverBatchSize * NumCols),
0293                    fPrimaryLeftoverValidationBatch->GetData() + (PrimaryLeftoverSize * NumCols));
0294 
0295          if (emptySlots == LeftoverBatchSize) {
0296             auto copy =
0297                std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{fBatchSize, fNumColumns});
0298             std::copy(fPrimaryLeftoverValidationBatch->GetData(),
0299                       fPrimaryLeftoverValidationBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
0300             batches.emplace_back(std::move(copy));
0301             *fPrimaryLeftoverValidationBatch = *fSecondaryLeftoverValidationBatch;
0302             fSecondaryLeftoverValidationBatch =
0303                std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
0304          }
0305       }
0306 
0307       else if (emptySlots < LeftoverBatchSize) {
0308          (*fPrimaryLeftoverValidationBatch) = (*fPrimaryLeftoverValidationBatch).Resize({fBatchSize, NumCols});
0309          std::copy(LeftoverBatch.GetData(), LeftoverBatch.GetData() + (emptySlots * NumCols),
0310                    fPrimaryLeftoverValidationBatch->GetData() + (PrimaryLeftoverSize * NumCols));
0311          (*fSecondaryLeftoverValidationBatch) =
0312             (*fSecondaryLeftoverValidationBatch).Resize({LeftoverBatchSize - emptySlots, NumCols});
0313          std::copy(LeftoverBatch.GetData() + (emptySlots * NumCols),
0314                    LeftoverBatch.GetData() + (LeftoverBatchSize * NumCols),
0315                    fSecondaryLeftoverValidationBatch->GetData());
0316          auto copy =
0317             std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{fBatchSize, fNumColumns});
0318          std::copy(fPrimaryLeftoverValidationBatch->GetData(),
0319                    fPrimaryLeftoverValidationBatch->GetData() + (fBatchSize * fNumColumns), copy->GetData());
0320          batches.emplace_back(std::move(copy));
0321          *fPrimaryLeftoverValidationBatch = *fSecondaryLeftoverValidationBatch;
0322          fSecondaryLeftoverValidationBatch =
0323             std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
0324       }
0325 
0326       if (lastbatch == 1) {
0327 
0328          if (dropRemainder == false && leftoverBatchSize > 0) {
0329             auto copy = std::make_unique<TMVA::Experimental::RTensor<float>>(
0330                std::vector<std::size_t>{leftoverBatchSize, fNumColumns});
0331             std::copy((*fPrimaryLeftoverValidationBatch).GetData(),
0332                       (*fPrimaryLeftoverValidationBatch).GetData() + (leftoverBatchSize * fNumColumns),
0333                       copy->GetData());
0334             batches.emplace_back(std::move(copy));
0335          }
0336          fPrimaryLeftoverValidationBatch =
0337             std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
0338          fSecondaryLeftoverValidationBatch =
0339             std::make_unique<TMVA::Experimental::RTensor<float>>(std::vector<std::size_t>{0, fNumColumns});
0340       }
0341 
0342       for (std::size_t i = 0; i < batches.size(); i++) {
0343          fValidationBatchQueue.push(std::move(batches[i]));
0344       }
0345    }
0346    std::size_t GetNumTrainingBatchQueue() { return fTrainingBatchQueue.size(); }
0347    std::size_t GetNumValidationBatchQueue() { return fValidationBatchQueue.size(); }
0348 };
0349 
0350 } // namespace TMVA::Experimental::Internal
0351 
0352 #endif // TMVA_RBATCHLOADER