File indexing completed on 2025-12-15 10:28:51
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015 #ifndef TMVA_RCHUNKLOADER
0016 #define TMVA_RCHUNKLOADER
0017
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <list>
#include <memory>
#include <numeric>
#include <random>
#include <set>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

#include "TMVA/RTensor.hxx"
#include "TMVA/BatchGenerator/RChunkConstructor.hxx"
#include "ROOT/RDataFrame.hxx"
#include "ROOT/RDF/Utils.hxx"
#include "ROOT/RVec.hxx"

#include "ROOT/RLogger.hxx"
0028
0029 namespace TMVA {
0030 namespace Experimental {
0031 namespace Internal {
0032
0033
0034
0035
0036
0037
0038
0039
0040 template <typename... ColTypes>
0041 class RChunkLoaderFunctor {
0042
0043 std::size_t fOffset{};
0044 std::size_t fVecSizeIdx{};
0045 float fVecPadding{};
0046 std::vector<std::size_t> fMaxVecSizes{};
0047 TMVA::Experimental::RTensor<float> &fChunkTensor;
0048
0049 std::size_t fNumChunkCols;
0050
0051 int fI;
0052 int fNumColumns;
0053
0054
0055
0056 template <typename T, std::enable_if_t<ROOT::Internal::RDF::IsDataContainer<T>::value, int> = 0>
0057 void AssignToTensor(const T &vec, int i, int numColumns)
0058 {
0059 std::size_t max_vec_size = fMaxVecSizes[fVecSizeIdx++];
0060 std::size_t vec_size = vec.size();
0061 if (vec_size < max_vec_size)
0062 {
0063 std::copy(vec.begin(), vec.end(), &fChunkTensor.GetData()[fOffset + numColumns * i]);
0064 std::fill(&fChunkTensor.GetData()[fOffset + numColumns * i + vec_size],
0065 &fChunkTensor.GetData()[fOffset + numColumns * i + max_vec_size], fVecPadding);
0066 } else
0067 {
0068 std::copy(vec.begin(), vec.begin() + max_vec_size, &fChunkTensor.GetData()[fOffset + numColumns * i]);
0069 }
0070 fOffset += max_vec_size;
0071 }
0072
0073
0074
0075 template <typename T, std::enable_if_t<!ROOT::Internal::RDF::IsDataContainer<T>::value, int> = 0>
0076 void AssignToTensor(const T &val, int i, int numColumns)
0077 {
0078 fChunkTensor.GetData()[fOffset + numColumns * i] = val;
0079 fOffset++;
0080
0081 }
0082
0083 public:
0084 RChunkLoaderFunctor(TMVA::Experimental::RTensor<float> &chunkTensor, std::size_t numColumns,
0085 const std::vector<std::size_t> &maxVecSizes, float vecPadding, int i)
0086 : fChunkTensor(chunkTensor), fMaxVecSizes(maxVecSizes), fVecPadding(vecPadding), fI(i), fNumColumns(numColumns)
0087 {
0088 }
0089
0090 void operator()(const ColTypes &...cols)
0091 {
0092 fVecSizeIdx = 0;
0093 (AssignToTensor(cols, fI, fNumColumns), ...);
0094 }
0095 };
0096
0097
0098
0099
0100
0101
0102
0103
0104
0105
0106 template <typename... Args>
0107 class RChunkLoader {
0108 private:
0109
0110 std::size_t fNumEntries;
0111 std::size_t fChunkSize;
0112 std::size_t fBlockSize;
0113 float fValidationSplit;
0114
0115 std::vector<std::size_t> fVecSizes;
0116 std::size_t fSumVecSizes;
0117 std::size_t fVecPadding;
0118 std::size_t fNumChunkCols;
0119
0120 std::size_t fNumTrainEntries;
0121 std::size_t fNumValidationEntries;
0122
0123 ROOT::RDF::RNode &f_rdf;
0124 std::vector<std::string> fCols;
0125 std::size_t fNumCols;
0126 std::size_t fSetSeed;
0127
0128 bool fNotFiltered;
0129 bool fShuffle;
0130
0131 ROOT::RDF::RResultPtr<std::vector<ULong64_t>> fEntries;
0132
0133 std::unique_ptr<RChunkConstructor> fTraining;
0134 std::unique_ptr<RChunkConstructor> fValidation;
0135
0136 public:
0137 RChunkLoader(ROOT::RDF::RNode &rdf, std::size_t numEntries,
0138 ROOT::RDF::RResultPtr<std::vector<ULong64_t>> rdf_entries, const std::size_t chunkSize,
0139 const std::size_t blockSize, const float validationSplit, const std::vector<std::string> &cols,
0140 const std::vector<std::size_t> &vecSizes = {}, const float vecPadding = 0.0, bool shuffle = true,
0141 const std::size_t setSeed = 0)
0142 : f_rdf(rdf),
0143 fNumEntries(numEntries),
0144 fEntries(rdf_entries),
0145 fCols(cols),
0146 fVecSizes(vecSizes),
0147 fVecPadding(vecPadding),
0148 fChunkSize(chunkSize),
0149 fBlockSize(blockSize),
0150 fValidationSplit(validationSplit),
0151 fNotFiltered(f_rdf.GetFilterNames().empty()),
0152 fShuffle(shuffle),
0153 fSetSeed(setSeed)
0154 {
0155 fNumCols = fCols.size();
0156 fSumVecSizes = std::accumulate(fVecSizes.begin(), fVecSizes.end(), 0);
0157
0158 fNumChunkCols = fNumCols + fSumVecSizes - fVecSizes.size();
0159
0160
0161 fNumValidationEntries = static_cast<std::size_t>(fValidationSplit * fNumEntries);
0162 fNumTrainEntries = fNumEntries - fNumValidationEntries;
0163
0164 fTraining = std::make_unique<RChunkConstructor>(fNumTrainEntries, fChunkSize, fBlockSize);
0165 fValidation = std::make_unique<RChunkConstructor>(fNumValidationEntries, fChunkSize, fBlockSize);
0166 }
0167
0168
0169
0170 void SplitDataset()
0171 {
0172 std::random_device rd;
0173 std::mt19937 g;
0174
0175 if (fSetSeed == 0) {
0176 g.seed(rd());
0177 } else {
0178 g.seed(fSetSeed);
0179 }
0180
0181 std::vector<Long_t> BlockSizes = {};
0182
0183
0184 for (size_t i = 0; i < fTraining->NumberOfDifferentBlocks.size(); i++) {
0185 BlockSizes.insert(BlockSizes.end(), fTraining->NumberOfDifferentBlocks[i], fTraining->SizeOfBlocks[i]);
0186 }
0187
0188 for (size_t i = 0; i < fValidation->NumberOfDifferentBlocks.size(); i++) {
0189 BlockSizes.insert(BlockSizes.end(), fValidation->NumberOfDifferentBlocks[i], fValidation->SizeOfBlocks[i]);
0190 }
0191
0192
0193 std::vector<Long_t> indices(BlockSizes.size());
0194
0195 for (int i = 0; i < indices.size(); ++i) {
0196 indices[i] = i;
0197 }
0198
0199
0200 if (fShuffle) {
0201 std::shuffle(indices.begin(), indices.end(), g);
0202 }
0203
0204
0205 std::vector<Long_t> PermutedBlockSizes(BlockSizes.size());
0206 for (int i = 0; i < BlockSizes.size(); ++i) {
0207 PermutedBlockSizes[i] = BlockSizes[indices[i]];
0208 }
0209
0210
0211 std::vector<Long_t> BlockBoundaries(BlockSizes.size());
0212
0213
0214
0215 std::partial_sum(PermutedBlockSizes.begin(), PermutedBlockSizes.end(), BlockBoundaries.begin());
0216 BlockBoundaries.insert(BlockBoundaries.begin(), 0);
0217
0218
0219 std::vector<std::pair<Long_t, Long_t>> BlockIntervals;
0220 for (size_t i = 0; i < BlockBoundaries.size() - 1; ++i) {
0221 BlockIntervals.emplace_back(BlockBoundaries[i], BlockBoundaries[i + 1]);
0222 }
0223
0224
0225
0226 std::vector<std::pair<Long_t, Long_t>> UnpermutedBlockIntervals(BlockIntervals.size());
0227 for (int i = 0; i < BlockIntervals.size(); ++i) {
0228 UnpermutedBlockIntervals[indices[i]] = BlockIntervals[i];
0229 }
0230
0231
0232 fTraining->BlockIntervals.insert(fTraining->BlockIntervals.begin(), UnpermutedBlockIntervals.begin(),
0233 UnpermutedBlockIntervals.begin() + fTraining->NumberOfBlocks);
0234 fValidation->BlockIntervals.insert(fValidation->BlockIntervals.begin(),
0235 UnpermutedBlockIntervals.begin() + fTraining->NumberOfBlocks,
0236 UnpermutedBlockIntervals.end());
0237
0238
0239 fTraining->DistributeBlockIntervals();
0240 fValidation->DistributeBlockIntervals();
0241 }
0242
0243
0244
0245 void CreateTrainingChunksIntervals()
0246 {
0247
0248 std::random_device rd;
0249 std::mt19937 g;
0250
0251 if (fSetSeed == 0) {
0252 g.seed(rd());
0253 } else {
0254 g.seed(fSetSeed);
0255 }
0256
0257
0258 if (fShuffle) {
0259 std::shuffle(fTraining->FullBlockIntervalsInFullChunks.begin(),
0260 fTraining->FullBlockIntervalsInFullChunks.end(), g);
0261 std::shuffle(fTraining->LeftoverBlockIntervalsInFullChunks.begin(),
0262 fTraining->LeftoverBlockIntervalsInFullChunks.end(), g);
0263 std::shuffle(fTraining->FullBlockIntervalsInLeftoverChunks.begin(),
0264 fTraining->FullBlockIntervalsInLeftoverChunks.end(), g);
0265 std::shuffle(fTraining->LeftoverBlockIntervalsInLeftoverChunks.begin(),
0266 fTraining->LeftoverBlockIntervalsInLeftoverChunks.end(), g);
0267 }
0268
0269
0270 fTraining->ChunksIntervals = {};
0271 fTraining->ChunksSizes = {};
0272
0273
0274 fTraining->CreateChunksIntervals();
0275
0276 if (fShuffle) {
0277 std::shuffle(fTraining->ChunksIntervals.begin(), fTraining->ChunksIntervals.end(), g);
0278 }
0279
0280 fTraining->SizeOfChunks();
0281 }
0282
0283
0284
0285 void CreateValidationChunksIntervals()
0286 {
0287 std::random_device rd;
0288 std::mt19937 g;
0289
0290 if (fSetSeed == 0) {
0291 g.seed(rd());
0292 } else {
0293 g.seed(fSetSeed);
0294 }
0295
0296 if (fShuffle) {
0297 std::shuffle(fValidation->FullBlockIntervalsInFullChunks.begin(),
0298 fValidation->FullBlockIntervalsInFullChunks.end(), g);
0299 std::shuffle(fValidation->LeftoverBlockIntervalsInFullChunks.begin(),
0300 fValidation->LeftoverBlockIntervalsInFullChunks.end(), g);
0301 std::shuffle(fValidation->FullBlockIntervalsInLeftoverChunks.begin(),
0302 fValidation->FullBlockIntervalsInLeftoverChunks.end(), g);
0303 std::shuffle(fValidation->LeftoverBlockIntervalsInLeftoverChunks.begin(),
0304 fValidation->LeftoverBlockIntervalsInLeftoverChunks.end(), g);
0305 }
0306
0307 fValidation->ChunksIntervals = {};
0308 fValidation->ChunksSizes = {};
0309
0310 fValidation->CreateChunksIntervals();
0311
0312 if (fShuffle) {
0313 std::shuffle(fValidation->ChunksIntervals.begin(), fValidation->ChunksIntervals.end(), g);
0314 }
0315
0316 fValidation->SizeOfChunks();
0317 }
0318
0319
0320
0321
0322
0323 void LoadTrainingChunk(TMVA::Experimental::RTensor<float> &TrainChunkTensor, std::size_t chunk)
0324 {
0325
0326 std::random_device rd;
0327 std::mt19937 g;
0328
0329 if (fSetSeed == 0) {
0330 g.seed(rd());
0331 } else {
0332 g.seed(fSetSeed);
0333 }
0334
0335 std::size_t chunkSize = fTraining->ChunksSizes[chunk];
0336
0337 if (chunk < fTraining->Chunks) {
0338 TMVA::Experimental::RTensor<float> Tensor({chunkSize, fNumChunkCols});
0339 TrainChunkTensor = TrainChunkTensor.Resize({{chunkSize, fNumChunkCols}});
0340
0341
0342 std::vector<int> indices(chunkSize);
0343 std::iota(indices.begin(), indices.end(), 0);
0344
0345
0346 if (fShuffle) {
0347 std::shuffle(indices.begin(), indices.end(), g);
0348 }
0349
0350
0351 std::size_t chunkEntry = 0;
0352 std::vector<std::pair<Long_t, Long_t>> BlocksInChunk = fTraining->ChunksIntervals[chunk];
0353
0354 std::sort(BlocksInChunk.begin(), BlocksInChunk.end(),
0355 [](const std::pair<Long_t, Long_t>& a, const std::pair<Long_t, Long_t>& b) {
0356 return a.first < b.first;
0357 });
0358
0359 for (std::size_t i = 0; i < BlocksInChunk.size(); i++) {
0360
0361
0362 if (fNotFiltered) {
0363 RChunkLoaderFunctor<Args...> func(Tensor, fNumChunkCols, fVecSizes, fVecPadding, chunkEntry);
0364 ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, BlocksInChunk[i].first, BlocksInChunk[i].second);
0365
0366 f_rdf.Foreach(func, fCols);
0367 chunkEntry += BlocksInChunk[i].second - BlocksInChunk[i].first;
0368 }
0369
0370
0371 else {
0372 std::size_t blockSize = BlocksInChunk[i].second - BlocksInChunk[i].first;
0373 for (std::size_t j = 0; j < blockSize; j++) {
0374 RChunkLoaderFunctor<Args...> func(Tensor, fNumChunkCols, fVecSizes, fVecPadding, chunkEntry);
0375 ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, (*fEntries)[BlocksInChunk[i].first + j],
0376 (*fEntries)[BlocksInChunk[i].first + j + 1]);
0377 f_rdf.Foreach(func, fCols);
0378 chunkEntry++;
0379 }
0380 }
0381 }
0382
0383
0384 for (std::size_t i = 0; i < chunkSize; i++) {
0385 std::copy(Tensor.GetData() + indices[i] * fNumChunkCols,
0386 Tensor.GetData() + (indices[i] + 1) * fNumChunkCols,
0387 TrainChunkTensor.GetData() + i * fNumChunkCols);
0388 }
0389 }
0390 }
0391
0392
0393
0394
0395
0396 void LoadValidationChunk(TMVA::Experimental::RTensor<float> &ValidationChunkTensor, std::size_t chunk)
0397 {
0398
0399 std::random_device rd;
0400 std::mt19937 g;
0401
0402 if (fSetSeed == 0) {
0403 g.seed(rd());
0404 } else {
0405 g.seed(fSetSeed);
0406 }
0407
0408 std::size_t chunkSize = fValidation->ChunksSizes[chunk];
0409
0410 if (chunk < fValidation->Chunks) {
0411 TMVA::Experimental::RTensor<float> Tensor({chunkSize, fNumChunkCols});
0412 ValidationChunkTensor = ValidationChunkTensor.Resize({{chunkSize, fNumChunkCols}});
0413
0414
0415 std::vector<int> indices(chunkSize);
0416 std::iota(indices.begin(), indices.end(), 0);
0417
0418
0419 if (fShuffle) {
0420 std::shuffle(indices.begin(), indices.end(), g);
0421 }
0422
0423 std::size_t chunkEntry = 0;
0424 std::vector<std::pair<Long_t, Long_t>> BlocksInChunk = fValidation->ChunksIntervals[chunk];
0425
0426 std::sort(BlocksInChunk.begin(), BlocksInChunk.end(),
0427 [](const std::pair<Long_t, Long_t>& a, const std::pair<Long_t, Long_t>& b) {
0428 return a.first < b.first;
0429 });
0430
0431 for (std::size_t i = 0; i < BlocksInChunk.size(); i++) {
0432
0433
0434 if (fNotFiltered) {
0435 RChunkLoaderFunctor<Args...> func(Tensor, fNumChunkCols, fVecSizes, fVecPadding, chunkEntry);
0436 ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, BlocksInChunk[i].first, BlocksInChunk[i].second);
0437 f_rdf.Foreach(func, fCols);
0438 chunkEntry += BlocksInChunk[i].second - BlocksInChunk[i].first;
0439 }
0440
0441
0442 else {
0443 std::size_t blockSize = BlocksInChunk[i].second - BlocksInChunk[i].first;
0444 for (std::size_t j = 0; j < blockSize; j++) {
0445 RChunkLoaderFunctor<Args...> func(Tensor, fNumChunkCols, fVecSizes, fVecPadding, chunkEntry);
0446 ROOT::Internal::RDF::ChangeBeginAndEndEntries(f_rdf, (*fEntries)[BlocksInChunk[i].first + j],
0447 (*fEntries)[BlocksInChunk[i].first + j + 1]);
0448
0449 f_rdf.Foreach(func, fCols);
0450 chunkEntry++;
0451 }
0452 }
0453 }
0454
0455
0456 for (std::size_t i = 0; i < chunkSize; i++) {
0457 std::copy(Tensor.GetData() + indices[i] * fNumChunkCols,
0458 Tensor.GetData() + (indices[i] + 1) * fNumChunkCols,
0459 ValidationChunkTensor.GetData() + i * fNumChunkCols);
0460 }
0461 }
0462 }
0463
0464 std::vector<std::size_t> GetTrainingChunkSizes() { return fTraining->ChunksSizes; }
0465 std::vector<std::size_t> GetValidationChunkSizes() { return fValidation->ChunksSizes; }
0466
0467 std::size_t GetNumTrainingEntries() { return fNumTrainEntries; }
0468 std::size_t GetNumValidationEntries() { return fNumValidationEntries; }
0469
0470 void CheckIfUnique(TMVA::Experimental::RTensor<float> &Tensor)
0471 {
0472 auto tensorSize = Tensor.GetSize();
0473 TMVA::Experimental::RTensor<float> SqueezeTensor = Tensor.Reshape({1, tensorSize}).Squeeze();
0474
0475 std::list<int> allEntries;
0476 for (int i = 0; i < tensorSize; i++) {
0477 allEntries.push_back(SqueezeTensor(0, i));
0478 }
0479 allEntries.sort();
0480 allEntries.unique();
0481 if (allEntries.size() == tensorSize) {
0482 std::cout << "Tensor consists of only unique elements" << std::endl;
0483 }
0484 };
0485
0486 void CheckIfOverlap(TMVA::Experimental::RTensor<float> &Tensor1, TMVA::Experimental::RTensor<float> &Tensor2)
0487 {
0488 auto tensorSize1 = Tensor1.GetSize();
0489 TMVA::Experimental::RTensor<float> SqueezeTensor1 = Tensor1.Reshape({1, tensorSize1}).Squeeze();
0490
0491 std::list<int> allEntries1;
0492 for (int i = 0; i < tensorSize1; i++) {
0493 allEntries1.push_back(SqueezeTensor1(0, i));
0494 }
0495
0496 auto tensorSize2 = Tensor2.GetSize();
0497 TMVA::Experimental::RTensor<float> SqueezeTensor2 = Tensor2.Reshape({1, tensorSize2}).Squeeze();
0498
0499 std::list<int> allEntries2;
0500 for (int i = 0; i < tensorSize2; i++) {
0501 allEntries2.push_back(SqueezeTensor2(0, i));
0502 }
0503
0504 std::set<int> result;
0505
0506
0507
0508
0509 std::set<int> set1(allEntries1.begin(), allEntries1.end());
0510 std::set<int> set2(allEntries2.begin(), allEntries2.end());
0511 std::set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(), inserter(result, result.begin()));
0512
0513
0514 if (result.size() == 0) {
0515 std::cout << "No overlap between the tensors" << std::endl;
0516 } else {
0517 std::cout << "Intersection between tensors: ";
0518 for (int num : result) {
0519 std::cout << num << " ";
0520 }
0521 std::cout << std::endl;
0522 }
0523 };
0524
0525 std::size_t GetNumTrainingChunks() { return fTraining->Chunks; }
0526
0527 std::size_t GetNumValidationChunks() { return fValidation->Chunks; }
0528 };
0529
0530 }
0531 }
0532 }
0533 #endif