Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 10:11:04

0001 #ifndef TMVA_CHUNKLOADER
0002 #define TMVA_CHUNKLOADER
0003 
0004 #include <iostream>
0005 #include <vector>
0006 
0007 #include "TMVA/RTensor.hxx"
0008 #include "ROOT/RDataFrame.hxx"
0009 #include "ROOT/RVec.hxx"
0010 
0011 #include "ROOT/RLogger.hxx"
0012 
0013 namespace TMVA {
0014 namespace Experimental {
0015 namespace Internal {
0016 
0017 // RChunkLoader class used to load content of a RDataFrame onto a RTensor.
0018 template <typename First, typename... Rest>
0019 class RChunkLoaderFunctor {
0020 
0021 private:
0022    std::size_t fOffset = 0;
0023    std::size_t fVecSizeIdx = 0;
0024    std::vector<std::size_t> fMaxVecSizes;
0025 
0026    float fVecPadding;
0027 
0028    TMVA::Experimental::RTensor<float> &fChunkTensor;
0029 
0030    /// \brief Load the final given value into fChunkTensor
0031    /// \tparam First_T
0032    /// \param first
0033    template <typename First_T>
0034    void AssignToTensor(First_T first)
0035    {
0036       fChunkTensor.GetData()[fOffset++] = first;
0037    }
0038 
0039    /// \brief Load the final given value into fChunkTensor
0040    /// \tparam VecType
0041    /// \param first
0042    template <typename VecType>
0043    void AssignToTensor(const ROOT::RVec<VecType> &first)
0044    {
0045       AssignVector(first);
0046    }
0047 
0048    /// \brief Recursively loop through the given values, and load them onto the fChunkTensor
0049    /// \tparam First_T
0050    /// \tparam ...Rest_T
0051    /// \param first
0052    /// \param ...rest
0053    template <typename First_T, typename... Rest_T>
0054    void AssignToTensor(First_T first, Rest_T... rest)
0055    {
0056       fChunkTensor.GetData()[fOffset++] = first;
0057 
0058       AssignToTensor(std::forward<Rest_T>(rest)...);
0059    }
0060 
0061    /// \brief Recursively loop through the given values, and load them onto the fChunkTensor
0062    /// \tparam VecType
0063    /// \tparam ...Rest_T
0064    /// \param first
0065    /// \param ...rest
0066    template <typename VecType, typename... Rest_T>
0067    void AssignToTensor(const ROOT::RVec<VecType> &first, Rest_T... rest)
0068    {
0069       AssignVector(first);
0070 
0071       AssignToTensor(std::forward<Rest_T>(rest)...);
0072    }
0073 
0074    /// \brief Loop through the values of a given vector and load them into the RTensor
0075    /// Note: the given vec_size does not have to be the same size as the given vector
0076    ///       If the size is bigger than the given vector, zeros are used as padding.
0077    ///       If the size is smaller, the remaining values are ignored.
0078    /// \tparam VecType
0079    /// \param vec
0080    template <typename VecType>
0081    void AssignVector(const ROOT::RVec<VecType> &vec)
0082    {
0083       std::size_t max_vec_size = fMaxVecSizes[fVecSizeIdx++];
0084       std::size_t vec_size = vec.size();
0085 
0086       for (std::size_t i = 0; i < max_vec_size; i++) {
0087          if (i < vec_size) {
0088             fChunkTensor.GetData()[fOffset++] = vec[i];
0089          } else {
0090             fChunkTensor.GetData()[fOffset++] = fVecPadding;
0091          }
0092       }
0093    }
0094 
0095 public:
0096    RChunkLoaderFunctor(TMVA::Experimental::RTensor<float> &chunkTensor,
0097                        const std::vector<std::size_t> &maxVecSizes = std::vector<std::size_t>(),
0098                        const float vecPadding = 0.0)
0099       : fChunkTensor(chunkTensor), fMaxVecSizes(maxVecSizes), fVecPadding(vecPadding)
0100    {
0101    }
0102 
0103    /// \brief Loop through all columns of an event and put their values into an RTensor
0104    /// \param first
0105    /// \param ...rest
0106    void operator()(First first, Rest... rest)
0107    {
0108       fVecSizeIdx = 0;
0109       AssignToTensor(std::forward<First>(first), std::forward<Rest>(rest)...);
0110    }
0111 };
0112 
0113 template <typename... Args>
0114 class RChunkLoader {
0115 
0116 private:
0117    std::string fTreeName;
0118    std::string fFileName;
0119    std::size_t fChunkSize;
0120    std::size_t fNumColumns;
0121 
0122    std::vector<std::string> fCols;
0123    std::string fFilters;
0124 
0125    std::vector<std::size_t> fVecSizes;
0126    std::size_t fVecPadding;
0127 
0128 public:
0129    /// \brief Constructor for the RChunkLoader
0130    /// \param treeName
0131    /// \param fileName
0132    /// \param chunkSize
0133    /// \param cols
0134    /// \param filters
0135    /// \param vecSizes
0136    /// \param vecPadding
0137    RChunkLoader(const std::string &treeName, const std::string &fileName, const std::size_t chunkSize,
0138                 const std::vector<std::string> &cols, const std::string &filters = "",
0139                 const std::vector<std::size_t> &vecSizes = {}, const float vecPadding = 0.0)
0140       : fTreeName(treeName),
0141         fFileName(fileName),
0142         fChunkSize(chunkSize),
0143         fCols(cols),
0144         fFilters(filters),
0145         fVecSizes(vecSizes),
0146         fVecPadding(vecPadding),
0147         fNumColumns(cols.size())
0148    {
0149    }
0150 
0151    /// \brief Load a chunk of data using the RChunkLoaderFunctor
0152    /// \param chunkTensor
0153    /// \param currentRow
0154    /// \return A pair of size_t defining the number of events processed and how many passed all filters
0155    std::pair<std::size_t, std::size_t>
0156    LoadChunk(TMVA::Experimental::RTensor<float> &chunkTensor, const std::size_t currentRow)
0157    {
0158       RChunkLoaderFunctor<Args...> func(chunkTensor, fVecSizes, fVecPadding);
0159 
0160       // Create TDataFrame of the chunk
0161       // Use RDatasetSpec to start reading at the current row
0162       long long start_l = currentRow;
0163       ROOT::RDF::Experimental::RDatasetSpec x_spec =
0164          ROOT::RDF::Experimental::RDatasetSpec()
0165             .AddSample({"", fTreeName, fFileName})
0166             .WithGlobalRange({start_l, std::numeric_limits<Long64_t>::max()});
0167 
0168       ROOT::RDataFrame x_rdf(x_spec);
0169 
0170       // Load events if filters are given
0171       if (fFilters.size() > 0) {
0172          return loadFiltered(x_rdf, func);
0173       }
0174 
0175       // load events if no filters are given
0176       return loadNonFiltered(x_rdf, func);
0177    }
0178 
0179 private:
0180    /// \brief Add filters to the RDataFrame and load a chunk of data
0181    /// \param x_rdf
0182    /// \param func
0183    /// \return A pair of size_t defining the number of events processed and how many passed all filters
0184    std::pair<std::size_t, std::size_t> loadFiltered(ROOT::RDataFrame &x_rdf, RChunkLoaderFunctor<Args...> &func)
0185    {
0186       // Add the given filters to the RDataFrame
0187       auto x_filter = x_rdf.Filter(fFilters, "RBatchGenerator_Filter");
0188 
0189       // add range to the DataFrame
0190       auto x_ranged = x_filter.Range(fChunkSize);
0191       auto myReport = x_ranged.Report();
0192 
0193       // load data
0194       x_ranged.Foreach(func, fCols);
0195 
0196       // Use the report to gather the number of events processed and passed.
0197       // passed_events is used to determine the starting event of the next chunk
0198       // processed_events is used to determine if the end of the database is reached.
0199       std::size_t processed_events = myReport.begin()->GetAll();
0200       std::size_t passed_events = (myReport.end() - 1)->GetPass();
0201 
0202       return std::make_pair(processed_events, passed_events);
0203    }
0204 
0205    /// \brief Loop over the events in the dataframe untill either the end of the dataframe
0206    /// is reached, or a full chunk is loaded
0207    /// \param x_rdf
0208    /// \param func
0209    /// \return A pair of size_t defining the number of events processed and how many passed all filters
0210    std::pair<std::size_t, std::size_t> loadNonFiltered(ROOT::RDataFrame &x_rdf, RChunkLoaderFunctor<Args...> &func)
0211    {
0212       // add range
0213       auto x_ranged = x_rdf.Range(fChunkSize);
0214       // auto x_ranged = x_rdf.Range(currentRow, currentRow + fChunkSize);
0215       auto myCount = x_ranged.Count();
0216 
0217       // load data
0218       x_ranged.Foreach(func, fCols);
0219 
0220       // get loading info
0221       std::size_t processed_events = myCount.GetValue();
0222       std::size_t passed_events = myCount.GetValue();
0223       return std::make_pair(processed_events, passed_events);
0224    }
0225 };
0226 
0227 } // namespace Internal
0228 } // namespace Experimental
0229 } // namespace TMVA
0230 #endif // TMVA_CHUNKLOADER