Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 10:10:45

0001 /// \file RNTupleDS.hxx
0002 /// \ingroup NTuple ROOT7
0003 /// \author Jakob Blomer <jblomer@cern.ch>
0004 /// \author Enrico Guiraud <enrico.guiraud@cern.ch>
0005 /// \date 2018-10-04
0006 /// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
0007 /// is welcome!
0008 
0009 /*************************************************************************
0010  * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers.               *
0011  * All rights reserved.                                                  *
0012  *                                                                       *
0013  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0014  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0015  *************************************************************************/
0016 
0017 #ifndef ROOT_RNTupleDS
0018 #define ROOT_RNTupleDS
0019 
0020 #include <ROOT/RDataFrame.hxx>
0021 #include <ROOT/RDataSource.hxx>
0022 #include <ROOT/RNTupleUtil.hxx>
0023 #include <string_view>
0024 
0025 #include <cstdint>
0026 #include <memory>
0027 #include <string>
0028 #include <vector>
0029 #include <unordered_map>
0030 
0031 namespace ROOT {
0032 namespace Experimental {
0033 
0034 class RFieldBase; // held through std::unique_ptr in RNTupleDS::fProtoFields
0035 class RNTuple; // only used as a pointer parameter (RNTupleDS constructor, FromRNTuple)
0036 class RNTupleDescriptor; // held through std::unique_ptr and passed by const reference
0037 
0038 namespace Internal {
0039 class RNTupleColumnReader; // friend of RNTupleDS; referenced by raw pointer in fActiveColumnReaders
0040 class RPageSource; // held through std::unique_ptr (fPrincipalSource, REntryRangeDS::fSource)
0041 } // namespace Internal
0042 
0043 class RNTupleDS final : public ROOT::RDF::RDataSource {
        // RDataSource implementation for RNTuple input. It can be built from an ntuple name plus one file,
        // an ntuple name plus a list of files (chain), or an RNTuple object (see the public constructors).
0044    friend class Internal::RNTupleColumnReader;
0045 
0046    /// The PrepareNextRanges() method populates the fNextRanges list with REntryRangeDS records.
0047    /// The GetEntryRanges() method swaps fNextRanges and fCurrentRanges and uses the list of
0048    /// REntryRangeDS records to return the list of ranges ready to use by the RDF loop manager.
0049    struct REntryRangeDS {
0050       std::unique_ptr<ROOT::Experimental::Internal::RPageSource> fSource; ///< Page source the entry indices refer to
0051       ULong64_t fFirstEntry = 0; ///< First entry index in fSource
0052       /// End entry index in fSource, i.e. the number of entries in the range is fLastEntry - fFirstEntry
0053       ULong64_t fLastEntry = 0;
0054    };
0055 
0056    /// The first source is used to extract the schema and build the prototype fields. The page source
0057    /// is used to extract a clone of the descriptor to fPrincipalDescriptor. Afterwards it is moved
0058    /// into the first REntryRangeDS.
0059    std::unique_ptr<Internal::RPageSource> fPrincipalSource;
0060    /// A clone of the first page source's descriptor.
0061    std::unique_ptr<RNTupleDescriptor> fPrincipalDescriptor;
0062 
0063    /// The data source may be constructed with an ntuple name and a list of files
0064    std::string fNTupleName;
0065    std::vector<std::string> fFileNames;
0066    std::size_t fNextFileIndex = 0; ///< Index into fFileNames to the next file to process
0067 
0068    /// We prepare a prototype field for every column. If a column reader is actually requested
0069    /// in GetColumnReaders(), we move a clone of the field into a new column reader for RDataFrame.
0070    /// Only the clone connects to the backing page store and acquires I/O resources.
0071    /// The field IDs are set in the context of the first source and used as keys in fFieldId2QualifiedName.
0072    std::vector<std::unique_ptr<ROOT::Experimental::RFieldBase>> fProtoFields;
0073    /// Connects the IDs of active proto fields and their subfields to their fully qualified name (a.b.c.d).
0074    /// This enables the column reader to rewire the field IDs when the file changes (chain),
0075    /// using the fully qualified name as a search key in the descriptor of the other page sources.
0076    std::unordered_map<ROOT::Experimental::DescriptorId_t, std::string> fFieldId2QualifiedName;
0077    std::vector<std::string> fColumnNames; ///< RDF column names; returned by GetColumnNames()
0078    std::vector<std::string> fColumnTypes; ///< Presumably the type names matching fColumnNames (cf. GetTypeName()) -- confirm
0079    /// List of column readers returned by GetColumnReaders() organized by slot. Used to reconnect readers
0080    /// to new page sources when the files in the chain change.
0081    std::vector<std::vector<Internal::RNTupleColumnReader *>> fActiveColumnReaders;
0082 
0083    unsigned int fNSlots = 0;                  ///< Presumably set through SetNSlots() -- confirm in the implementation
0084    ULong64_t fSeenEntries = 0;                ///< The number of entries so far returned by GetEntryRanges()
0085    std::vector<REntryRangeDS> fCurrentRanges; ///< Basis for the ranges returned by the last GetEntryRanges() call
0086    std::vector<REntryRangeDS> fNextRanges;    ///< Basis for the ranges populated by the PrepareNextRanges() call
0087    /// Maps the first entries from the ranges of the last GetEntryRanges() call to their corresponding index in
0088    /// the fCurrentRanges vector.  This is necessary because the returned ranges get distributed arbitrarily
0089    /// onto slots.  In the InitSlot method, the column readers use this map to find the correct range to connect to.
0090    std::unordered_map<ULong64_t, std::size_t> fFirstEntry2RangeIdx;
0091 
0092    /// \brief Holds useful information about fields added to the RNTupleDS
0093    struct RFieldInfo {
0094       DescriptorId_t fFieldId;
0095       std::size_t fNRepetitions;
0096       // Enable `std::vector::emplace_back` for this type
0097       RFieldInfo(DescriptorId_t fieldId, std::size_t nRepetitions) : fFieldId(fieldId), fNRepetitions(nRepetitions) {}
0098    };
0099 
0100    /// Provides the RDF column "colName" given the field identified by fieldId. For records and collections,
0101    /// AddField recurses into the sub fields. The fieldInfos argument is a list of objects holding info
0102    /// about the fields of the outer collection(s) (w.r.t. fieldId). For instance, if fieldId refers to an
0103    /// `std::vector<Jet>`, with
0104    /// struct Jet {
0105    ///    float pt;
0106    ///    float eta;
0107    /// };
0108    /// AddField will recurse into Jet.pt and Jet.eta and provide the two inner fields as std::vector<float> each.
0109    void AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId,
0110                  std::vector<RFieldInfo> fieldInfos);
0111 
0112    /// Populates fNextRanges with the next set of entry ranges. Opens files from the chain as necessary
0113    /// and aligns ranges with cluster boundaries for scheduling the tail of files.
0114    /// Upon return, the fNextRanges list is ordered.  It usually has fNSlots elements; fewer if there
0115    /// is not enough work to give at least one cluster to every slot.
0116    void PrepareNextRanges();
0117 
        /// Private constructor that takes ownership of an already created page source.
0118    explicit RNTupleDS(std::unique_ptr<ROOT::Experimental::Internal::RPageSource> pageSource);
0119 
0120 public:
0121    RNTupleDS(std::string_view ntupleName, std::string_view fileName); ///< Ntuple name and a single file
0122    RNTupleDS(ROOT::Experimental::RNTuple *ntuple); ///< From an RNTuple object. NOTE(review): not `explicit`
0123    RNTupleDS(std::string_view ntupleName, const std::vector<std::string> &fileNames); ///< Ntuple name and a list of files
0124    ~RNTupleDS(); ///< Declared out-of-line: members hold std::unique_ptr to types only forward-declared in this header
0125 
        // Overrides of the RDataSource interface; all of them are marked final.
0126    void SetNSlots(unsigned int nSlots) final;
0127    const std::vector<std::string> &GetColumnNames() const final { return fColumnNames; }
0128    bool HasColumn(std::string_view colName) const final;
0129    std::string GetTypeName(std::string_view colName) const final;
0130    std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final;
0131    std::string GetLabel() final { return "RNTupleDS"; }
0132 
0133    bool SetEntry(unsigned int slot, ULong64_t entry) final;
0134 
0135    void Initialize() final;
0136    void InitSlot(unsigned int slot, ULong64_t firstEntry) final;
0137    void FinalizeSlot(unsigned int slot) final;
0138    void Finalize() final;
0139 
0140    std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
0141    GetColumnReaders(unsigned int /*slot*/, std::string_view /*name*/, const std::type_info &) final;
0142 
0143 protected:
0144    Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final;
0145 };
0146 
0147 } // ns Experimental
0148 
0149 namespace RDF {
0150 namespace Experimental {
     // Free functions returning an RDataFrame for RNTuple input. The three overloads take the same
     // argument lists as the public RNTupleDS constructors (ntuple name + file, ntuple name + list of
     // files, RNTuple pointer). Presumably each one wraps an RNTupleDS -- confirm in the implementation file.
0151 RDataFrame FromRNTuple(std::string_view ntupleName, std::string_view fileName);
0152 RDataFrame FromRNTuple(std::string_view ntupleName, const std::vector<std::string> &fileNames);
0153 RDataFrame FromRNTuple(ROOT::Experimental::RNTuple *ntuple);
0154 } // namespace Experimental
0155 } // namespace RDF
0156 
0157 } // ns ROOT
0158 
0159 #endif