|
||||
File indexing completed on 2025-01-18 10:10:45
0001 /// \file RNTupleDS.hxx 0002 /// \ingroup NTuple ROOT7 0003 /// \author Jakob Blomer <jblomer@cern.ch> 0004 /// \author Enrico Guiraud <enrico.guiraud@cern.ch> 0005 /// \date 2018-10-04 0006 /// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback 0007 /// is welcome! 0008 0009 /************************************************************************* 0010 * Copyright (C) 1995-2020, Rene Brun and Fons Rademakers. * 0011 * All rights reserved. * 0012 * * 0013 * For the licensing terms see $ROOTSYS/LICENSE. * 0014 * For the list of contributors see $ROOTSYS/README/CREDITS. * 0015 *************************************************************************/ 0016 0017 #ifndef ROOT_RNTupleDS 0018 #define ROOT_RNTupleDS 0019 0020 #include <ROOT/RDataFrame.hxx> 0021 #include <ROOT/RDataSource.hxx> 0022 #include <ROOT/RNTupleUtil.hxx> 0023 #include <string_view> 0024 0025 #include <cstdint> 0026 #include <memory> 0027 #include <string> 0028 #include <vector> 0029 #include <unordered_map> 0030 0031 namespace ROOT { 0032 namespace Experimental { 0033 0034 class RFieldBase; 0035 class RNTuple; 0036 class RNTupleDescriptor; 0037 0038 namespace Internal { 0039 class RNTupleColumnReader; 0040 class RPageSource; 0041 } 0042 0043 class RNTupleDS final : public ROOT::RDF::RDataSource { 0044 friend class Internal::RNTupleColumnReader; 0045 0046 /// The PrepareNextRanges() method populates the fNextRanges list with REntryRangeDS records. 0047 /// The GetEntryRanges() swaps fNextRanges and fCurrentRanges and uses the list of 0048 /// REntryRangeDS records to return the list of ranges ready to use by the RDF loop manager. 0049 struct REntryRangeDS { 0050 std::unique_ptr<ROOT::Experimental::Internal::RPageSource> fSource; 0051 ULong64_t fFirstEntry = 0; ///< First entry index in fSource 0052 /// End entry index in fSource, e.g. the number of entries in the range is fLastEntry - fFirstEntry 0053 ULong64_t fLastEntry = 0; 0054 }; 0055 0056 /// The first source is used to extract the schema and build the prototype fields. The page source 0057 /// is used to extract a clone of the descriptor to fPrincipalDescriptor. Afterwards it is moved 0058 /// into the first REntryRangeDS. 0059 std::unique_ptr<Internal::RPageSource> fPrincipalSource; 0060 /// A clone of the first pages source's descriptor. 0061 std::unique_ptr<RNTupleDescriptor> fPrincipalDescriptor; 0062 0063 /// The data source may be constructed with an ntuple name and a list of files 0064 std::string fNTupleName; 0065 std::vector<std::string> fFileNames; 0066 std::size_t fNextFileIndex = 0; ///< Index into fFileNames to the next file to process 0067 0068 /// We prepare a prototype field for every column. If a column reader is actually requested 0069 /// in GetColumnReaders(), we move a clone of the field into a new column reader for RDataFrame. 0070 /// Only the clone connects to the backing page store and acquires I/O resources. 0071 /// The field IDs are set in the context of the first source and used as keys in fFieldId2QualifiedName. 0072 std::vector<std::unique_ptr<ROOT::Experimental::RFieldBase>> fProtoFields; 0073 /// Connects the IDs of active proto fields and their subfields to their fully qualified name (a.b.c.d). 0074 /// This enables the column reader to rewire the field IDs when the file changes (chain), 0075 /// using the fully qualified name as a search key in the descriptor of the other page sources. 0076 std::unordered_map<ROOT::Experimental::DescriptorId_t, std::string> fFieldId2QualifiedName; 0077 std::vector<std::string> fColumnNames; 0078 std::vector<std::string> fColumnTypes; 0079 /// List of column readers returned by GetColumnReaders() organized by slot. Used to reconnect readers 0080 /// to new page sources when the files in the chain change. 0081 std::vector<std::vector<Internal::RNTupleColumnReader *>> fActiveColumnReaders; 0082 0083 unsigned int fNSlots = 0; 0084 ULong64_t fSeenEntries = 0; ///< The number of entries so far returned by GetEntryRanges() 0085 std::vector<REntryRangeDS> fCurrentRanges; ///< Basis for the ranges returned by the last GetEntryRanges() call 0086 std::vector<REntryRangeDS> fNextRanges; ///< Basis for the ranges populated by the PrepareNextRanges() call 0087 /// Maps the first entries from the ranges of the last GetEntryRanges() call to their corresponding index in 0088 /// the fCurrentRanges vectors. This is necessary because the returned ranges get distributed arbitrarily 0089 /// onto slots. In the InitSlot method, the column readers use this map to find the correct range to connect to. 0090 std::unordered_map<ULong64_t, std::size_t> fFirstEntry2RangeIdx; 0091 0092 /// \brief Holds useful information about fields added to the RNTupleDS 0093 struct RFieldInfo { 0094 DescriptorId_t fFieldId; 0095 std::size_t fNRepetitions; 0096 // Enable `std::vector::emplace_back` for this type 0097 RFieldInfo(DescriptorId_t fieldId, std::size_t nRepetitions) : fFieldId(fieldId), fNRepetitions(nRepetitions) {} 0098 }; 0099 0100 /// Provides the RDF column "colName" given the field identified by fieldID. For records and collections, 0101 /// AddField recurses into the sub fields. The fieldInfos argument is a list of objects holding info 0102 /// about the fields of the outer collection(s) (w.r.t. fieldId). For instance, if fieldId refers to an 0103 /// `std::vector<Jet>`, with 0104 /// struct Jet { 0105 /// float pt; 0106 /// float eta; 0107 /// }; 0108 /// AddField will recurse into Jet.pt and Jet.eta and provide the two inner fields as std::vector<float> each. 0109 void AddField(const RNTupleDescriptor &desc, std::string_view colName, DescriptorId_t fieldId, 0110 std::vector<RFieldInfo> fieldInfos); 0111 0112 /// Populates fNextRanges with the next set of entry ranges. Opens files from the chain as necessary 0113 /// and aligns ranges with cluster boundaries for scheduling the tail of files. 0114 /// Upon return, the fNextRanges list is ordered. It has usually fNSlots elements; fewer if there 0115 /// is not enough work to give at least one cluster to every slot. 0116 void PrepareNextRanges(); 0117 0118 explicit RNTupleDS(std::unique_ptr<ROOT::Experimental::Internal::RPageSource> pageSource); 0119 0120 public: 0121 RNTupleDS(std::string_view ntupleName, std::string_view fileName); 0122 RNTupleDS(ROOT::Experimental::RNTuple *ntuple); 0123 RNTupleDS(std::string_view ntupleName, const std::vector<std::string> &fileNames); 0124 ~RNTupleDS(); 0125 0126 void SetNSlots(unsigned int nSlots) final; 0127 const std::vector<std::string> &GetColumnNames() const final { return fColumnNames; } 0128 bool HasColumn(std::string_view colName) const final; 0129 std::string GetTypeName(std::string_view colName) const final; 0130 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final; 0131 std::string GetLabel() final { return "RNTupleDS"; } 0132 0133 bool SetEntry(unsigned int slot, ULong64_t entry) final; 0134 0135 void Initialize() final; 0136 void InitSlot(unsigned int slot, ULong64_t firstEntry) final; 0137 void FinalizeSlot(unsigned int slot) final; 0138 void Finalize() final; 0139 0140 std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase> 0141 GetColumnReaders(unsigned int /*slot*/, std::string_view /*name*/, const std::type_info &) final; 0142 0143 protected: 0144 Record_t GetColumnReadersImpl(std::string_view name, const std::type_info &) final; 0145 }; 0146 0147 } // ns Experimental 0148 0149 namespace RDF { 0150 namespace Experimental { 0151 RDataFrame FromRNTuple(std::string_view ntupleName, std::string_view fileName); 0152 RDataFrame FromRNTuple(std::string_view ntupleName, const std::vector<std::string> &fileNames); 0153 RDataFrame FromRNTuple(ROOT::Experimental::RNTuple *ntuple); 0154 } // namespace Experimental 0155 } // namespace RDF 0156 0157 } // ns ROOT 0158 0159 #endif
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |