![]() |
|
|||
File indexing completed on 2025-09-16 09:08:33
0001 /// \file ROOT/RNTupleImporter.hxx 0002 /// \ingroup NTuple ROOT7 0003 /// \author Jakob Blomer <jblomer@cern.ch> 0004 /// \date 2022-11-22 0005 /// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback 0006 /// is welcome! 0007 0008 /************************************************************************* 0009 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. * 0010 * All rights reserved. * 0011 * * 0012 * For the licensing terms see $ROOTSYS/LICENSE. * 0013 * For the list of contributors see $ROOTSYS/README/CREDITS. * 0014 *************************************************************************/ 0015 0016 #ifndef ROOT7_RNTuplerImporter 0017 #define ROOT7_RNTuplerImporter 0018 0019 #include <ROOT/REntry.hxx> 0020 #include <ROOT/RError.hxx> 0021 #include <ROOT/RField.hxx> 0022 #include <ROOT/RNTupleModel.hxx> 0023 #include <ROOT/RNTupleWriteOptions.hxx> 0024 #include <ROOT/RNTupleWriter.hxx> 0025 #include <string_view> 0026 0027 #include <TFile.h> 0028 #include <TTree.h> 0029 0030 #include <cstdlib> 0031 #include <functional> 0032 #include <map> 0033 #include <memory> 0034 #include <vector> 0035 0036 class TLeaf; 0037 0038 namespace ROOT { 0039 namespace Experimental { 0040 0041 // clang-format off 0042 /** 0043 \class ROOT::Experimental::RNTupleImporter 0044 \ingroup NTuple 0045 \brief Converts a TTree into an RNTuple 0046 0047 Example usage (see the ntpl008_import.C tutorial for a full example): 0048 0049 ~~~ {.cpp} 0050 #include <ROOT/RNTupleImporter.hxx> 0051 using ROOT::Experimental::RNTupleImporter; 0052 0053 auto importer = RNTupleImporter::Create("data.root", "TreeName", "output.root"); 0054 // As required: importer->SetNTupleName(), importer->SetWriteOptions(), ... 0055 importer->Import(); 0056 ~~~ 0057 0058 The output file is created if it does not exist, otherwise the ntuple is added to the existing file. 0059 Note that input file and output file can be identical if the ntuple is stored under a different name than the tree 0060 (use `SetNTupleName()`). 0061 0062 By default, the RNTuple is compressed with zstd, independent of the input compression. The compression settings 0063 (and other output parameters) can be changed by `SetWriteOptions()`. For example, to compress the imported RNTuple 0064 using lz4 (with compression level 4) instead: 0065 0066 ~~~ {.cpp} 0067 auto writeOptions = importer->GetWriteOptions(); 0068 writeOptions.SetCompression(404); 0069 importer->SetWriteOptions(writeOptions); 0070 ~~~ 0071 0072 Most RNTuple fields have a type identical to the corresponding TTree input branch. Exceptions are 0073 - C string branches are translated to `std::string` fields 0074 - C style arrays are translated to `std::array<...>` fields 0075 - Leaf lists are translated to untyped records 0076 - Leaf count arrays are translated to anonymous collections with generic names (`_collection0`, `_collection1`, etc.). 0077 In order to keep field names and branch names aligned, RNTuple projects the members of these collections and 0078 its collection counter to the input branch names. For instance, the following input leafs: 0079 ~~~ 0080 Int_t njets 0081 float jet_pt[njets] 0082 float jet_eta[njets] 0083 ~~~ 0084 will be converted to the following RNTuple schema: 0085 ~~~ 0086 _collection0 (untyped collection) 0087 |- float jet_pt 0088 |- float jet_eta 0089 std::size_t (RNTupleCardinality) njets (projected from _collection0 without subfields) 0090 ROOT::RVec<float> jet_pt (projected from _collection0.jet_pt) 0091 ROOT::RVec<float> jet_eta (projected from _collection0.jet_eta) 0092 ~~~ 0093 These projections are meta-data only operations and don't involve duplicating the data. 0094 0095 Current limitations of the importer: 0096 - No support for trees containing TClonesArray collections 0097 - Due to RNTuple currently storing data fully split, "don't split" markers are ignored 0098 - Some types are not available in RNTuple. Please refer to the 0099 [RNTuple specification](https://github.com/root-project/root/blob/master/tree/ntuple/v7/doc/specifications.md) for 0100 an overview of all types currently supported. 0101 */ 0102 // clang-format on 0103 class RNTupleImporter { 0104 public: 0105 /// Used to make adjustments to the fields of the output model. 0106 using FieldModifier_t = std::function<void(ROOT::RFieldBase &)>; 0107 0108 /// Used to report every ~100 MB (compressed), and at the end about the status of the import. 0109 class RProgressCallback { 0110 public: 0111 virtual ~RProgressCallback() = default; 0112 void operator()(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) 0113 { 0114 Call(nbytesWritten, neventsWritten); 0115 } 0116 virtual void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) = 0; 0117 virtual void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) = 0; 0118 }; 0119 0120 private: 0121 struct RImportBranch { 0122 RImportBranch() = default; 0123 RImportBranch(const RImportBranch &other) = delete; 0124 RImportBranch(RImportBranch &&other) = default; 0125 RImportBranch &operator=(const RImportBranch &other) = delete; 0126 RImportBranch &operator=(RImportBranch &&other) = default; 0127 std::string fBranchName; ///< Top-level branch name from the input TTree 0128 std::unique_ptr<unsigned char[]> fBranchBuffer; ///< The destination of SetBranchAddress() for `fBranchName` 0129 }; 0130 0131 struct RImportField { 0132 RImportField() = default; 0133 ~RImportField() = default; 0134 RImportField(const RImportField &other) = delete; 0135 RImportField(RImportField &&other) = default; 0136 RImportField &operator=(const RImportField &other) = delete; 0137 RImportField &operator=(RImportField &&other) = default; 0138 0139 /// The field is kept during schema preparation and transferred to the fModel before the writing starts 0140 ROOT::RFieldBase *fField = nullptr; 0141 std::unique_ptr<ROOT::RFieldBase::RValue> fValue; ///< Set if a value is generated, only for transformed fields 0142 void *fFieldBuffer = nullptr; ///< Usually points to the corresponding RImportBranch::fBranchBuffer but not always 0143 }; 0144 0145 /// Base class to perform data transformations from TTree branches to RNTuple fields if necessary 0146 struct RImportTransformation { 0147 std::size_t fImportBranchIdx = 0; 0148 std::size_t fImportFieldIdx = 0; 0149 0150 RImportTransformation(std::size_t branchIdx, std::size_t fieldIdx) 0151 : fImportBranchIdx(branchIdx), fImportFieldIdx(fieldIdx) 0152 { 0153 } 0154 virtual ~RImportTransformation() = default; 0155 virtual RResult<void> Transform(const RImportBranch &branch, RImportField &field) = 0; 0156 }; 0157 0158 /// When the schema is set up and the import started, it needs to be reset before the next Import() call 0159 /// can start. This RAII guard ensures that ResetSchema is called. 0160 struct RImportGuard { 0161 RNTupleImporter &fImporter; 0162 0163 explicit RImportGuard(RNTupleImporter &importer) : fImporter(importer) {} 0164 RImportGuard(const RImportGuard &) = delete; 0165 RImportGuard &operator=(const RImportGuard &) = delete; 0166 RImportGuard(RImportGuard &&) = delete; 0167 RImportGuard &operator=(RImportGuard &&) = delete; 0168 ~RImportGuard() { fImporter.ResetSchema(); } 0169 }; 0170 0171 /// Leaf count arrays require special treatment. They are translated into untyped collections of untyped records. 0172 /// This class does the bookkeeping of the sub-schema for these collections. 0173 struct RImportLeafCountCollection { 0174 RImportLeafCountCollection() = default; 0175 RImportLeafCountCollection(const RImportLeafCountCollection &other) = delete; 0176 RImportLeafCountCollection(RImportLeafCountCollection &&other) = default; 0177 RImportLeafCountCollection &operator=(const RImportLeafCountCollection &other) = delete; 0178 RImportLeafCountCollection &operator=(RImportLeafCountCollection &&other) = default; 0179 std::string fFieldName; ///< name of the untyped collection, e.g. `_collection0`, `_collection1`, etc. 0180 /// Stores count leaf GetMaximum() to create large enough buffers for the array leafs. 0181 /// Uses Int_t because that is the return type if TLeaf::GetMaximum(). 0182 Int_t fMaxLength = 0; 0183 /// The number of elements for the collection for a particular event. Used as a destination for SetBranchAddress() 0184 /// of the count leaf 0185 std::unique_ptr<Int_t> fCountVal; 0186 /// The leafs of the array as we encounter them traversing the TTree schema. 0187 /// Eventually, the fields are moved as leaves to an untyped collection of untyped records that in turn 0188 /// is attached to the RNTuple model. 0189 std::vector<std::unique_ptr<ROOT::RFieldBase>> fLeafFields; 0190 std::vector<size_t> fLeafBranchIndexes; ///< Points to the correspondings leaf branches in fImportBranches 0191 ROOT::RRecordField *fRecordField = 0192 nullptr; ///< Points to the item field of the untyped collection field in the model. 0193 std::vector<unsigned char> fFieldBuffer; ///< The collection field memory representation. Bound to the entry. 0194 }; 0195 0196 /// Transform a NULL terminated C string branch into an `std::string` field 0197 struct RCStringTransformation : public RImportTransformation { 0198 RCStringTransformation(std::size_t b, std::size_t f) : RImportTransformation(b, f) {} 0199 ~RCStringTransformation() override = default; 0200 RResult<void> Transform(const RImportBranch &branch, RImportField &field) final; 0201 }; 0202 0203 RNTupleImporter() = default; 0204 0205 std::unique_ptr<TFile> fSourceFile; 0206 TTree *fSourceTree; 0207 0208 std::string fDestFileName; 0209 std::string fNTupleName; 0210 std::unique_ptr<TFile> fDestFile; 0211 ROOT::RNTupleWriteOptions fWriteOptions; 0212 0213 /// Whether or not dot characters in branch names should be converted to underscores. If this option is not set and a 0214 /// branch with a '.' is encountered, the importer will throw an exception. 0215 bool fConvertDotsInBranchNames = false; 0216 0217 /// The maximum number of entries to import. When this value is -1 (default), import all entries. 0218 std::int64_t fMaxEntries = -1; 0219 0220 /// No standard output, conversely if set to false, schema information and progress is printed. 0221 bool fIsQuiet = false; 0222 std::unique_ptr<RProgressCallback> fProgressCallback; 0223 FieldModifier_t fFieldModifier; 0224 0225 std::unique_ptr<ROOT::RNTupleModel> fModel; 0226 std::unique_ptr<ROOT::REntry> fEntry; 0227 std::vector<RImportBranch> fImportBranches; 0228 std::vector<RImportField> fImportFields; 0229 /// Maps the count leaf to the information about the corresponding untyped collection 0230 std::map<std::string, RImportLeafCountCollection> fLeafCountCollections; 0231 /// The list of transformations to be performed for every entry 0232 std::vector<std::unique_ptr<RImportTransformation>> fImportTransformations; 0233 0234 ROOT::RResult<void> InitDestination(std::string_view destFileName); 0235 0236 void ResetSchema(); 0237 /// Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory 0238 /// buffers used for reading and writing. 0239 ROOT::RResult<void> PrepareSchema(); 0240 void ReportSchema(); 0241 0242 public: 0243 RNTupleImporter(const RNTupleImporter &other) = delete; 0244 RNTupleImporter &operator=(const RNTupleImporter &other) = delete; 0245 RNTupleImporter(RNTupleImporter &&other) = delete; 0246 RNTupleImporter &operator=(RNTupleImporter &&other) = delete; 0247 ~RNTupleImporter() = default; 0248 0249 /// Opens the input file for reading and the output file for writing (update). 0250 static std::unique_ptr<RNTupleImporter> 0251 Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName); 0252 0253 /// Directly uses the provided tree and opens the output file for writing (update). 0254 static std::unique_ptr<RNTupleImporter> Create(TTree *sourceTree, std::string_view destFileName); 0255 0256 ROOT::RNTupleWriteOptions GetWriteOptions() const { return fWriteOptions; } 0257 void SetWriteOptions(ROOT::RNTupleWriteOptions options) { fWriteOptions = options; } 0258 void SetNTupleName(const std::string &name) { fNTupleName = name; } 0259 void SetMaxEntries(std::uint64_t maxEntries) { fMaxEntries = maxEntries; }; 0260 0261 /// Whereas branch names may contain dots, RNTuple field names may not. By setting this option, dot characters are 0262 /// automatically converted into underscores to prevent the importer from throwing an exception. 0263 void SetConvertDotsInBranchNames(bool value) { fConvertDotsInBranchNames = value; } 0264 0265 /// Whether or not information and progress is printed to stdout. 0266 void SetIsQuiet(bool value) { fIsQuiet = value; } 0267 0268 /// Add custom method to adjust column representations. Will be called for every field of the frozen model 0269 /// before it is attached to the page sink 0270 void SetFieldModifier(FieldModifier_t modifier) { fFieldModifier = modifier; } 0271 0272 /// Import works in two steps: 0273 /// 1. PrepareSchema() calls SetBranchAddress() on all the TTree branches and creates the corresponding RNTuple 0274 /// fields and the model 0275 /// 2. An event loop reads every entry from the TTree, applies transformations where necessary, and writes the 0276 /// output entry to the RNTuple. 0277 void Import(); 0278 }; // class RNTupleImporter 0279 0280 } // namespace Experimental 0281 } // namespace ROOT 0282 0283 #endif
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |