|
|
|||
File indexing completed on 2026-05-07 08:51:29
0001 /// \file ROOT/RNTupleImporter.hxx 0002 /// \ingroup NTuple ROOT7 0003 /// \author Jakob Blomer <jblomer@cern.ch> 0004 /// \date 2022-11-22 0005 /// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback 0006 /// is welcome! 0007 0008 /************************************************************************* 0009 * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers. * 0010 * All rights reserved. * 0011 * * 0012 * For the licensing terms see $ROOTSYS/LICENSE. * 0013 * For the list of contributors see $ROOTSYS/README/CREDITS. * 0014 *************************************************************************/ 0015 0016 #ifndef ROOT7_RNTuplerImporter 0017 #define ROOT7_RNTuplerImporter 0018 0019 #include <ROOT/REntry.hxx> 0020 #include <ROOT/RError.hxx> 0021 #include <ROOT/RField.hxx> 0022 #include <ROOT/RNTupleModel.hxx> 0023 #include <ROOT/RNTupleWriteOptions.hxx> 0024 #include <ROOT/RNTupleWriter.hxx> 0025 #include <string_view> 0026 0027 #include <TFile.h> 0028 #include <TTree.h> 0029 0030 #include <cstdlib> 0031 #include <functional> 0032 #include <map> 0033 #include <memory> 0034 #include <vector> 0035 0036 class TLeaf; 0037 0038 namespace ROOT { 0039 namespace Experimental { 0040 0041 // clang-format off 0042 /** 0043 \class ROOT::Experimental::RNTupleImporter 0044 \ingroup NTuple 0045 \brief Converts a TTree into an RNTuple 0046 0047 Example usage (see the ntpl008_import.C tutorial for a full example): 0048 0049 ~~~ {.cpp} 0050 #include <ROOT/RNTupleImporter.hxx> 0051 using ROOT::Experimental::RNTupleImporter; 0052 0053 auto importer = RNTupleImporter::Create("data.root", "TreeName", "output.root"); 0054 // As required: importer->SetNTupleName(), importer->SetWriteOptions(), ... 0055 importer->Import(); 0056 ~~~ 0057 0058 The output file is created if it does not exist, otherwise the ntuple is added to the existing file. 0059 Directories in the output file are created as necessary, allowing ntuples to be stored in a nested structure (e.g. DirName/TreeName). 0060 Note that input file and output file can be identical if the ntuple is stored under a different name than the tree 0061 (use `SetNTupleName()`). 0062 0063 By default, the RNTuple is compressed with zstd, independent of the input compression. The compression settings 0064 (and other output parameters) can be changed by `SetWriteOptions()`. For example, to compress the imported RNTuple 0065 using lz4 (with compression level 4) instead: 0066 0067 ~~~ {.cpp} 0068 auto writeOptions = importer->GetWriteOptions(); 0069 writeOptions.SetCompression(404); 0070 importer->SetWriteOptions(writeOptions); 0071 ~~~ 0072 0073 Most RNTuple fields have a type identical to the corresponding TTree input branch. Exceptions are 0074 - C string branches are translated to `std::string` fields 0075 - C style arrays are translated to `std::array<...>` fields 0076 - Leaf lists are translated to untyped records 0077 - Leaf count arrays are translated to anonymous collections with generic names (`_collection0`, `_collection1`, etc.). 0078 In order to keep field names and branch names aligned, RNTuple projects the members of these collections and 0079 its collection counter to the input branch names. For instance, the following input leafs: 0080 ~~~ 0081 Int_t njets 0082 float jet_pt[njets] 0083 float jet_eta[njets] 0084 ~~~ 0085 will be converted to the following RNTuple schema: 0086 ~~~ 0087 _collection0 (untyped collection) 0088 |- float jet_pt 0089 |- float jet_eta 0090 std::size_t (RNTupleCardinality) njets (projected from _collection0 without subfields) 0091 ROOT::RVec<float> jet_pt (projected from _collection0.jet_pt) 0092 ROOT::RVec<float> jet_eta (projected from _collection0.jet_eta) 0093 ~~~ 0094 These projections are meta-data only operations and don't involve duplicating the data. 0095 0096 Current limitations of the importer: 0097 - No support for trees containing TClonesArray collections 0098 - Due to RNTuple currently storing data fully split, "don't split" markers are ignored 0099 - Some types are not available in RNTuple. Please refer to the 0100 [RNTuple specification](https://github.com/root-project/root/blob/master/tree/ntuple/v7/doc/specifications.md) for 0101 an overview of all types currently supported. 0102 */ 0103 // clang-format on 0104 class RNTupleImporter { 0105 public: 0106 /// Used to make adjustments to the fields of the output model. 0107 using FieldModifier_t = std::function<void(ROOT::RFieldBase &)>; 0108 0109 /// Used to report every ~100 MB (compressed), and at the end about the status of the import. 0110 class RProgressCallback { 0111 public: 0112 virtual ~RProgressCallback() = default; 0113 void operator()(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) 0114 { 0115 Call(nbytesWritten, neventsWritten); 0116 } 0117 virtual void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) = 0; 0118 virtual void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) = 0; 0119 }; 0120 0121 private: 0122 struct RImportBranch { 0123 RImportBranch() = default; 0124 RImportBranch(const RImportBranch &other) = delete; 0125 RImportBranch(RImportBranch &&other) = default; 0126 RImportBranch &operator=(const RImportBranch &other) = delete; 0127 RImportBranch &operator=(RImportBranch &&other) = default; 0128 std::string fBranchName; ///< Top-level branch name from the input TTree 0129 std::unique_ptr<unsigned char[]> fBranchBuffer; ///< The destination of SetBranchAddress() for `fBranchName` 0130 }; 0131 0132 struct RImportField { 0133 RImportField() = default; 0134 ~RImportField() = default; 0135 RImportField(const RImportField &other) = delete; 0136 RImportField(RImportField &&other) = default; 0137 RImportField &operator=(const RImportField &other) = delete; 0138 RImportField &operator=(RImportField &&other) = default; 0139 0140 /// The field is kept during schema preparation and transferred to the fModel before the writing starts 0141 ROOT::RFieldBase *fField = nullptr; 0142 std::unique_ptr<ROOT::RFieldBase::RValue> fValue; ///< Set if a value is generated, only for transformed fields 0143 void *fFieldBuffer = nullptr; ///< Usually points to the corresponding RImportBranch::fBranchBuffer but not always 0144 }; 0145 0146 /// Base class to perform data transformations from TTree branches to RNTuple fields if necessary 0147 struct RImportTransformation { 0148 std::size_t fImportBranchIdx = 0; 0149 std::size_t fImportFieldIdx = 0; 0150 0151 RImportTransformation(std::size_t branchIdx, std::size_t fieldIdx) 0152 : fImportBranchIdx(branchIdx), fImportFieldIdx(fieldIdx) 0153 { 0154 } 0155 virtual ~RImportTransformation() = default; 0156 virtual RResult<void> Transform(const RImportBranch &branch, RImportField &field) = 0; 0157 }; 0158 0159 /// When the schema is set up and the import started, it needs to be reset before the next Import() call 0160 /// can start. This RAII guard ensures that ResetSchema is called. 0161 struct RImportGuard { 0162 RNTupleImporter &fImporter; 0163 0164 explicit RImportGuard(RNTupleImporter &importer) : fImporter(importer) {} 0165 RImportGuard(const RImportGuard &) = delete; 0166 RImportGuard &operator=(const RImportGuard &) = delete; 0167 RImportGuard(RImportGuard &&) = delete; 0168 RImportGuard &operator=(RImportGuard &&) = delete; 0169 ~RImportGuard() { fImporter.ResetSchema(); } 0170 }; 0171 0172 /// Leaf count arrays require special treatment. They are translated into untyped collections of untyped records. 0173 /// This class does the bookkeeping of the sub-schema for these collections. 0174 struct RImportLeafCountCollection { 0175 RImportLeafCountCollection() = default; 0176 RImportLeafCountCollection(const RImportLeafCountCollection &other) = delete; 0177 RImportLeafCountCollection(RImportLeafCountCollection &&other) = default; 0178 RImportLeafCountCollection &operator=(const RImportLeafCountCollection &other) = delete; 0179 RImportLeafCountCollection &operator=(RImportLeafCountCollection &&other) = default; 0180 std::string fFieldName; ///< name of the untyped collection, e.g. `_collection0`, `_collection1`, etc. 0181 /// Stores count leaf GetMaximum() to create large enough buffers for the array leafs. 0182 /// Uses Int_t because that is the return type if TLeaf::GetMaximum(). 0183 Int_t fMaxLength = 0; 0184 /// The number of elements for the collection for a particular event. Used as a destination for SetBranchAddress() 0185 /// of the count leaf 0186 std::unique_ptr<Int_t> fCountVal; 0187 /// The leafs of the array as we encounter them traversing the TTree schema. 0188 /// Eventually, the fields are moved as leaves to an untyped collection of untyped records that in turn 0189 /// is attached to the RNTuple model. 0190 std::vector<std::unique_ptr<ROOT::RFieldBase>> fLeafFields; 0191 std::vector<size_t> fLeafBranchIndexes; ///< Points to the correspondings leaf branches in fImportBranches 0192 ROOT::RRecordField *fRecordField = 0193 nullptr; ///< Points to the item field of the untyped collection field in the model. 0194 std::vector<unsigned char> fFieldBuffer; ///< The collection field memory representation. Bound to the entry. 0195 }; 0196 0197 /// Transform a NULL terminated C string branch into an `std::string` field 0198 struct RCStringTransformation : public RImportTransformation { 0199 RCStringTransformation(std::size_t b, std::size_t f) : RImportTransformation(b, f) {} 0200 ~RCStringTransformation() override = default; 0201 RResult<void> Transform(const RImportBranch &branch, RImportField &field) final; 0202 }; 0203 0204 RNTupleImporter() = default; 0205 0206 std::unique_ptr<TFile> fSourceFile; 0207 TTree *fSourceTree; 0208 0209 std::string fDestFileName; 0210 std::string fNTupleName; 0211 std::unique_ptr<TFile> fDestFile; 0212 ROOT::RNTupleWriteOptions fWriteOptions; 0213 0214 /// Whether or not dot characters in branch names should be converted to underscores. If this option is not set and a 0215 /// branch with a '.' is encountered, the importer will throw an exception. 0216 bool fConvertDotsInBranchNames = false; 0217 0218 /// The maximum number of entries to import. When this value is -1 (default), import all entries. 0219 std::int64_t fMaxEntries = -1; 0220 0221 /// No standard output, conversely if set to false, schema information and progress is printed. 0222 bool fIsQuiet = false; 0223 std::unique_ptr<RProgressCallback> fProgressCallback; 0224 FieldModifier_t fFieldModifier; 0225 0226 std::unique_ptr<ROOT::RNTupleModel> fModel; 0227 std::unique_ptr<ROOT::REntry> fEntry; 0228 std::vector<RImportBranch> fImportBranches; 0229 std::vector<RImportField> fImportFields; 0230 /// Maps the count leaf to the information about the corresponding untyped collection 0231 std::map<std::string, RImportLeafCountCollection> fLeafCountCollections; 0232 /// The list of transformations to be performed for every entry 0233 std::vector<std::unique_ptr<RImportTransformation>> fImportTransformations; 0234 0235 ROOT::RResult<void> InitDestination(std::string_view destFileName); 0236 0237 void ResetSchema(); 0238 /// Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory 0239 /// buffers used for reading and writing. 0240 ROOT::RResult<void> PrepareSchema(); 0241 void ReportSchema(); 0242 0243 public: 0244 RNTupleImporter(const RNTupleImporter &other) = delete; 0245 RNTupleImporter &operator=(const RNTupleImporter &other) = delete; 0246 RNTupleImporter(RNTupleImporter &&other) = delete; 0247 RNTupleImporter &operator=(RNTupleImporter &&other) = delete; 0248 ~RNTupleImporter() = default; 0249 0250 /// Opens the input file for reading and the output file for writing (update). 0251 static std::unique_ptr<RNTupleImporter> 0252 Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName); 0253 0254 /// Directly uses the provided tree and opens the output file for writing (update). 0255 static std::unique_ptr<RNTupleImporter> Create(TTree *sourceTree, std::string_view destFileName); 0256 0257 ROOT::RNTupleWriteOptions GetWriteOptions() const { return fWriteOptions; } 0258 void SetWriteOptions(ROOT::RNTupleWriteOptions options) { fWriteOptions = options; } 0259 void SetNTupleName(const std::string &name) { fNTupleName = name; } 0260 void SetMaxEntries(std::uint64_t maxEntries) { fMaxEntries = maxEntries; }; 0261 0262 /// Whereas branch names may contain dots, RNTuple field names may not. By setting this option, dot characters are 0263 /// automatically converted into underscores to prevent the importer from throwing an exception. 0264 void SetConvertDotsInBranchNames(bool value) { fConvertDotsInBranchNames = value; } 0265 0266 /// Whether or not information and progress is printed to stdout. 0267 void SetIsQuiet(bool value) { fIsQuiet = value; } 0268 0269 /// Add custom method to adjust column representations. Will be called for every field of the frozen model 0270 /// before it is attached to the page sink 0271 void SetFieldModifier(FieldModifier_t modifier) { fFieldModifier = modifier; } 0272 0273 /// Import works in two steps: 0274 /// 1. PrepareSchema() calls SetBranchAddress() on all the TTree branches and creates the corresponding RNTuple 0275 /// fields and the model 0276 /// 2. An event loop reads every entry from the TTree, applies transformations where necessary, and writes the 0277 /// output entry to the RNTuple. 0278 void Import(); 0279 }; // class RNTupleImporter 0280 0281 } // namespace Experimental 0282 } // namespace ROOT 0283 0284 #endif
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|