Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 10:10:45

0001 /// \file ROOT/RNTupleImporter.hxx
0002 /// \ingroup NTuple ROOT7
0003 /// \author Jakob Blomer <jblomer@cern.ch>
0004 /// \date 2022-11-22
0005 /// \warning This is part of the ROOT 7 prototype! It will change without notice. It might trigger earthquakes. Feedback
0006 /// is welcome!
0007 
0008 /*************************************************************************
0009  * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers.               *
0010  * All rights reserved.                                                  *
0011  *                                                                       *
0012  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0013  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0014  *************************************************************************/
0015 
0016 #ifndef ROOT7_RNTuplerImporter
0017 #define ROOT7_RNTuplerImporter
0018 
0019 #include <ROOT/REntry.hxx>
0020 #include <ROOT/RError.hxx>
0021 #include <ROOT/RField.hxx>
0022 #include <ROOT/RNTupleCollectionWriter.hxx>
0023 #include <ROOT/RNTupleModel.hxx>
0024 #include <ROOT/RNTupleWriteOptions.hxx>
0025 #include <ROOT/RNTupleWriter.hxx>
0026 #include <string_view>
0027 
0028 #include <TFile.h>
0029 #include <TTree.h>
0030 
0031 #include <cstdlib>
0032 #include <map>
0033 #include <memory>
0034 #include <vector>
0035 
0036 class TLeaf;
0037 
0038 namespace ROOT {
0039 namespace Experimental {
0040 
0041 // clang-format off
0042 /**
0043 \class ROOT::Experimental::RNTupleImporter
0044 \ingroup NTuple
0045 \brief Converts a TTree into an RNTuple
0046 
0047 Example usage (see the ntpl008_import.C tutorial for a full example):
0048 
0049 ~~~ {.cpp}
0050 #include <ROOT/RNTupleImporter.hxx>
0051 using ROOT::Experimental::RNTupleImporter;
0052 
0053 auto importer = RNTupleImporter::Create("data.root", "TreeName", "output.root");
0054 // As required: importer->SetNTupleName(), importer->SetWriteOptions(), ...
0055 importer->Import();
0056 ~~~
0057 
0058 The output file is created if it does not exist, otherwise the ntuple is added to the existing file.
0059 Note that input file and output file can be identical if the ntuple is stored under a different name than the tree
0060 (use `SetNTupleName()`).
0061 
0062 By default, the RNTuple is compressed with zstd, independent of the input compression. The compression settings
0063 (and other output parameters) can be changed by `SetWriteOptions()`. For example, to compress the imported RNTuple
0064 using lz4 (with compression level 4) instead:
0065 
0066 ~~~ {.cpp}
0067 auto writeOptions = importer->GetWriteOptions();
0068 writeOptions.SetCompression(404);
0069 importer->SetWriteOptions(writeOptions);
0070 ~~~
0071 
0072 Most RNTuple fields have a type identical to the corresponding TTree input branch. Exceptions are
0073   - C string branches are translated to `std::string` fields
0074   - C style arrays are translated to `std::array<...>` fields
0075   - Leaf lists are translated to untyped records
0076   - Leaf count arrays are translated to anonymous collections with generic names (`_collection0`, `_collection1`, etc.).
0077     In order to keep field names and branch names aligned, RNTuple projects the members of these collections and
0078     their collection counters to the input branch names. For instance, the following input leaves:
0079 ~~~
0080 Int_t njets
0081 float jet_pt[njets]
0082 float jet_eta[njets]
0083 ~~~
0084     will be converted to the following RNTuple schema:
0085 ~~~
0086       _collection0 (untyped collection)
0087       |- float jet_pt
0088       |- float jet_eta
0089       std::size_t (RNTupleCardinality) njets   (projected from _collection0 without subfields)
0090       ROOT::RVec<float>                jet_pt  (projected from _collection0.jet_pt)
0091       ROOT::RVec<float>                jet_eta (projected from _collection0.jet_eta)
0092 ~~~
0093     These projections are meta-data only operations and don't involve duplicating the data.
0094 
0095 Current limitations of the importer:
0096   - No support for trees containing TClonesArray collections
0097   - Due to RNTuple currently storing data fully split, "don't split" markers are ignored
0098   - Some types are not available in RNTuple. Please refer to the
0099     [RNTuple specification](https://github.com/root-project/root/blob/master/tree/ntuple/v7/doc/specifications.md) for
0100     an overview of all types currently supported.
0101 */
0102 // clang-format on
0103 class RNTupleImporter {
0104 public:
0105    /// Used to report every ~50MB (compressed), and at the end about the status of the import.
0106    class RProgressCallback {
0107    public:
0108       virtual ~RProgressCallback() = default; // held via std::unique_ptr<RProgressCallback>, see fProgressCallback
0109       void operator()(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) // convenience: forwards to Call()
0110       {
0111          Call(nbytesWritten, neventsWritten);
0112       }
0113       virtual void Call(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) = 0; ///< Invoked by operator()
0114       virtual void Finish(std::uint64_t nbytesWritten, std::uint64_t neventsWritten) = 0; ///< Presumably the final report
0115    };
0116 
0117 private:
0118    struct RImportBranch { // move-only: copying is deleted because the buffer is uniquely owned
0119       RImportBranch() = default;
0120       RImportBranch(const RImportBranch &other) = delete;
0121       RImportBranch(RImportBranch &&other) = default;
0122       RImportBranch &operator=(const RImportBranch &other) = delete;
0123       RImportBranch &operator=(RImportBranch &&other) = default;
0124       std::string fBranchName;                        ///< Top-level branch name from the input TTree
0125       std::unique_ptr<unsigned char[]> fBranchBuffer; ///< The destination of SetBranchAddress() for `fBranchName`
0126    };
0127 
0128    struct RImportField { // move-only: copying is deleted because fValue is uniquely owned
0129       RImportField() = default;
0130       ~RImportField() = default;
0131       RImportField(const RImportField &other) = delete;
0132       RImportField(RImportField &&other) = default;
0133       RImportField &operator=(const RImportField &other) = delete;
0134       RImportField &operator=(RImportField &&other) = default;
0135 
0136       /// The field is kept during schema preparation and transferred to the fModel before the writing starts
0137       RFieldBase *fField = nullptr;
0138       std::unique_ptr<RFieldBase::RValue> fValue; ///< Set if a value is generated, only for transformed fields
0139       void *fFieldBuffer = nullptr; ///< Usually points to the corresponding RImportBranch::fBranchBuffer but not always
0140       bool fIsInUntypedCollection = false; ///< Sub-fields of untyped collections (leaf count arrays in the input)
0141       bool fIsClass = false; ///< Field imported from a branch with streamer info (e.g., STL, user-defined class)
0142    };
0143 
0144    /// Base class to perform data transformations from TTree branches to RNTuple fields if necessary
0145    struct RImportTransformation {
0146       std::size_t fImportBranchIdx = 0; ///< Presumably an index into fImportBranches
0147       std::size_t fImportFieldIdx = 0;  ///< Presumably an index into fImportFields
0148 
0149       RImportTransformation(std::size_t branchIdx, std::size_t fieldIdx)
0150          : fImportBranchIdx(branchIdx), fImportFieldIdx(fieldIdx)
0151       {
0152       }
0153       virtual ~RImportTransformation() = default;
0154       virtual RResult<void> Transform(const RImportBranch &branch, RImportField &field) = 0;
0155       virtual void ResetEntry() = 0; // called at the end of an entry
0156    };
0157 
0158    /// When the schema is set up and the import started, it needs to be reset before the next Import() call
0159    /// can start. This RAII guard ensures that ResetSchema is called.
0160    struct RImportGuard {
0161       RNTupleImporter &fImporter; ///< The importer whose ResetSchema() is called on destruction
0162 
0163       explicit RImportGuard(RNTupleImporter &importer) : fImporter(importer) {}
0164       RImportGuard(const RImportGuard &) = delete;
0165       RImportGuard &operator=(const RImportGuard &) = delete;
0166       RImportGuard(RImportGuard &&) = delete;
0167       RImportGuard &operator=(RImportGuard &&) = delete;
0168       ~RImportGuard() { fImporter.ResetSchema(); }
0169    };
0170 
0171    /// Leaf count arrays require special treatment. They are translated into RNTuple untyped collections.
0172    /// This class does the bookkeeping of the sub-schema for these collections.
0173    struct RImportLeafCountCollection {
0174       RImportLeafCountCollection() = default;
0175       RImportLeafCountCollection(const RImportLeafCountCollection &other) = delete;
0176       RImportLeafCountCollection(RImportLeafCountCollection &&other) = default;
0177       RImportLeafCountCollection &operator=(const RImportLeafCountCollection &other) = delete;
0178       RImportLeafCountCollection &operator=(RImportLeafCountCollection &&other) = default;
0179       std::unique_ptr<RNTupleModel> fCollectionModel;             ///< The model for the collection itself
0180       std::shared_ptr<RNTupleCollectionWriter> fCollectionWriter; ///< Used to fill the collection elements per event
0181       std::unique_ptr<REntry> fCollectionEntry; ///< Keeps the memory location of the collection members
0182       /// The number of elements for the collection for a particular event. Used as a destination for SetBranchAddress()
0183       /// of the count leaf
0184       std::unique_ptr<Int_t> fCountVal;
0185       std::vector<size_t> fImportFieldIndexes; ///< Points to the corresponding fields in fImportFields
0186       /// One transformation for every field, to copy the content of the array one by one
0187       std::vector<std::unique_ptr<RImportTransformation>> fTransformations;
0188       Int_t fMaxLength = 0;   ///< Stores count leaf GetMaximum() to create large enough buffers for the array leafs
0189       std::string fFieldName; ///< name of the untyped collection, e.g. `_collection0`, `_collection1`, etc.
0190    };
0191 
0192    /// Transform a NULL terminated C string branch into an `std::string` field
0193    struct RCStringTransformation : public RImportTransformation {
0194       RCStringTransformation(std::size_t b, std::size_t f) : RImportTransformation(b, f) {}
0195       ~RCStringTransformation() override = default;
0196       RResult<void> Transform(const RImportBranch &branch, RImportField &field) final;
0197       void ResetEntry() final {} // this transformation adds no per-entry state of its own
0198    };
0199 
0200    /// When writing the elements of a leaf count array, moves the data from the input array one-by-one
0201    /// to the memory locations of the fields of the corresponding untyped collection.
0202    /// TODO(jblomer): write arrays as a whole to RNTuple
0203    struct RLeafArrayTransformation : public RImportTransformation {
0204       std::int64_t fNum = 0; ///< Per-entry counter; set back to 0 by ResetEntry()
0205       RLeafArrayTransformation(std::size_t b, std::size_t f) : RImportTransformation(b, f) {}
0206       ~RLeafArrayTransformation() override = default;
0207       RResult<void> Transform(const RImportBranch &branch, RImportField &field) final;
0208       void ResetEntry() final { fNum = 0; }
0209    };
0210 
0211    RNTupleImporter() = default; // private: instances are handed out by the static Create() functions
0212 
0213    std::unique_ptr<TFile> fSourceFile;
0214    TTree *fSourceTree = nullptr; ///< Raw pointer, never deleted here; null until a tree is assigned
0215 
0216    std::string fDestFileName;
0217    std::string fNTupleName;
0218    std::unique_ptr<TFile> fDestFile;
0219    RNTupleWriteOptions fWriteOptions;
0220 
0221    /// Whether or not dot characters in branch names should be converted to underscores. If this option is not set and a
0222    /// branch with a '.' is encountered, the importer will throw an exception.
0223    bool fConvertDotsInBranchNames = false;
0224 
0225    /// The maximum number of entries to import. When this value is -1 (default), import all entries.
0226    std::int64_t fMaxEntries = -1;
0227 
0228    /// If true, nothing is printed to standard output; if false, schema information and progress are printed.
0229    bool fIsQuiet = false;
0230    std::unique_ptr<RProgressCallback> fProgressCallback;
0231 
0232    std::unique_ptr<RNTupleModel> fModel;
0233    std::unique_ptr<REntry> fEntry;
0234    std::vector<RImportBranch> fImportBranches;
0235    std::vector<RImportField> fImportFields;
0236    /// Maps the count leaf to the information about the corresponding untyped collection
0237    std::map<std::string, RImportLeafCountCollection> fLeafCountCollections;
0238    /// The list of transformations to be performed for every entry
0239    std::vector<std::unique_ptr<RImportTransformation>> fImportTransformations;
0240 
0241    ROOT::Experimental::RResult<void> InitDestination(std::string_view destFileName);
0242 
0243    void ResetSchema();
0244    /// Sets up the connection from TTree branches to RNTuple fields, including initialization of the memory
0245    /// buffers used for reading and writing.
0246    RResult<void> PrepareSchema();
0247    void ReportSchema();
0248 
0249 public:
0250    RNTupleImporter(const RNTupleImporter &other) = delete;
0251    RNTupleImporter &operator=(const RNTupleImporter &other) = delete;
0252    RNTupleImporter(RNTupleImporter &&other) = delete;
0253    RNTupleImporter &operator=(RNTupleImporter &&other) = delete;
0254    ~RNTupleImporter() = default;
0255 
0256    /// Opens the input file for reading and the output file for writing (update).
0257    static std::unique_ptr<RNTupleImporter>
0258    Create(std::string_view sourceFileName, std::string_view treeName, std::string_view destFileName);
0259 
0260    /// Directly uses the provided tree and opens the output file for writing (update).
0261    static std::unique_ptr<RNTupleImporter> Create(TTree *sourceTree, std::string_view destFileName);
0262 
0263    RNTupleWriteOptions GetWriteOptions() const { return fWriteOptions; }
0264    void SetWriteOptions(RNTupleWriteOptions options) { fWriteOptions = options; }
0265    void SetNTupleName(const std::string &name) { fNTupleName = name; }
0266    void SetMaxEntries(std::uint64_t maxEntries) { fMaxEntries = static_cast<std::int64_t>(maxEntries); }
0267 
0268    /// Whereas branch names may contain dots, RNTuple field names may not. By setting this option, dot characters are
0269    /// automatically converted into underscores to prevent the importer from throwing an exception.
0270    void SetConvertDotsInBranchNames(bool value) { fConvertDotsInBranchNames = value; }
0271 
0272    /// Whether or not information and progress is printed to stdout.
0273    void SetIsQuiet(bool value) { fIsQuiet = value; }
0274 
0275    /// Import works in two steps:
0276    /// 1. PrepareSchema() calls SetBranchAddress() on all the TTree branches and creates the corresponding RNTuple
0277    ///    fields and the model
0278    /// 2. An event loop reads every entry from the TTree, applies transformations where necessary, and writes the
0279    ///    output entry to the RNTuple.
0280    void Import();
0281 }; // class RNTupleImporter
0282 
0283 } // namespace Experimental
0284 } // namespace ROOT
0285 
0286 #endif