Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-09-18 09:32:02

0001 // Author: Enric Tejedor CERN  10/2017
0002 
0003 /*************************************************************************
0004  * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers.               *
0005  * All rights reserved.                                                  *
0006  *                                                                       *
0007  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0008  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0009  *************************************************************************/
0010 
0011 #ifndef ROOT_RCSVTDS
0012 #define ROOT_RCSVTDS
0013 
0014 #include "ROOT/RDataFrame.hxx"
0015 #include "ROOT/RDataSource.hxx"
0016 
0017 #include <cstdint>
0018 #include <deque>
0019 #include <list>
0020 #include <unordered_map>
0021 #include <set>
0022 #include <memory>
0023 #include <vector>
0024 
0025 #include <TRegexp.h>
0026 
0027 namespace ROOT {
0028 
0029 namespace Internal {
0030 class RRawFile; // forward declaration is enough here: RCsvDS only holds it through a std::unique_ptr
0031 }
0032 
0033 namespace RDF {
0034 
0035 class RCsvDS final : public ROOT::RDF::RDataSource { // RDataSource implementation that reads a CSV file
0036 public:
0037    /// Options that control how the CSV file is parsed
0038    struct ROptions {
0039       /// The first line describes the columns. The names are used as RDF column names
0040       /// unless fColumnNames is not empty, in which case it replaces the given names.
0041       /// If both fHeaders is false and fColumnNames is empty, generic column names of the form Col<index> are used.
0042       bool fHeaders = true;
0043       char fDelimiter = ',';             ///< Column delimiter character
0044       bool fLeftTrim = false;            ///< Leading whitespaces are removed
0045       bool fRightTrim = false;           ///< Trailing whitespaces are removed
0046       bool fSkipBlankLines = true;       ///< Ignore empty lines (after trimming, if trimming is enabled)
0047       std::int64_t fSkipFirstNLines = 0; ///< Ignore the first N lines of the file
0048       std::int64_t fSkipLastNLines = 0;  ///< Ignore the last N lines of the file
0049       std::int64_t fLinesChunkSize = -1; ///< Number of lines to read, -1 to read all
0050       /// Character indicating that the remainder of the line should be ignored, if different from '\0'.
0051       /// If it is the first character of the line (after trimming), the line is ignored altogether.
0052       /// Note that the comment character must not be part of the data, e.g. in strings.
0053       char fComment = '\0';
0054       /// Impose column names. This can be used if a header is missing or if the header has unparsable or
0055       /// unwanted column names. If this list is not empty, it must contain exactly as many elements as
0056       /// the number of columns in the CSV file.
0057       std::vector<std::string> fColumnNames;
0058       /// Specify custom column types, accepts an unordered map with keys being column name, values being type alias
0059       /// ('O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string)
0060       std::unordered_map<std::string, char> fColumnTypes;
0061    };
0062 
0063 private:
0064    // One-char type code: D (double), O (bool), L (Long64_t), T (std::string); no other column types are handled
0065    using ColType_t = char;
0066    static const std::unordered_map<ColType_t, std::string> fgColTypeMap; // presumably type code -> C++ type name (cf. GetTypeName) - TODO confirm
0067 
0068    // Regular expressions for type inference
0069    static const TRegexp fgIntRegex, fgDoubleRegex1, fgDoubleRegex2, fgDoubleRegex3, fgTrueRegex, fgFalseRegex;
0070 
0071    ROptions fOptions; // the parsing options (see ROptions)
0072    std::uint64_t fDataPos = 0; // presumably the file offset where the data lines start (cf. RewindToData) - TODO confirm
0073    std::int64_t fDataLineNumber = 0; // presumably the line number of the first data line - TODO confirm
0074    std::int64_t fLineNumber = 0;     // used to skip the last lines
0075    std::int64_t fMaxLineNumber = -1; // set to non-negative if fOptions.fSkipLastNLines is set
0076    std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile; // RRawFile is only forward-declared in this header
0077    ULong64_t fEntryRangesRequested = 0ULL; // NOTE(review): looks like a count of GetEntryRanges() requests - confirm
0078    ULong64_t fProcessedLines = 0ULL; // marks the progress of the consumption of the csv lines
0079    std::vector<std::string> fHeaders; // the column names
0080    std::unordered_map<std::string, ColType_t> fColTypes; // type code per column, presumably keyed by column name
0081    std::set<std::string> fColContainingEmpty; // store columns which had empty entry
0082    std::list<ColType_t> fColTypesList; // column types, order is the same as fHeaders, values the same as fColTypes
0083    std::vector<std::vector<void *>> fColAddresses;         // fColAddresses[column][slot] (same ordering as fHeaders)
0084    std::vector<Record_t> fRecords;                         // fRecords[entry][column] (same ordering as fHeaders)
0085    std::vector<std::vector<double>> fDoubleEvtValues;      // one per column per slot
0086    std::vector<std::vector<Long64_t>> fLong64EvtValues;    // one per column per slot
0087    std::vector<std::vector<std::string>> fStringEvtValues; // one per column per slot
0088    // A deque is used on purpose to avoid the std::vector<bool> specialisation: with that
0089    // specialisation a pointer to an individual boolean cannot be taken
0090    std::vector<std::deque<bool>> fBoolEvtValues; // one per column per slot
0091 
0092    void Construct();
0093 
0094    bool Readln(std::string &line);
0095    void RewindToData();
0096    void FillHeaders(const std::string &);
0097    void FillRecord(const std::string &, Record_t &);
0098    void GenerateHeaders(size_t);
0099    std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &) final;
0100    void ValidateColTypes(std::vector<std::string> &) const;
0101    void InferColTypes(std::vector<std::string> &);
0102    void InferType(const std::string &, unsigned int);
0103    std::vector<std::string> ParseColumns(const std::string &);
0104    size_t ParseValue(const std::string &, std::vector<std::string> &, size_t);
0105    ColType_t GetType(std::string_view colName) const;
0106    void FreeRecords();
0107 
0108 protected:
0109    std::string AsString() final;
0110 
0111 public:
0112    RCsvDS(std::string_view fileName, const ROptions &options);
0113    RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL,
0114           std::unordered_map<std::string, char> &&colTypes = {});
0115    // Rule of five: the data source is neither copyable nor movable
0116    RCsvDS(const RCsvDS &) = delete;
0117    RCsvDS &operator=(const RCsvDS &) = delete;
0118    RCsvDS(RCsvDS &&) = delete;
0119    RCsvDS &operator=(RCsvDS &&) = delete;
0120    ~RCsvDS() final;
0121 
0122    void Finalize() final;
0123    std::size_t GetNFiles() const final { return 1; } ///< Always reports exactly one file
0124    const std::vector<std::string> &GetColumnNames() const final;
0125    std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final;
0126    std::string GetTypeName(std::string_view colName) const final;
0127    bool HasColumn(std::string_view colName) const final;
0128    bool SetEntry(unsigned int slot, ULong64_t entry) final;
0129    void SetNSlots(unsigned int nSlots) final;
0130    std::string GetLabel() final;
0131 };
0132 
0133 ////////////////////////////////////////////////////////////////////////////////////////////////
0134 /// \brief Factory method to create a CSV RDataFrame.
0135 /// \param[in] fileName Path of the CSV file.
0136 /// \param[in] options File parsing settings (see RCsvDS::ROptions).
0137 RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options);
0138 
0139 ////////////////////////////////////////////////////////////////////////////////////////////////
0140 /// \brief Factory method to create a CSV RDataFrame.
0141 /// \param[in] fileName Path of the CSV file.
0142 /// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
0143 ///                        (default `true`).
0144 /// \param[in] delimiter Delimiter character (default ',').
0145 /// \param[in] linesChunkSize Number of lines to read, use -1 to read all (default -1)
0146 /// \param[in] colTypes Allow user to specify custom column types, accepts an unordered map with keys being
0147 ///                      column name, values being type alias ('O' for boolean, 'D' for double, 'L' for
0148 ///                      Long64_t, 'T' for std::string)
0149 RDataFrame FromCSV(std::string_view fileName, bool readHeaders = true, char delimiter = ',',
0150                    Long64_t linesChunkSize = -1LL, std::unordered_map<std::string, char> &&colTypes = {});
0151 
0152 } // ns RDF
0153 
0154 } // ns ROOT
0155 
0156 #endif