Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-12-21 10:06:40

0001 // Author: Enric Tejedor CERN  10/2017
0002 
0003 /*************************************************************************
0004  * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers.               *
0005  * All rights reserved.                                                  *
0006  *                                                                       *
0007  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0008  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0009  *************************************************************************/
0010 
0011 #ifndef ROOT_RCSVDS
0012 #define ROOT_RCSVDS
0013 
0014 #include "ROOT/RDataFrame.hxx"
0015 #include "ROOT/RDataSource.hxx"
0016 
0017 #include <cstdint>
0018 #include <deque>
0019 #include <list>
0020 #include <unordered_map>
0021 #include <set>
0022 #include <memory>
0023 #include <vector>
0024 
0025 #include <TRegexp.h>
0026 
0027 namespace ROOT::Internal::RDF {
0028 class R__CLING_PTRCHECK(off) RCsvDSColumnReader final : public ROOT::Detail::RDF::RColumnReaderBase { // reader exposing one fixed, type-erased address
0029    void *fValuePtr; ///< Type-erased address returned by GetImpl(); stored as given and never freed by this class
0030    void *GetImpl(Long64_t) final { return fValuePtr; } // the entry argument is ignored: the same address is returned on every call
0031 
0032 public:
0033    explicit RCsvDSColumnReader(void *valuePtr) : fValuePtr(valuePtr) {} // explicit: forbid implicit void* -> reader conversions
0034 };
0035 } // namespace ROOT::Internal::RDF
0036 
0037 namespace ROOT {
0038 
0039 namespace Internal {
0040 class RRawFile; // forward declaration: RCsvDS below holds it through a std::unique_ptr member
0041 } // namespace Internal
0042 
0043 namespace RDF {
0044 
0045 class RCsvDS final : public ROOT::RDF::RDataSource { // RDataSource implementation for CSV files, configured through ROptions
0046 public:
0047    /// Options that control how the CSV file is parsed
0048    struct ROptions {
0049       /// The first line describes the columns. The names are used as RDF column names
0050       /// unless fColumnNames is not empty, in which case it replaces the given names.
0051       /// If fHeaders is false and fColumnNames is empty, generic column names Col1, Col2, ..., Col$n$ are used.
0052       bool fHeaders = true;
0053       char fDelimiter = ',';             ///< Column delimiter character
0054       bool fLeftTrim = false;            ///< Leading whitespaces are removed
0055       bool fRightTrim = false;           ///< Trailing whitespaces are removed
0056       bool fSkipBlankLines = true;       ///< Ignore empty lines (after trimming, if trimming is enabled)
0057       std::int64_t fSkipFirstNLines = 0; ///< Ignore the first N lines of the file
0058       std::int64_t fSkipLastNLines = 0;  ///< Ignore the last N lines of the file
0059       std::int64_t fLinesChunkSize = -1; ///< Number of lines to read, -1 to read all
0060       /// Character indicating that the remainder of the line should be ignored, if different from '\0'.
0061       /// If it is the first character of the line (after trimming), the line is ignored altogether.
0062       /// Note that the comment character must not be part of the data, e.g. in strings.
0063       char fComment = '\0';
0064       /// Impose column names. This can be used if a header is missing or if the header has unparsable or
0065       /// unwanted column names. If this list is not empty, it must contain exactly as many elements as
0066       /// the number of columns in the CSV file.
0067       std::vector<std::string> fColumnNames;
0068       /// Specify custom column types: an unordered map whose keys are column names and whose values are type
0069       /// aliases ('O' for boolean, 'D' for double, 'L' for Long64_t, 'T' for std::string)
0070       std::unordered_map<std::string, char> fColumnTypes;
0071    };
0072 
0073 private:
0074    // Possible values are D, O, L, T. A single char suffices because only double, bool, Long64_t and std::string are handled
0075    using ColType_t = char;
0076    static const std::unordered_map<ColType_t, std::string> fgColTypeMap; // NOTE(review): presumably type code -> C++ type name; confirm in the implementation
0077 
0078    // Regular expressions for type inference
0079    static const TRegexp fgIntRegex, fgDoubleRegex1, fgDoubleRegex2, fgDoubleRegex3, fgTrueRegex, fgFalseRegex;
0080 
0081    ROptions fOptions; // parsing options, see ROptions above
0082    std::uint64_t fDataPos = 0; // NOTE(review): presumably the file offset where the data start (cf. RewindToData) - confirm
0083    std::int64_t fDataLineNumber = 0; // NOTE(review): presumably the line number where the data start - confirm
0084    std::int64_t fLineNumber = 0;     // used to skip the last lines
0085    std::int64_t fMaxLineNumber = -1; // set to non-negative if fOptions.fSkipLastNLines is set
0086    std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile; // owned RRawFile; only forward-declared in this header
0087    ULong64_t fEntryRangesRequested = 0ULL; // NOTE(review): presumably counts GetEntryRanges() calls - confirm
0088    ULong64_t fProcessedLines = 0ULL; // marks the progress of the consumption of the csv lines
0089    std::vector<std::string> fHeaders; // the column names
0090    std::unordered_map<std::string, ColType_t> fColTypes; // NOTE(review): presumably column name -> type code - confirm
0091    std::set<std::string> fColContainingEmpty; // store columns which had empty entry
0092    std::list<ColType_t> fColTypesList; // column types, order is the same as fHeaders, values the same as fColTypes
0093    std::vector<std::vector<void *>> fColAddresses;         // fColAddresses[column][slot] (same ordering as fHeaders)
0094    std::vector<Record_t> fRecords;                         // fRecords[entry][column] (same ordering as fHeaders)
0095    std::vector<std::vector<double>> fDoubleEvtValues;      // one per column per slot
0096    std::vector<std::vector<Long64_t>> fLong64EvtValues;    // one per column per slot
0097    std::vector<std::vector<std::string>> fStringEvtValues; // one per column per slot
0098    // This must be a deque to avoid the std::vector<bool> specialisation, which would not
0099    // work here because a pointer to an individual boolean cannot be taken in that case
0100    std::vector<std::deque<bool>> fBoolEvtValues; // one per column per slot
0101 
0102    void Construct();
0103 
0104    bool Readln(std::string &line);
0105    void RewindToData();
0106    void FillHeaders(const std::string &);
0107    void FillRecord(const std::string &, Record_t &);
0108    void GenerateHeaders(size_t);
0109    std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &) final;
0110    void ValidateColTypes(std::vector<std::string> &) const;
0111    void InferColTypes(std::vector<std::string> &);
0112    void InferType(const std::string &, unsigned int);
0113    std::vector<std::string> ParseColumns(const std::string &);
0114    size_t ParseValue(const std::string &, std::vector<std::string> &, size_t);
0115    ColType_t GetType(std::string_view colName) const;
0116    void FreeRecords();
0117 
0118 protected:
0119    std::string AsString() final;
0120 
0121 public:
0122    RCsvDS(std::string_view fileName, const ROptions &options);
0123    RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL,
0124           std::unordered_map<std::string, char> &&colTypes = {});
0125    // Rule of five: the data source is neither copyable nor movable
0126    RCsvDS(const RCsvDS &) = delete;
0127    RCsvDS &operator=(const RCsvDS &) = delete;
0128    RCsvDS(RCsvDS &&) = delete;
0129    RCsvDS &operator=(RCsvDS &&) = delete;
0130    ~RCsvDS() final;
0131 
0132    void Finalize() final;
0133    std::size_t GetNFiles() const final { return 1; } // always reports exactly one file
0134    const std::vector<std::string> &GetColumnNames() const final;
0135    std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final;
0136    std::string GetTypeName(std::string_view colName) const final;
0137    bool HasColumn(std::string_view colName) const final;
0138    bool SetEntry(unsigned int slot, ULong64_t entry) final;
0139    void SetNSlots(unsigned int nSlots) final;
0140    std::string GetLabel() final;
0141 
0142    std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
0143    GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &tid) final;
0144 };
0145 
0146 ////////////////////////////////////////////////////////////////////////////////////////////////
0147 /// \brief Factory method to create a CSV RDataFrame.
0148 /// \param[in] fileName Path of the CSV file.
0149 /// \param[in] options File parsing settings (see RCsvDS::ROptions).
0150 RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options);
0151 
0152 ////////////////////////////////////////////////////////////////////////////////////////////////
0153 /// \brief Factory method to create a CSV RDataFrame.
0154 /// \param[in] fileName Path of the CSV file.
0155 /// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
0156 ///                        (default `true`).
0157 /// \param[in] delimiter Delimiter character (default ',').
0158 /// \param[in] linesChunkSize Number of lines to read in one chunk, use -1 to read all (default -1).
0159 /// \param[in] colTypes Allow user to specify custom column types, accepts an unordered map with keys being
0160 ///                      column name, values being type alias ('O' for boolean, 'D' for double, 'L' for
0161 ///                      Long64_t, 'T' for std::string)
0162 RDataFrame FromCSV(std::string_view fileName, bool readHeaders = true, char delimiter = ',',
0163                    Long64_t linesChunkSize = -1LL, std::unordered_map<std::string, char> &&colTypes = {});
0164 
0165 } // ns RDF
0166 
0167 } // ns ROOT
0168 
0169 #endif