Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 10:10:37

0001 // Author: Enric Tejedor CERN  10/2017
0002 
0003 /*************************************************************************
0004  * Copyright (C) 1995-2022, Rene Brun and Fons Rademakers.               *
0005  * All rights reserved.                                                  *
0006  *                                                                       *
0007  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0008  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0009  *************************************************************************/
0010 
0011 #ifndef ROOT_RCSVTDS
0012 #define ROOT_RCSVTDS
0013 
0014 #include "ROOT/RDataFrame.hxx"
0015 #include "ROOT/RDataSource.hxx"
0016 
0017 #include <cstdint>
0018 #include <deque>
0019 #include <list>
0020 #include <unordered_map>
0021 #include <set>
0022 #include <memory>
0023 #include <vector>
0024 
0025 #include <TRegexp.h>
0026 
0027 namespace ROOT {
0028 
0029 namespace Internal {
0030 class RRawFile; // forward declaration only; RCsvDS holds it through a std::unique_ptr (fCsvFile)
0031 }
0032 
0033 namespace RDF {
0034 
0035 class RCsvDS final : public ROOT::RDF::RDataSource {
0036    // RDataSource implementation for CSV input; this class body only declares its members, without definitions.
0037 private:
0038    // Single-character column type code. Possible values are D, O, L, T; one character is enough only because the supported types are limited to double (D), bool (O), Long64_t (L) and string (T)
0039    using ColType_t = char;
0040    static const std::unordered_map<ColType_t, std::string> fgColTypeMap; // type code -> string; presumably the type name returned by GetTypeName (TODO confirm)
0041 
0042    // Regular expressions for type inference
0043    static const TRegexp fgIntRegex, fgDoubleRegex1, fgDoubleRegex2, fgDoubleRegex3, fgTrueRegex, fgFalseRegex;
0044 
0045    std::uint64_t fDataPos = 0; // presumably the file offset at which the data rows begin (TODO confirm)
0046    bool fReadHeaders = false; // presumably mirrors the constructor's readHeaders argument (TODO confirm)
0047    unsigned int fNSlots = 0U; // number of slots; presumably assigned in SetNSlots (TODO confirm)
0048    std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile; // owned raw-file object; RRawFile is only forward-declared in this header
0049    const char fDelimiter; // const and without default initializer: must be set in the constructor's initializer list
0050    const Long64_t fLinesChunkSize; // const and without default initializer: must be set in the constructor's initializer list
0051    ULong64_t fEntryRangesRequested = 0ULL; // presumably counts calls to GetEntryRanges (TODO confirm)
0052    ULong64_t fProcessedLines = 0ULL; // marks the progress of the consumption of the csv lines
0053    std::vector<std::string> fHeaders; // the column names
0054    std::unordered_map<std::string, ColType_t> fColTypes;
0055    std::set<std::string> fColContainingEmpty; // store columns which had empty entry
0056    std::list<ColType_t> fColTypesList; // column types, order is the same as fHeaders, values the same as fColTypes
0057    std::vector<std::vector<void *>> fColAddresses;         // fColAddresses[column][slot] (same ordering as fHeaders)
0058    std::vector<Record_t> fRecords;                         // fRecords[entry][column] (same ordering as fHeaders)
0059    std::vector<std::vector<double>> fDoubleEvtValues;      // one per column per slot
0060    std::vector<std::vector<Long64_t>> fLong64EvtValues;    // one per column per slot
0061    std::vector<std::vector<std::string>> fStringEvtValues; // one per column per slot
0062    // This must be a deque to avoid the specialisation vector<bool>. This would not
0063    // work given that the pointer to the boolean in that case cannot be taken
0064    std::vector<std::deque<bool>> fBoolEvtValues; // one per column per slot
0065    // Private helpers; most parameters are unnamed here, so their meaning is given by the definitions (not shown in this class body)
0066    void FillHeaders(const std::string &);
0067    void FillRecord(const std::string &, Record_t &);
0068    void GenerateHeaders(size_t);
0069    std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &) final; // 'final' without 'virtual' here: overrides a virtual function of the base class
0070    void ValidateColTypes(std::vector<std::string> &) const;
0071    void InferColTypes(std::vector<std::string> &);
0072    void InferType(const std::string &, unsigned int);
0073    std::vector<std::string> ParseColumns(const std::string &);
0074    size_t ParseValue(const std::string &, std::vector<std::string> &, size_t);
0075    ColType_t GetType(std::string_view colName) const;
0076    void FreeRecords();
0077 
0078 protected:
0079    std::string AsString() final;
0080 
0081 public:
0082    RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL,
0083           std::unordered_map<std::string, char> &&colTypes = {}); // same parameter list and defaults as FromCSV; NOTE(review): not 'explicit', so a std::string_view converts implicitly
0084    void Finalize() final;
0085    ~RCsvDS(); // declared only; the definition must see the complete RRawFile type in order to destroy fCsvFile
0086    const std::vector<std::string> &GetColumnNames() const final;
0087    std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final;
0088    std::string GetTypeName(std::string_view colName) const final;
0089    bool HasColumn(std::string_view colName) const final;
0090    bool SetEntry(unsigned int slot, ULong64_t entry) final;
0091    void SetNSlots(unsigned int nSlots) final;
0092    std::string GetLabel() final;
0093 };
0094 
0095 ////////////////////////////////////////////////////////////////////////////////////////////////
0096 /// \brief Factory method to create a CSV RDataFrame.
0097 /// \param[in] fileName Path of the CSV file.
0098 /// \param[in] readHeaders `true` if the CSV file contains headers as first row, `false` otherwise
0099 ///                        (default `true`).
0100 /// \param[in] delimiter Delimiter character (default ',').
0101 /// \param[in] linesChunkSize Number of lines to read in one chunk, use -1 to read all (default -1).
0102 /// \param[in] colTypes Allow user to specify custom column types, accepts an unordered map with keys being
0103 ///                      column names, values being type alias ('O' for boolean, 'D' for double, 'L' for
0104 ///                      Long64_t, 'T' for std::string); default is an empty map.
0105 RDataFrame FromCSV(std::string_view fileName, bool readHeaders = true, char delimiter = ',',
0106                    Long64_t linesChunkSize = -1LL, std::unordered_map<std::string, char> &&colTypes = {});
0107 
0108 } // ns RDF
0109 
0110 } // ns ROOT
0111 
0112 #endif