File indexing completed on 2025-12-21 10:06:40
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011 #ifndef ROOT_RCSVDS
0012 #define ROOT_RCSVDS
0013
0014 #include "ROOT/RDataFrame.hxx"
0015 #include "ROOT/RDataSource.hxx"
0016
0017 #include <cstdint>
0018 #include <deque>
0019 #include <list>
0020 #include <unordered_map>
0021 #include <set>
0022 #include <memory>
0023 #include <vector>
0024
0025 #include <TRegexp.h>
0026
0027 namespace ROOT::Internal::RDF {
0028 class R__CLING_PTRCHECK(off) RCsvDSColumnReader final : public ROOT::Detail::RDF::RColumnReaderBase {
0029 void *fValuePtr;
0030 void *GetImpl(Long64_t) final { return fValuePtr; }
0031
0032 public:
0033 RCsvDSColumnReader(void *valuePtr) : fValuePtr(valuePtr) {}
0034 };
0035 }
0036
0037 namespace ROOT {
0038
0039 namespace Internal {
0040 class RRawFile;
0041 }
0042
0043 namespace RDF {
0044
0045 class RCsvDS final : public ROOT::RDF::RDataSource {
0046 public:
0047
0048 struct ROptions {
0049
0050
0051
0052 bool fHeaders = true;
0053 char fDelimiter = ',';
0054 bool fLeftTrim = false;
0055 bool fRightTrim = false;
0056 bool fSkipBlankLines = true;
0057 std::int64_t fSkipFirstNLines = 0;
0058 std::int64_t fSkipLastNLines = 0;
0059 std::int64_t fLinesChunkSize = -1;
0060
0061
0062
0063 char fComment = '\0';
0064
0065
0066
0067 std::vector<std::string> fColumnNames;
0068
0069
0070 std::unordered_map<std::string, char> fColumnTypes;
0071 };
0072
0073 private:
0074
0075 using ColType_t = char;
0076 static const std::unordered_map<ColType_t, std::string> fgColTypeMap;
0077
0078
0079 static const TRegexp fgIntRegex, fgDoubleRegex1, fgDoubleRegex2, fgDoubleRegex3, fgTrueRegex, fgFalseRegex;
0080
0081 ROptions fOptions;
0082 std::uint64_t fDataPos = 0;
0083 std::int64_t fDataLineNumber = 0;
0084 std::int64_t fLineNumber = 0;
0085 std::int64_t fMaxLineNumber = -1;
0086 std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile;
0087 ULong64_t fEntryRangesRequested = 0ULL;
0088 ULong64_t fProcessedLines = 0ULL;
0089 std::vector<std::string> fHeaders;
0090 std::unordered_map<std::string, ColType_t> fColTypes;
0091 std::set<std::string> fColContainingEmpty;
0092 std::list<ColType_t> fColTypesList;
0093 std::vector<std::vector<void *>> fColAddresses;
0094 std::vector<Record_t> fRecords;
0095 std::vector<std::vector<double>> fDoubleEvtValues;
0096 std::vector<std::vector<Long64_t>> fLong64EvtValues;
0097 std::vector<std::vector<std::string>> fStringEvtValues;
0098
0099
0100 std::vector<std::deque<bool>> fBoolEvtValues;
0101
0102 void Construct();
0103
0104 bool Readln(std::string &line);
0105 void RewindToData();
0106 void FillHeaders(const std::string &);
0107 void FillRecord(const std::string &, Record_t &);
0108 void GenerateHeaders(size_t);
0109 std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &) final;
0110 void ValidateColTypes(std::vector<std::string> &) const;
0111 void InferColTypes(std::vector<std::string> &);
0112 void InferType(const std::string &, unsigned int);
0113 std::vector<std::string> ParseColumns(const std::string &);
0114 size_t ParseValue(const std::string &, std::vector<std::string> &, size_t);
0115 ColType_t GetType(std::string_view colName) const;
0116 void FreeRecords();
0117
0118 protected:
0119 std::string AsString() final;
0120
0121 public:
0122 RCsvDS(std::string_view fileName, const ROptions &options);
0123 RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL,
0124 std::unordered_map<std::string, char> &&colTypes = {});
0125
0126 RCsvDS(const RCsvDS &) = delete;
0127 RCsvDS &operator=(const RCsvDS &) = delete;
0128 RCsvDS(RCsvDS &&) = delete;
0129 RCsvDS &operator=(RCsvDS &&) = delete;
0130 ~RCsvDS() final;
0131
0132 void Finalize() final;
0133 std::size_t GetNFiles() const final { return 1; }
0134 const std::vector<std::string> &GetColumnNames() const final;
0135 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final;
0136 std::string GetTypeName(std::string_view colName) const final;
0137 bool HasColumn(std::string_view colName) const final;
0138 bool SetEntry(unsigned int slot, ULong64_t entry) final;
0139 void SetNSlots(unsigned int nSlots) final;
0140 std::string GetLabel() final;
0141
0142 std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
0143 GetColumnReaders(unsigned int slot, std::string_view colName, const std::type_info &tid) final;
0144 };
0145
0146
0147
0148
0149
0150 RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options);
0151
0152
0153
0154
0155
0156
0157
0158
0159
0160
0161
0162 RDataFrame FromCSV(std::string_view fileName, bool readHeaders = true, char delimiter = ',',
0163 Long64_t linesChunkSize = -1LL, std::unordered_map<std::string, char> &&colTypes = {});
0164
0165 }
0166
0167 }
0168
0169 #endif