File indexing completed on 2025-09-18 09:32:02
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011 #ifndef ROOT_RCSVTDS
0012 #define ROOT_RCSVTDS
0013
0014 #include "ROOT/RDataFrame.hxx"
0015 #include "ROOT/RDataSource.hxx"
0016
0017 #include <cstdint>
0018 #include <deque>
0019 #include <list>
0020 #include <unordered_map>
0021 #include <set>
0022 #include <memory>
0023 #include <vector>
0024
0025 #include <TRegexp.h>
0026
0027 namespace ROOT {
0028
0029 namespace Internal {
0030 class RRawFile;
0031 }
0032
0033 namespace RDF {
0034
0035 class RCsvDS final : public ROOT::RDF::RDataSource {
0036 public:
0037
0038 struct ROptions {
0039
0040
0041
0042 bool fHeaders = true;
0043 char fDelimiter = ',';
0044 bool fLeftTrim = false;
0045 bool fRightTrim = false;
0046 bool fSkipBlankLines = true;
0047 std::int64_t fSkipFirstNLines = 0;
0048 std::int64_t fSkipLastNLines = 0;
0049 std::int64_t fLinesChunkSize = -1;
0050
0051
0052
0053 char fComment = '\0';
0054
0055
0056
0057 std::vector<std::string> fColumnNames;
0058
0059
0060 std::unordered_map<std::string, char> fColumnTypes;
0061 };
0062
0063 private:
0064
0065 using ColType_t = char;
0066 static const std::unordered_map<ColType_t, std::string> fgColTypeMap;
0067
0068
0069 static const TRegexp fgIntRegex, fgDoubleRegex1, fgDoubleRegex2, fgDoubleRegex3, fgTrueRegex, fgFalseRegex;
0070
0071 ROptions fOptions;
0072 std::uint64_t fDataPos = 0;
0073 std::int64_t fDataLineNumber = 0;
0074 std::int64_t fLineNumber = 0;
0075 std::int64_t fMaxLineNumber = -1;
0076 std::unique_ptr<ROOT::Internal::RRawFile> fCsvFile;
0077 ULong64_t fEntryRangesRequested = 0ULL;
0078 ULong64_t fProcessedLines = 0ULL;
0079 std::vector<std::string> fHeaders;
0080 std::unordered_map<std::string, ColType_t> fColTypes;
0081 std::set<std::string> fColContainingEmpty;
0082 std::list<ColType_t> fColTypesList;
0083 std::vector<std::vector<void *>> fColAddresses;
0084 std::vector<Record_t> fRecords;
0085 std::vector<std::vector<double>> fDoubleEvtValues;
0086 std::vector<std::vector<Long64_t>> fLong64EvtValues;
0087 std::vector<std::vector<std::string>> fStringEvtValues;
0088
0089
0090 std::vector<std::deque<bool>> fBoolEvtValues;
0091
0092 void Construct();
0093
0094 bool Readln(std::string &line);
0095 void RewindToData();
0096 void FillHeaders(const std::string &);
0097 void FillRecord(const std::string &, Record_t &);
0098 void GenerateHeaders(size_t);
0099 std::vector<void *> GetColumnReadersImpl(std::string_view, const std::type_info &) final;
0100 void ValidateColTypes(std::vector<std::string> &) const;
0101 void InferColTypes(std::vector<std::string> &);
0102 void InferType(const std::string &, unsigned int);
0103 std::vector<std::string> ParseColumns(const std::string &);
0104 size_t ParseValue(const std::string &, std::vector<std::string> &, size_t);
0105 ColType_t GetType(std::string_view colName) const;
0106 void FreeRecords();
0107
0108 protected:
0109 std::string AsString() final;
0110
0111 public:
0112 RCsvDS(std::string_view fileName, const ROptions &options);
0113 RCsvDS(std::string_view fileName, bool readHeaders = true, char delimiter = ',', Long64_t linesChunkSize = -1LL,
0114 std::unordered_map<std::string, char> &&colTypes = {});
0115
0116 RCsvDS(const RCsvDS &) = delete;
0117 RCsvDS &operator=(const RCsvDS &) = delete;
0118 RCsvDS(RCsvDS &&) = delete;
0119 RCsvDS &operator=(RCsvDS &&) = delete;
0120 ~RCsvDS() final;
0121
0122 void Finalize() final;
0123 std::size_t GetNFiles() const final { return 1; }
0124 const std::vector<std::string> &GetColumnNames() const final;
0125 std::vector<std::pair<ULong64_t, ULong64_t>> GetEntryRanges() final;
0126 std::string GetTypeName(std::string_view colName) const final;
0127 bool HasColumn(std::string_view colName) const final;
0128 bool SetEntry(unsigned int slot, ULong64_t entry) final;
0129 void SetNSlots(unsigned int nSlots) final;
0130 std::string GetLabel() final;
0131 };
0132
0133
0134
0135
0136
0137 RDataFrame FromCSV(std::string_view fileName, const RCsvDS::ROptions &options);
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149 RDataFrame FromCSV(std::string_view fileName, bool readHeaders = true, char delimiter = ',',
0150 Long64_t linesChunkSize = -1LL, std::unordered_map<std::string, char> &&colTypes = {});
0151
0152 }
0153
0154 }
0155
0156 #endif