Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-02-21 09:58:12

0001 // SPDX-License-Identifier: MIT
0002 // Copyright 2015,2018-2020 Moritz Kiehn
0003 //
0004 // Permission is hereby granted, free of charge, to any person obtaining a copy
0005 // of this software and associated documentation files (the "Software"), to deal
0006 // in the Software without restriction, including without limitation the rights
0007 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
0008 // copies of the Software, and to permit persons to whom the Software is
0009 // furnished to do so, subject to the following conditions:
0010 //
0011 // The above copyright notice and this permission notice shall be included in
0012 // all copies or substantial portions of the Software.
0013 //
0014 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
0015 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
0016 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
0017 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
0018 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
0019 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
0020 // SOFTWARE.
0021 
0022 /// \file
0023 /// \brief   Read/write (d)elimiter-(s)eparated (v)alues text files
0024 /// \author  Moritz Kiehn <msmk@cern.ch>
0025 /// \date    2019-09-09, Split dsv i/o from the namedtuple library
0026 
0027 #pragma once
0028 
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iterator>
#include <limits>
#include <sstream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>
0040 
0041 namespace dfe {
0042 namespace io_dsv_impl {
0043 
/// Text file writer that stores rows as delimiter-separated values.
///
/// The number of columns per row is fixed by the column names given at
/// construction. Instances can be moved, but can neither be
/// default-constructed nor copied.
template<char Delimiter>
class DsvWriter {
public:
  /// Create a file at the given path. Overwrites existing data.
  ///
  /// \param columns    Column names, fixes the number of columns for the file
  /// \param path       Path to the output file
  /// \param precision  Output floating point precision
  DsvWriter(
    const std::vector<std::string>& columns, const std::string& path,
    int precision = std::numeric_limits<double>::max_digits10);

  // no default construction and no copies
  DsvWriter() = delete;
  DsvWriter(const DsvWriter&) = delete;
  DsvWriter& operator=(const DsvWriter&) = delete;
  // moving is allowed
  DsvWriter(DsvWriter&&) = default;
  DsvWriter& operator=(DsvWriter&&) = default;
  ~DsvWriter() = default;

  /// Append arguments as a new row to the file.
  ///
  /// Every argument fills one column, except for `std::vector` arguments
  /// which are unpacked so that each entry fills its own column. The total
  /// number of columns in the row must be identical to the number of column
  /// names given at construction.
  template<typename Arg0, typename... Args>
  void append(Arg0&& arg0, Args&&... args);

private:
  std::ofstream m_file;
  std::size_t m_num_columns;

  // Write a single value; returns the number of columns written.
  // The enable_if keeps this overload from being selected for std::vector<T>.
  template<typename T>
  static std::enable_if_t<
    std::is_arithmetic<std::decay_t<T>>::value
      or std::is_convertible<T, std::string>::value,
    unsigned>
  write(T&& x, std::ostream& os);
  // Write all entries of a vector; returns the number of columns written.
  template<typename T, typename Allocator>
  static unsigned write(const std::vector<T, Allocator>& xs, std::ostream& os);
};
0089 
/// Text file reader that splits each line into delimiter-separated columns.
///
/// Instances can be moved, but can neither be default-constructed nor copied.
template<char Delimiter>
class DsvReader {
public:
  /// Open a file at the given path.
  ///
  /// \param path Path to the input file
  DsvReader(const std::string& path);

  // no default construction and no copies
  DsvReader() = delete;
  DsvReader(const DsvReader&) = delete;
  DsvReader& operator=(const DsvReader&) = delete;
  // moving is allowed
  DsvReader(DsvReader&&) = default;
  DsvReader& operator=(DsvReader&&) = default;
  ~DsvReader() = default;

  /// Read the next line from the file and split it into columns.
  ///
  /// \param columns  Replaced by the column contents of the line
  /// \returns true   if the line was successfully read
  /// \returns false  if no more lines are available
  bool read(std::vector<std::string>& columns);

  /// Return the number of lines read so far.
  std::size_t num_lines() const {
    return m_num_lines;
  }

private:
  std::ifstream m_file;
  // buffer for the most recently read line
  std::string m_line;
  std::size_t m_num_lines = 0;
};
0120 
0121 /// Write records as delimiter-separated values into a text file.
0122 template<char Delimiter, typename NamedTuple>
0123 class NamedTupleDsvWriter {
0124 public:
0125   NamedTupleDsvWriter() = delete;
0126   NamedTupleDsvWriter(const NamedTupleDsvWriter&) = delete;
0127   NamedTupleDsvWriter(NamedTupleDsvWriter&&) = default;
0128   ~NamedTupleDsvWriter() = default;
0129   NamedTupleDsvWriter& operator=(const NamedTupleDsvWriter&) = delete;
0130   NamedTupleDsvWriter& operator=(NamedTupleDsvWriter&&) = default;
0131 
0132   /// Create a file at the given path. Overwrites existing data.
0133   ///
0134   /// \param path       Path to the output file
0135   /// \param precision  Output floating point precision
0136   NamedTupleDsvWriter(
0137     const std::string& path,
0138     int precision = std::numeric_limits<double>::max_digits10)
0139     : m_writer(colum_names(), path, precision) {}
0140 
0141   /// Append a record to the file.
0142   void append(const NamedTuple& record) {
0143     append_impl(
0144       record, std::make_index_sequence<
0145                 std::tuple_size<typename NamedTuple::Tuple>::value>{});
0146   }
0147 
0148 private:
0149   DsvWriter<Delimiter> m_writer;
0150 
0151   static std::vector<std::string> colum_names() {
0152     const auto& from_record = NamedTuple::names();
0153     return {from_record.begin(), from_record.end()};
0154   }
0155   template<std::size_t... I>
0156   void append_impl(const NamedTuple& values, std::index_sequence<I...>) {
0157     using std::get;
0158     m_writer.append(get<I>(values)...);
0159   }
0160 };
0161 
// string conversion helper functions

/// Convert the text content of a single column into a value.
///
/// The conversion uses formatted stream extraction (`operator>>`) and does
/// not report failures to the caller.
///
/// \param str    Text content of the column
/// \param value  Destination for the converted value
template<typename T>
inline void
parse(const std::string& str, T& value) {
  // TODO use something w/ lower overhead than stringstream e.g. std::from_chars
  std::istringstream is(str);
  is >> value;
}

/// Copy the text content of a single column verbatim into a string.
///
/// Formatted stream extraction is the wrong tool for string values: it skips
/// leading whitespace and stops at the first whitespace character, which
/// would truncate text columns such as `hello world`. An empty column
/// results in an empty string.
///
/// \param str    Text content of the column
/// \param value  Destination string, overwritten with the column content
inline void
parse(const std::string& str, std::string& value) {
  value = str;
}
0171 
0172 /// Read records as delimiter-separated values from a text file.
0173 ///
0174 /// The reader is strict about its input format to avoid ambiguities. If
0175 /// header verification is disabled, the first line will be skipped and each
0176 /// line must contain exactly as many columns as there are tuple members in
0177 /// exactly the same order. If header verification is enabled, the first line
0178 /// is interpreted as the header. Names in the header must match exactly to
0179 /// the tuple members but can be in arbitrary order. The file can contain
0180 /// extra columns that are not tuple members. Each following row must have
0181 /// exactly the same number of columns as the header.
0182 template<char Delimiter, typename NamedTuple>
0183 class NamedTupleDsvReader {
0184 public:
0185   NamedTupleDsvReader() = delete;
0186   NamedTupleDsvReader(const NamedTupleDsvReader&) = delete;
0187   NamedTupleDsvReader(NamedTupleDsvReader&&) = default;
0188   ~NamedTupleDsvReader() = default;
0189   NamedTupleDsvReader& operator=(const NamedTupleDsvReader&) = delete;
0190   NamedTupleDsvReader& operator=(NamedTupleDsvReader&&) = default;
0191 
0192   /// Open a file at the given path.
0193   ///
0194   /// \param path              Path to the input file
0195   /// \param optional_columns  Record columns that can be missing in the file
0196   /// \param verify_header     true to check header column names, false to skip
0197   ///
0198   /// The set of optional columns must match names in the record. When allowing
0199   /// optional columns, header verification must be set to true.
0200   NamedTupleDsvReader(
0201     const std::string& path,
0202     const std::vector<std::string>& optional_columns = {},
0203     bool verify_header = true);
0204 
0205   /// Read the next record from the file.
0206   ///
0207   /// Extra columns in the file will be ignored. Elements of the record that
0208   /// correspond to missing, optional columns will not be set and retain
0209   /// their value.
0210   ///
0211   /// \returns true   if a record was successfully read
0212   /// \returns false  if no more records are available
0213   bool read(NamedTuple& record);
0214 
0215   /// Read the next record and any extra columns from the file.
0216   ///
0217   /// \returns true   if a record was successfully read
0218   /// \returns false  if no more records are available
0219   template<typename T>
0220   bool read(NamedTuple& record, std::vector<T>& extra);
0221 
0222   /// Return the number of additional columns that are not part of the tuple.
0223   std::size_t num_extra_columns() const { return m_extra_columns.size(); }
0224   /// Return the number of records read so far.
0225   std::size_t num_records() const { return m_reader.num_lines() - 1u; }
0226 
0227 private:
0228   // the equivalent std::tuple-like type
0229   using Tuple = typename NamedTuple::Tuple;
0230 
0231   DsvReader<Delimiter> m_reader;
0232   std::vector<std::string> m_columns;
0233   // #columns is fixed to a reasonable value after reading the header
0234   std::size_t m_num_columns = SIZE_MAX;
0235   // map tuple index to column index in the file, SIZE_MAX for missing elements
0236   std::array<std::size_t, std::tuple_size<Tuple>::value> m_tuple_column_map;
0237   // column indices that do not map to a tuple items
0238   std::vector<std::size_t> m_extra_columns;
0239 
0240   void use_default_columns();
0241   void parse_header(const std::vector<std::string>& optional_columns);
0242   template<std::size_t... I>
0243   void parse_record(NamedTuple& record, std::index_sequence<I...>) const {
0244     // see namedtuple_impl::print_tuple for explanation
0245     // allow different column ordering on file and optional columns
0246     using Vacuum = int[];
0247     (void)Vacuum{(parse_element<I>(record), 0)...};
0248   }
0249   template<std::size_t I>
0250   void parse_element(NamedTuple& record) const {
0251     using std::get;
0252     if (m_tuple_column_map[I] != SIZE_MAX) {
0253       parse(m_columns[m_tuple_column_map[I]], get<I>(record));
0254     }
0255   }
0256 };
0257 
0258 // implementation writer
0259 
0260 template<char Delimiter>
0261 inline DsvWriter<Delimiter>::DsvWriter(
0262   const std::vector<std::string>& columns, const std::string& path,
0263   int precision)
0264   : m_file(
0265     path, std::ios_base::binary | std::ios_base::out | std::ios_base::trunc)
0266   , m_num_columns(columns.size()) {
0267   if (not m_file.is_open() or m_file.fail()) {
0268     throw std::runtime_error("Could not open file '" + path + "'");
0269   }
0270   m_file.precision(precision);
0271   if (m_num_columns == 0) {
0272     throw std::invalid_argument("No columns were specified");
0273   }
0274   // write column names as header row
0275   append(columns);
0276 }
0277 
0278 template<char Delimiter>
0279 template<typename Arg0, typename... Args>
0280 inline void
0281 DsvWriter<Delimiter>::append(Arg0&& arg0, Args&&... args) {
0282   // we can only check how many columns were written after they have been
0283   // written. write to temporary first to prevent bad data on file.
0284   std::stringstream line;
0285   // ensure consistent formatting
0286   line.precision(m_file.precision());
0287   unsigned written_columns[] = {
0288     // write the first item without a delimiter and store columns written
0289     write(std::forward<Arg0>(arg0), line),
0290     // for all other items, write the delimiter followed by the item itself
0291     // (<expr1>, <expr2>) use the comma operator (yep, ',' in c++ is a weird
0292     // but helpful operator) to execute both expression and return the return
0293     // value of the last one, i.e. here thats the number of columns written.
0294     // the ... pack expansion creates this expression for all arguments
0295     (line << Delimiter, write(std::forward<Args>(args), line))...,
0296   };
0297   line << '\n';
0298   // validate that the total number of written columns matches the specs.
0299   unsigned total_columns = 0;
0300   for (auto nc : written_columns) {
0301     total_columns += nc;
0302   }
0303   if (total_columns < m_num_columns) {
0304     throw std::invalid_argument("Not enough columns");
0305   }
0306   if (m_num_columns < total_columns) {
0307     throw std::invalid_argument("Too many columns");
0308   }
0309   // write the line to disk and check that it actually happened
0310   m_file << line.rdbuf();
0311   if (not m_file.good()) {
0312     throw std::runtime_error("Could not write data to file");
0313   }
0314 }
0315 
0316 template<char Delimiter>
0317 template<typename T>
0318 inline std::enable_if_t<
0319   std::is_arithmetic<std::decay_t<T>>::value
0320     or std::is_convertible<T, std::string>::value,
0321   unsigned>
0322 DsvWriter<Delimiter>::write(T&& x, std::ostream& os) {
0323   os << x;
0324   return 1u;
0325 }
0326 
0327 template<char Delimiter>
0328 template<typename T, typename Allocator>
0329 inline unsigned
0330 DsvWriter<Delimiter>::write(
0331   const std::vector<T, Allocator>& xs, std::ostream& os) {
0332   unsigned n = 0;
0333   for (const auto& x : xs) {
0334     if (0 < n) {
0335       os << Delimiter;
0336     }
0337     os << x;
0338     n += 1;
0339   }
0340   return n;
0341 }
0342 
0343 // implementation reader
0344 
0345 template<char Delimiter>
0346 inline DsvReader<Delimiter>::DsvReader(const std::string& path)
0347   : m_file(path, std::ios_base::binary | std::ios_base::in) {
0348   if (not m_file.is_open() or m_file.fail()) {
0349     throw std::runtime_error("Could not open file '" + path + "'");
0350   }
0351 }
0352 
0353 template<char Delimiter>
0354 inline bool
0355 DsvReader<Delimiter>::read(std::vector<std::string>& columns) {
0356   // read the next line and check for both end-of-file and errors
0357   std::getline(m_file, m_line);
0358   if (m_file.eof()) {
0359     return false;
0360   }
0361   if (m_file.fail()) {
0362     throw std::runtime_error(
0363       "Could not read line " + std::to_string(m_num_lines));
0364   }
0365   m_num_lines += 1;
0366 
0367   // split the line into columns
0368   columns.clear();
0369   for (std::string::size_type pos = 0; pos < m_line.size();) {
0370     auto del = m_line.find_first_of(Delimiter, pos);
0371     if (del == std::string::npos) {
0372       // reached the end of the line; also determines the last column
0373       columns.emplace_back(m_line, pos);
0374       break;
0375     } else {
0376       columns.emplace_back(m_line, pos, del - pos);
0377       // start next column search after the delimiter
0378       pos = del + 1;
0379     }
0380   }
0381   return true;
0382 }
0383 
0384 // implementation named tuple reader
0385 
0386 template<char Delimiter, typename NamedTuple>
0387 inline NamedTupleDsvReader<Delimiter, NamedTuple>::NamedTupleDsvReader(
0388   const std::string& path, const std::vector<std::string>& optional_columns,
0389   bool verify_header)
0390   : m_reader(path) {
0391   // optional columns only work if we verify the header
0392   if ((not optional_columns.empty()) and (not verify_header)) {
0393     throw std::runtime_error(
0394       "Optional columns can not be used without header verification");
0395   }
0396   // first line is always the header
0397   if (not m_reader.read(m_columns)) {
0398     throw std::runtime_error("Could not read header from '" + path + "'");
0399   }
0400   if (verify_header) {
0401     parse_header(optional_columns);
0402   } else {
0403     use_default_columns();
0404   }
0405 }
0406 
0407 template<char Delimiter, typename NamedTuple>
0408 inline bool
0409 NamedTupleDsvReader<Delimiter, NamedTuple>::read(NamedTuple& record) {
0410   if (not m_reader.read(m_columns)) {
0411     return false;
0412   }
0413   // check for consistent entries per-line
0414   if (m_columns.size() < m_num_columns) {
0415     throw std::runtime_error(
0416       "Too few columns in line " + std::to_string(m_reader.num_lines()));
0417   }
0418   if (m_num_columns < m_columns.size()) {
0419     throw std::runtime_error(
0420       "Too many columns in line " + std::to_string(m_reader.num_lines()));
0421   }
0422   // convert to tuple
0423   parse_record(
0424     record, std::make_index_sequence<std::tuple_size<Tuple>::value>{});
0425   return true;
0426 }
0427 
0428 template<char Delimiter, typename NamedTuple>
0429 template<typename T>
0430 inline bool
0431 NamedTupleDsvReader<Delimiter, NamedTuple>::read(
0432   NamedTuple& record, std::vector<T>& extra) {
0433   // parse columns belonging to the regular record
0434   if (not read(record)) {
0435     return false;
0436   }
0437   // parse extra columns
0438   extra.resize(m_extra_columns.size());
0439   for (std::size_t i = 0; i < m_extra_columns.size(); ++i) {
0440     parse(m_columns[m_extra_columns[i]], extra[i]);
0441   }
0442   return true;
0443 }
0444 
0445 template<char Delimiter, typename NamedTuple>
0446 inline void
0447 NamedTupleDsvReader<Delimiter, NamedTuple>::use_default_columns() {
0448   // assume row content is identical in content and order to the tuple
0449   m_num_columns = std::tuple_size<Tuple>::value;
0450   for (std::size_t i = 0; i < m_tuple_column_map.size(); ++i) {
0451     m_tuple_column_map[i] = i;
0452   }
0453   // no extra columns by construction
0454   m_extra_columns.clear();
0455 }
0456 
0457 template<char Delimiter, typename NamedTuple>
0458 inline void
0459 NamedTupleDsvReader<Delimiter, NamedTuple>::parse_header(
0460   const std::vector<std::string>& optional_columns) {
0461   const auto& names = NamedTuple::names();
0462 
0463   // the number of header columns fixes the expected number of data columns
0464   m_num_columns = m_columns.size();
0465 
0466   // check that all non-optional columns are available
0467   for (const auto& name : names) {
0468     // no need to for availability if the column is optional
0469     auto o = std::find(optional_columns.begin(), optional_columns.end(), name);
0470     if (o != optional_columns.end()) {
0471       continue;
0472     }
0473     // missing, non-optional column mean we can not continue
0474     auto c = std::find(m_columns.begin(), m_columns.end(), name);
0475     if (c == m_columns.end()) {
0476       throw std::runtime_error("Missing header column '" + name + "'");
0477     }
0478   }
0479 
0480   // ensure missing columns are correctly marked as such
0481   m_tuple_column_map.fill(SIZE_MAX);
0482 
0483   // determine column-tuple mapping and extra column indices
0484   m_extra_columns.clear();
0485   for (std::size_t i = 0; i < m_columns.size(); ++i) {
0486     // find the position of the column in the tuple.
0487     auto it = std::find(names.begin(), names.end(), m_columns[i]);
0488     if (it != names.end()) {
0489       // establish mapping between column and tuple item position
0490       m_tuple_column_map[std::distance(names.begin(), it)] = i;
0491     } else {
0492       // record non-tuple columns
0493       m_extra_columns.push_back(i);
0494     }
0495   }
0496 }
0497 
0498 } // namespace io_dsv_impl
0499 
0500 /// Write arbitrary data as comma-separated values into as text file.
0501 using CsvWriter = io_dsv_impl::DsvWriter<','>;
0502 
0503 /// Write arbitrary data as tab-separated values into as text file.
0504 using TsvWriter = io_dsv_impl::DsvWriter<'\t'>;
0505 
0506 /// Write tuple-like records as comma-separated values into a text file.
0507 template<typename T>
0508 using NamedTupleCsvWriter = io_dsv_impl::NamedTupleDsvWriter<',', T>;
0509 
0510 /// Read tuple-like records from a comma-separated file.
0511 template<typename T>
0512 using NamedTupleCsvReader = io_dsv_impl::NamedTupleDsvReader<',', T>;
0513 
0514 /// Write tuple-like records as tab-separated values into a text file.
0515 template<typename T>
0516 using NamedTupleTsvWriter = io_dsv_impl::NamedTupleDsvWriter<'\t', T>;
0517 
0518 /// Read tuple-like records from a tab-separated file.
0519 template<typename T>
0520 using NamedTupleTsvReader = io_dsv_impl::NamedTupleDsvReader<'\t', T>;
0521 
0522 } // namespace dfe