Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:26:57

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cstdint>
0021 #include <memory>
0022 #include <string>
0023 #include <unordered_map>
0024 #include <vector>
0025 
0026 #include "arrow/csv/invalid_row.h"
0027 #include "arrow/csv/type_fwd.h"
0028 #include "arrow/io/interfaces.h"
0029 #include "arrow/status.h"
0030 #include "arrow/util/visibility.h"
0031 
0032 namespace arrow {
0033 
0034 class DataType;  // forward declaration; referenced below by ConvertOptions::column_types
0035 class TimestampParser;  // forward declaration; referenced below by ConvertOptions::timestamp_parsers
0036 
0037 namespace csv {
0038 
0039 // Named constant for the default escape character. Silly workaround for https://github.com/michaeljones/breathe/issues/453
0040 constexpr char kDefaultEscapeChar = '\\';  // a single backslash; default value of ParseOptions::escape_char
0041 
0042 struct ARROW_EXPORT ParseOptions {
0043   // Parsing options (each member's default is given by its in-class initializer)
0044 
0045   /// Field delimiter
0046   char delimiter = ',';
0047   /// Whether quoting is used
0048   bool quoting = true;
0049   /// Quoting character (only meaningful if `quoting` is true)
0050   char quote_char = '"';
0051   /// Whether a quote inside a value is double-quoted
0052   bool double_quote = true;
0053   /// Whether escaping is used
0054   bool escaping = false;
0055   /// Escaping character (only meaningful if `escaping` is true); defaults to kDefaultEscapeChar
0056   char escape_char = kDefaultEscapeChar;
0057   /// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
0058   bool newlines_in_values = false;
0059   /// Whether empty lines are ignored.  If false, an empty line represents
0060   /// a single empty value (assuming a one-column CSV file).
0061   bool ignore_empty_lines = true;
0062   /// A handler function for rows which do not have the correct number of columns
0063   InvalidRowHandler invalid_row_handler;  // no initializer here: left default-constructed
0064 
0065   /// Create parsing options with default values
0066   static ParseOptions Defaults();
0067 
0068   /// \brief Test that all set options are valid; the outcome is reported as a Status
0069   Status Validate() const;
0070 };
0071 
0072 struct ARROW_EXPORT ConvertOptions {
0073   // Conversion options (each member's default is given by its in-class initializer)
0074 
0075   /// Whether to check UTF8 validity of string columns
0076   bool check_utf8 = true;
0077   /// Optional per-column types, keyed by column name (disabling type inference on those columns)
0078   std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
0079   /// Recognized spellings for null values (empty unless populated, e.g. by `Defaults()`)
0080   std::vector<std::string> null_values;
0081   /// Recognized spellings for boolean true values (empty unless populated, e.g. by `Defaults()`)
0082   std::vector<std::string> true_values;
0083   /// Recognized spellings for boolean false values (empty unless populated, e.g. by `Defaults()`)
0084   std::vector<std::string> false_values;
0085 
0086   /// Whether string / binary columns can have null values.
0087   ///
0088   /// If true, then strings in "null_values" are considered null for string columns.
0089   /// If false, then all strings are valid string values.
0090   bool strings_can_be_null = false;
0091 
0092   /// Whether quoted values can be null.
0093   ///
0094   /// If true, then strings in "null_values" are also considered null when they
0095   /// appear quoted in the CSV file. Otherwise, quoted values are never considered null.
0096   bool quoted_strings_can_be_null = true;
0097 
0098   /// Whether to try to automatically dict-encode string / binary data.
0099   /// If true, then when type inference detects a string or binary column,
0100   /// it is dict-encoded up to `auto_dict_max_cardinality` distinct values
0101   /// (per chunk), after which it switches to regular encoding.
0102   ///
0103   /// This setting is ignored for non-inferred columns (those in `column_types`).
0104   bool auto_dict_encode = false;
0105   int32_t auto_dict_max_cardinality = 50;  ///< Distinct-value limit (per chunk) for `auto_dict_encode`
0106 
0107   /// Decimal point character for floating-point and decimal data
0108   char decimal_point = '.';
0109 
0110   // XXX Should we have a separate FilterOptions?
0111 
0112   /// If non-empty, indicates the names of columns from the CSV file that should
0113   /// be actually read and converted (in the vector's order).
0114   /// Columns not in this vector will be ignored.
0115   std::vector<std::string> include_columns;
0116   /// If false, columns in `include_columns` but not in the CSV file will error out.
0117   /// If true, columns in `include_columns` but not in the CSV file will produce
0118   /// a column of nulls (whose type is selected using `column_types`,
0119   /// or null by default).
0120   /// This option is ignored if `include_columns` is empty.
0121   bool include_missing_columns = false;
0122 
0123   /// User-defined timestamp parsers, using the virtual parser interface in
0124   /// arrow/util/value_parsing.h. More than one parser can be specified, and
0125   /// the CSV conversion logic will try parsing values starting from the
0126   /// beginning of this vector. If no parsers are specified, we use the default
0127   /// built-in ISO-8601 parser.
0128   std::vector<std::shared_ptr<TimestampParser>> timestamp_parsers;
0129 
0130   /// Create conversion options with default values, including conventional
0131   /// values for `null_values`, `true_values` and `false_values`
0132   static ConvertOptions Defaults();
0133 
0134   /// \brief Test that all set options are valid; the outcome is reported as a Status
0135   Status Validate() const;
0136 };
0137 
0138 struct ARROW_EXPORT ReadOptions {
0139   // Reader options (each member's default is given by its in-class initializer)
0140 
0141   /// Whether to use the global CPU thread pool
0142   bool use_threads = true;
0143 
0144   /// \brief Block size we request from the IO layer.
0145   ///
0146   /// This will determine multi-threading granularity as well as
0147   /// the size of individual record batches.
0148   /// Minimum valid value for block size is 1.
0149   int32_t block_size = 1 << 20;  // 1 MB (2^20 = 1048576)
0150 
0151   /// Number of header rows to skip (not including the row of column names, if any)
0152   int32_t skip_rows = 0;
0153 
0154   /// Number of rows to skip after the column names are read, if any
0155   int32_t skip_rows_after_names = 0;
0156 
0157   /// Column names for the target table.
0158   /// If empty, fall back on `autogenerate_column_names`.
0159   std::vector<std::string> column_names;
0160 
0161   /// Whether to autogenerate column names if `column_names` is empty.
0162   /// If true, column names will be of the form "f0", "f1"...
0163   /// If false, column names will be read from the first CSV row after `skip_rows`.
0164   bool autogenerate_column_names = false;
0165 
0166   /// Create read options with default values
0167   static ReadOptions Defaults();
0168 
0169   /// \brief Test that all set options are valid; the outcome is reported as a Status
0170   Status Validate() const;
0171 };
0172 
0173 /// \brief Quoting style for CSV writing (used by WriteOptions::quoting_style)
0174 enum class ARROW_EXPORT QuotingStyle {
0175   /// Only enclose in quotes those values which need it, because their CSV rendering
0176   /// can itself contain quotes (e.g. strings or binary values)
0177   Needed,
0178   /// Enclose all valid values in quotes. Nulls are not quoted. May cause readers to
0179   /// interpret all values as strings if schema is inferred.
0180   AllValid,
0181   /// Do not enclose any values in quotes. Prevents values from containing quotes ("),
0182   /// cell delimiters (,) or line endings (\\r, \\n), (following RFC4180). If values
0183   /// contain these characters, an error is caused when attempting to write.
0184   None
0185 };
0186 
0187 struct ARROW_EXPORT WriteOptions {  // CSV writing options (defaults given by the in-class initializers)
0188   /// Whether to write an initial header line with column names
0189   bool include_header = true;
0190 
0191   /// \brief Maximum number of rows processed at a time
0192   ///
0193   /// The CSV writer converts and writes data in batches of N rows.
0194   /// This number can impact performance.
0195   int32_t batch_size = 1024;
0196 
0197   /// Field delimiter
0198   char delimiter = ',';
0199 
0200   /// \brief The string to write for null values. Quotes are not allowed in this string.
0201   std::string null_string;  // no initializer: an empty string by default
0202 
0203   /// \brief IO context for writing.
0204   io::IOContext io_context;
0205 
0206   /// \brief The end of line string to use for ending rows (a single LF by default)
0207   std::string eol = "\n";
0208 
0209   /// \brief Quoting style (see QuotingStyle); defaults to QuotingStyle::Needed
0210   QuotingStyle quoting_style = QuotingStyle::Needed;
0211 
0212   /// Create write options with default values
0213   static WriteOptions Defaults();
0214 
0215   /// \brief Test that all set options are valid; the outcome is reported as a Status
0216   Status Validate() const;
0217 };
0218 
0219 }  // namespace csv
0220 }  // namespace arrow