![]() |
|
|||
File indexing completed on 2025-08-28 08:26:57
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cstdint> 0021 #include <memory> 0022 #include <string> 0023 #include <unordered_map> 0024 #include <vector> 0025 0026 #include "arrow/csv/invalid_row.h" 0027 #include "arrow/csv/type_fwd.h" 0028 #include "arrow/io/interfaces.h" 0029 #include "arrow/status.h" 0030 #include "arrow/util/visibility.h" 0031 0032 namespace arrow { 0033 0034 class DataType; 0035 class TimestampParser; 0036 0037 namespace csv { 0038 0039 // Silly workaround for https://github.com/michaeljones/breathe/issues/453 0040 constexpr char kDefaultEscapeChar = '\\'; 0041 0042 struct ARROW_EXPORT ParseOptions { 0043 // Parsing options 0044 0045 /// Field delimiter 0046 char delimiter = ','; 0047 /// Whether quoting is used 0048 bool quoting = true; 0049 /// Quoting character (if `quoting` is true) 0050 char quote_char = '"'; 0051 /// Whether a quote inside a value is double-quoted 0052 bool double_quote = true; 0053 /// Whether escaping is used 0054 bool escaping = false; 0055 /// Escaping character (if `escaping` is true) 0056 char escape_char = kDefaultEscapeChar; 0057 /// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters 0058 bool newlines_in_values = false; 0059 /// Whether empty lines are ignored. If false, an empty line represents 0060 /// a single empty value (assuming a one-column CSV file). 0061 bool ignore_empty_lines = true; 0062 /// A handler function for rows which do not have the correct number of columns 0063 InvalidRowHandler invalid_row_handler; 0064 0065 /// Create parsing options with default values 0066 static ParseOptions Defaults(); 0067 0068 /// \brief Test that all set options are valid 0069 Status Validate() const; 0070 }; 0071 0072 struct ARROW_EXPORT ConvertOptions { 0073 // Conversion options 0074 0075 /// Whether to check UTF8 validity of string columns 0076 bool check_utf8 = true; 0077 /// Optional per-column types (disabling type inference on those columns) 0078 std::unordered_map<std::string, std::shared_ptr<DataType>> column_types; 0079 /// Recognized spellings for null values 0080 std::vector<std::string> null_values; 0081 /// Recognized spellings for boolean true values 0082 std::vector<std::string> true_values; 0083 /// Recognized spellings for boolean false values 0084 std::vector<std::string> false_values; 0085 0086 /// Whether string / binary columns can have null values. 0087 /// 0088 /// If true, then strings in "null_values" are considered null for string columns. 0089 /// If false, then all strings are valid string values. 0090 bool strings_can_be_null = false; 0091 0092 /// Whether quoted values can be null. 0093 /// 0094 /// If true, then strings in "null_values" are also considered null when they 0095 /// appear quoted in the CSV file. Otherwise, quoted values are never considered null. 0096 bool quoted_strings_can_be_null = true; 0097 0098 /// Whether to try to automatically dict-encode string / binary data. 0099 /// If true, then when type inference detects a string or binary column, 0100 /// it is dict-encoded up to `auto_dict_max_cardinality` distinct values 0101 /// (per chunk), after which it switches to regular encoding. 0102 /// 0103 /// This setting is ignored for non-inferred columns (those in `column_types`). 0104 bool auto_dict_encode = false; 0105 int32_t auto_dict_max_cardinality = 50; 0106 0107 /// Decimal point character for floating-point and decimal data 0108 char decimal_point = '.'; 0109 0110 // XXX Should we have a separate FilterOptions? 0111 0112 /// If non-empty, indicates the names of columns from the CSV file that should 0113 /// be actually read and converted (in the vector's order). 0114 /// Columns not in this vector will be ignored. 0115 std::vector<std::string> include_columns; 0116 /// If false, columns in `include_columns` but not in the CSV file will error out. 0117 /// If true, columns in `include_columns` but not in the CSV file will produce 0118 /// a column of nulls (whose type is selected using `column_types`, 0119 /// or null by default) 0120 /// This option is ignored if `include_columns` is empty. 0121 bool include_missing_columns = false; 0122 0123 /// User-defined timestamp parsers, using the virtual parser interface in 0124 /// arrow/util/value_parsing.h. More than one parser can be specified, and 0125 /// the CSV conversion logic will try parsing values starting from the 0126 /// beginning of this vector. If no parsers are specified, we use the default 0127 /// built-in ISO-8601 parser. 0128 std::vector<std::shared_ptr<TimestampParser>> timestamp_parsers; 0129 0130 /// Create conversion options with default values, including conventional 0131 /// values for `null_values`, `true_values` and `false_values` 0132 static ConvertOptions Defaults(); 0133 0134 /// \brief Test that all set options are valid 0135 Status Validate() const; 0136 }; 0137 0138 struct ARROW_EXPORT ReadOptions { 0139 // Reader options 0140 0141 /// Whether to use the global CPU thread pool 0142 bool use_threads = true; 0143 0144 /// \brief Block size we request from the IO layer. 0145 /// 0146 /// This will determine multi-threading granularity as well as 0147 /// the size of individual record batches. 0148 /// Minimum valid value for block size is 1 0149 int32_t block_size = 1 << 20; // 1 MB 0150 0151 /// Number of header rows to skip (not including the row of column names, if any) 0152 int32_t skip_rows = 0; 0153 0154 /// Number of rows to skip after the column names are read, if any 0155 int32_t skip_rows_after_names = 0; 0156 0157 /// Column names for the target table. 0158 /// If empty, fall back on autogenerate_column_names. 0159 std::vector<std::string> column_names; 0160 0161 /// Whether to autogenerate column names if `column_names` is empty. 0162 /// If true, column names will be of the form "f0", "f1"... 0163 /// If false, column names will be read from the first CSV row after `skip_rows`. 0164 bool autogenerate_column_names = false; 0165 0166 /// Create read options with default values 0167 static ReadOptions Defaults(); 0168 0169 /// \brief Test that all set options are valid 0170 Status Validate() const; 0171 }; 0172 0173 /// \brief Quoting style for CSV writing 0174 enum class ARROW_EXPORT QuotingStyle { 0175 /// Only enclose values in quotes which need them, because their CSV rendering can 0176 /// contain quotes itself (e.g. strings or binary values) 0177 Needed, 0178 /// Enclose all valid values in quotes. Nulls are not quoted. May cause readers to 0179 /// interpret all values as strings if schema is inferred. 0180 AllValid, 0181 /// Do not enclose any values in quotes. Prevents values from containing quotes ("), 0182 /// cell delimiters (,) or line endings (\\r, \\n), (following RFC4180). If values 0183 /// contain these characters, an error is caused when attempting to write. 0184 None 0185 }; 0186 0187 struct ARROW_EXPORT WriteOptions { 0188 /// Whether to write an initial header line with column names 0189 bool include_header = true; 0190 0191 /// \brief Maximum number of rows processed at a time 0192 /// 0193 /// The CSV writer converts and writes data in batches of N rows. 0194 /// This number can impact performance. 0195 int32_t batch_size = 1024; 0196 0197 /// Field delimiter 0198 char delimiter = ','; 0199 0200 /// \brief The string to write for null values. Quotes are not allowed in this string. 0201 std::string null_string; 0202 0203 /// \brief IO context for writing. 0204 io::IOContext io_context; 0205 0206 /// \brief The end of line character to use for ending rows 0207 std::string eol = "\n"; 0208 0209 /// \brief Quoting style 0210 QuotingStyle quoting_style = QuotingStyle::Needed; 0211 0212 /// Create write options with default values 0213 static WriteOptions Defaults(); 0214 0215 /// \brief Test that all set options are valid 0216 Status Validate() const; 0217 }; 0218 0219 } // namespace csv 0220 } // namespace arrow
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |