![]() |
|
|||
File indexing completed on 2025-08-28 08:26:57
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <memory> 0021 0022 #include "arrow/csv/options.h" // IWYU pragma: keep 0023 #include "arrow/io/interfaces.h" 0024 #include "arrow/record_batch.h" 0025 #include "arrow/result.h" 0026 #include "arrow/type.h" 0027 #include "arrow/type_fwd.h" 0028 #include "arrow/util/future.h" 0029 #include "arrow/util/thread_pool.h" 0030 #include "arrow/util/visibility.h" 0031 0032 namespace arrow { 0033 namespace io { 0034 class InputStream; 0035 } // namespace io 0036 0037 namespace csv { 0038 0039 /// A class that reads an entire CSV file into a Arrow Table 0040 class ARROW_EXPORT TableReader { 0041 public: 0042 virtual ~TableReader() = default; 0043 0044 /// Read the entire CSV file and convert it to a Arrow Table 0045 virtual Result<std::shared_ptr<Table>> Read() = 0; 0046 /// Read the entire CSV file and convert it to a Arrow Table 0047 virtual Future<std::shared_ptr<Table>> ReadAsync() = 0; 0048 0049 /// Create a TableReader instance 0050 static Result<std::shared_ptr<TableReader>> Make(io::IOContext io_context, 0051 std::shared_ptr<io::InputStream> input, 0052 const ReadOptions&, 0053 const ParseOptions&, 0054 const ConvertOptions&); 0055 }; 0056 0057 /// \brief A class that reads a CSV file incrementally 0058 /// 0059 /// Caveats: 0060 /// - For now, this is always single-threaded (regardless of `ReadOptions::use_threads`. 0061 /// - Type inference is done on the first block and types are frozen afterwards; 0062 /// to make sure the right data types are inferred, either set 0063 /// `ReadOptions::block_size` to a large enough value, or use 0064 /// `ConvertOptions::column_types` to set the desired data types explicitly. 0065 class ARROW_EXPORT StreamingReader : public RecordBatchReader { 0066 public: 0067 virtual ~StreamingReader() = default; 0068 0069 virtual Future<std::shared_ptr<RecordBatch>> ReadNextAsync() = 0; 0070 0071 /// \brief Return the number of bytes which have been read and processed 0072 /// 0073 /// The returned number includes CSV bytes which the StreamingReader has 0074 /// finished processing, but not bytes for which some processing (e.g. 0075 /// CSV parsing or conversion to Arrow layout) is still ongoing. 0076 /// 0077 /// Furthermore, the following rules apply: 0078 /// - bytes skipped by `ReadOptions.skip_rows` are counted as being read before 0079 /// any records are returned. 0080 /// - bytes read while parsing the header are counted as being read before any 0081 /// records are returned. 0082 /// - bytes skipped by `ReadOptions.skip_rows_after_names` are counted after the 0083 /// first batch is returned. 0084 virtual int64_t bytes_read() const = 0; 0085 0086 /// Create a StreamingReader instance 0087 /// 0088 /// This involves some I/O as the first batch must be loaded during the creation process 0089 /// so it is returned as a future 0090 /// 0091 /// Currently, the StreamingReader is not async-reentrant and does not do any fan-out 0092 /// parsing (see ARROW-11889) 0093 static Future<std::shared_ptr<StreamingReader>> MakeAsync( 0094 io::IOContext io_context, std::shared_ptr<io::InputStream> input, 0095 arrow::internal::Executor* cpu_executor, const ReadOptions&, const ParseOptions&, 0096 const ConvertOptions&); 0097 0098 static Result<std::shared_ptr<StreamingReader>> Make( 0099 io::IOContext io_context, std::shared_ptr<io::InputStream> input, 0100 const ReadOptions&, const ParseOptions&, const ConvertOptions&); 0101 }; 0102 0103 /// \brief Count the logical rows of data in a CSV file (i.e. the 0104 /// number of rows you would get if you read the file into a table). 0105 ARROW_EXPORT 0106 Future<int64_t> CountRowsAsync(io::IOContext io_context, 0107 std::shared_ptr<io::InputStream> input, 0108 arrow::internal::Executor* cpu_executor, 0109 const ReadOptions&, const ParseOptions&); 0110 0111 } // namespace csv 0112 } // namespace arrow
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |