Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:26:57

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <memory>
0021 
0022 #include "arrow/csv/options.h"  // IWYU pragma: keep
0023 #include "arrow/io/interfaces.h"
0024 #include "arrow/record_batch.h"
0025 #include "arrow/result.h"
0026 #include "arrow/type.h"
0027 #include "arrow/type_fwd.h"
0028 #include "arrow/util/future.h"
0029 #include "arrow/util/thread_pool.h"
0030 #include "arrow/util/visibility.h"
0031 
0032 namespace arrow {
0033 namespace io {
0034 class InputStream;
0035 }  // namespace io
0036 
0037 namespace csv {
0038 
0039 /// A class that reads an entire CSV file into a Arrow Table
0040 class ARROW_EXPORT TableReader {
0041  public:
0042   virtual ~TableReader() = default;
0043 
0044   /// Read the entire CSV file and convert it to a Arrow Table
0045   virtual Result<std::shared_ptr<Table>> Read() = 0;
0046   /// Read the entire CSV file and convert it to a Arrow Table
0047   virtual Future<std::shared_ptr<Table>> ReadAsync() = 0;
0048 
0049   /// Create a TableReader instance
0050   static Result<std::shared_ptr<TableReader>> Make(io::IOContext io_context,
0051                                                    std::shared_ptr<io::InputStream> input,
0052                                                    const ReadOptions&,
0053                                                    const ParseOptions&,
0054                                                    const ConvertOptions&);
0055 };
0056 
0057 /// \brief A class that reads a CSV file incrementally
0058 ///
0059 /// Caveats:
0060 /// - For now, this is always single-threaded (regardless of `ReadOptions::use_threads`.
0061 /// - Type inference is done on the first block and types are frozen afterwards;
0062 ///   to make sure the right data types are inferred, either set
0063 ///   `ReadOptions::block_size` to a large enough value, or use
0064 ///   `ConvertOptions::column_types` to set the desired data types explicitly.
0065 class ARROW_EXPORT StreamingReader : public RecordBatchReader {
0066  public:
0067   virtual ~StreamingReader() = default;
0068 
0069   virtual Future<std::shared_ptr<RecordBatch>> ReadNextAsync() = 0;
0070 
0071   /// \brief Return the number of bytes which have been read and processed
0072   ///
0073   /// The returned number includes CSV bytes which the StreamingReader has
0074   /// finished processing, but not bytes for which some processing (e.g.
0075   /// CSV parsing or conversion to Arrow layout) is still ongoing.
0076   ///
0077   /// Furthermore, the following rules apply:
0078   /// - bytes skipped by `ReadOptions.skip_rows` are counted as being read before
0079   /// any records are returned.
0080   /// - bytes read while parsing the header are counted as being read before any
0081   /// records are returned.
0082   /// - bytes skipped by `ReadOptions.skip_rows_after_names` are counted after the
0083   /// first batch is returned.
0084   virtual int64_t bytes_read() const = 0;
0085 
0086   /// Create a StreamingReader instance
0087   ///
0088   /// This involves some I/O as the first batch must be loaded during the creation process
0089   /// so it is returned as a future
0090   ///
0091   /// Currently, the StreamingReader is not async-reentrant and does not do any fan-out
0092   /// parsing (see ARROW-11889)
0093   static Future<std::shared_ptr<StreamingReader>> MakeAsync(
0094       io::IOContext io_context, std::shared_ptr<io::InputStream> input,
0095       arrow::internal::Executor* cpu_executor, const ReadOptions&, const ParseOptions&,
0096       const ConvertOptions&);
0097 
0098   static Result<std::shared_ptr<StreamingReader>> Make(
0099       io::IOContext io_context, std::shared_ptr<io::InputStream> input,
0100       const ReadOptions&, const ParseOptions&, const ConvertOptions&);
0101 };
0102 
0103 /// \brief Count the logical rows of data in a CSV file (i.e. the
0104 /// number of rows you would get if you read the file into a table).
0105 ARROW_EXPORT
0106 Future<int64_t> CountRowsAsync(io::IOContext io_context,
0107                                std::shared_ptr<io::InputStream> input,
0108                                arrow::internal::Executor* cpu_executor,
0109                                const ReadOptions&, const ParseOptions&);
0110 
0111 }  // namespace csv
0112 }  // namespace arrow