|
|
|||
File indexing completed on 2026-04-17 08:28:54
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cstdint> 0021 #include <memory> 0022 #include <string> 0023 #include <vector> 0024 0025 #include "arrow/io/caching.h" 0026 #include "arrow/util/type_fwd.h" 0027 #include "parquet/metadata.h" // IWYU pragma: keep 0028 #include "parquet/platform.h" 0029 #include "parquet/properties.h" 0030 0031 namespace parquet { 0032 0033 class ColumnReader; 0034 class FileMetaData; 0035 class PageIndexReader; 0036 class BloomFilterReader; 0037 class PageReader; 0038 class RowGroupMetaData; 0039 0040 namespace internal { 0041 class RecordReader; 0042 } 0043 0044 class PARQUET_EXPORT RowGroupReader { 0045 public: 0046 // Forward declare a virtual class 'Contents' to aid dependency injection and more 0047 // easily create test fixtures 0048 // An implementation of the Contents class is defined in the .cc file 0049 struct Contents { 0050 virtual ~Contents() {} 0051 virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0; 0052 virtual const RowGroupMetaData* metadata() const = 0; 0053 virtual const ReaderProperties* properties() const = 0; 0054 }; 0055 0056 explicit RowGroupReader(std::unique_ptr<Contents> contents); 0057 0058 // Returns the rowgroup metadata 0059 const RowGroupMetaData* metadata() const; 0060 0061 // Construct a ColumnReader for the indicated row group-relative 0062 // column. Ownership is shared with the RowGroupReader. 0063 std::shared_ptr<ColumnReader> Column(int i); 0064 0065 // EXPERIMENTAL: Construct a RecordReader for the indicated column of the row group. 0066 // Ownership is shared with the RowGroupReader. 0067 std::shared_ptr<internal::RecordReader> RecordReader(int i, 0068 bool read_dictionary = false); 0069 0070 // Construct a ColumnReader, trying to enable exposed encoding. 0071 // 0072 // For dictionary encoding, currently we only support column chunks that are fully 0073 // dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded. 0074 // If a column chunk uses dictionary encoding but then falls back to plain encoding, the 0075 // encoding will not be exposed. 0076 // 0077 // The returned column reader provides an API GetExposedEncoding() for the 0078 // users to check the exposed encoding and determine how to read the batches. 0079 // 0080 // \note API EXPERIMENTAL 0081 std::shared_ptr<ColumnReader> ColumnWithExposeEncoding( 0082 int i, ExposedEncoding encoding_to_expose); 0083 0084 // Construct a RecordReader, trying to enable exposed encoding. 0085 // 0086 // For dictionary encoding, currently we only support column chunks that are 0087 // fully dictionary encoded byte arrays. The caller should verify if the reader can read 0088 // and expose the dictionary by checking the reader's read_dictionary(). If a column 0089 // chunk uses dictionary encoding but then falls back to plain encoding, the returned 0090 // reader will read decoded data without exposing the dictionary. 0091 // 0092 // \note API EXPERIMENTAL 0093 std::shared_ptr<internal::RecordReader> RecordReaderWithExposeEncoding( 0094 int i, ExposedEncoding encoding_to_expose); 0095 0096 std::unique_ptr<PageReader> GetColumnPageReader(int i); 0097 0098 private: 0099 // Holds a pointer to an instance of Contents implementation 0100 std::unique_ptr<Contents> contents_; 0101 }; 0102 0103 class PARQUET_EXPORT ParquetFileReader { 0104 public: 0105 // Declare a virtual class 'Contents' to aid dependency injection and more 0106 // easily create test fixtures 0107 // An implementation of the Contents class is defined in the .cc file 0108 struct PARQUET_EXPORT Contents { 0109 static std::unique_ptr<Contents> Open( 0110 std::shared_ptr<::arrow::io::RandomAccessFile> source, 0111 const ReaderProperties& props = default_reader_properties(), 0112 std::shared_ptr<FileMetaData> metadata = NULLPTR); 0113 0114 static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync( 0115 std::shared_ptr<::arrow::io::RandomAccessFile> source, 0116 const ReaderProperties& props = default_reader_properties(), 0117 std::shared_ptr<FileMetaData> metadata = NULLPTR); 0118 0119 virtual ~Contents() = default; 0120 // Perform any cleanup associated with the file contents 0121 virtual void Close() = 0; 0122 virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0; 0123 virtual std::shared_ptr<FileMetaData> metadata() const = 0; 0124 virtual std::shared_ptr<PageIndexReader> GetPageIndexReader() = 0; 0125 virtual BloomFilterReader& GetBloomFilterReader() = 0; 0126 }; 0127 0128 ParquetFileReader(); 0129 ~ParquetFileReader(); 0130 0131 // Create a file reader instance from an Arrow file object. Thread-safety is 0132 // the responsibility of the file implementation 0133 static std::unique_ptr<ParquetFileReader> Open( 0134 std::shared_ptr<::arrow::io::RandomAccessFile> source, 0135 const ReaderProperties& props = default_reader_properties(), 0136 std::shared_ptr<FileMetaData> metadata = NULLPTR); 0137 0138 // API Convenience to open a serialized Parquet file on disk, using Arrow IO 0139 // interfaces. 0140 static std::unique_ptr<ParquetFileReader> OpenFile( 0141 const std::string& path, bool memory_map = false, 0142 const ReaderProperties& props = default_reader_properties(), 0143 std::shared_ptr<FileMetaData> metadata = NULLPTR); 0144 0145 // Asynchronously open a file reader from an Arrow file object. 0146 // Does not throw - all errors are reported through the Future. 0147 static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync( 0148 std::shared_ptr<::arrow::io::RandomAccessFile> source, 0149 const ReaderProperties& props = default_reader_properties(), 0150 std::shared_ptr<FileMetaData> metadata = NULLPTR); 0151 0152 void Open(std::unique_ptr<Contents> contents); 0153 void Close(); 0154 0155 // The RowGroupReader is owned by the FileReader 0156 std::shared_ptr<RowGroupReader> RowGroup(int i); 0157 0158 // Returns the file metadata. Only one instance is ever created 0159 std::shared_ptr<FileMetaData> metadata() const; 0160 0161 /// Returns the PageIndexReader. Only one instance is ever created. 0162 /// 0163 /// If the file does not have the page index, nullptr may be returned. 0164 /// Because it pays to check existence of page index in the file, it 0165 /// is possible to return a non null value even if page index does 0166 /// not exist. It is the caller's responsibility to check the return 0167 /// value and follow-up calls to PageIndexReader. 0168 /// 0169 /// WARNING: The returned PageIndexReader must not outlive the ParquetFileReader. 0170 /// Initialize GetPageIndexReader() is not thread-safety. 0171 std::shared_ptr<PageIndexReader> GetPageIndexReader(); 0172 0173 /// Returns the BloomFilterReader. Only one instance is ever created. 0174 /// 0175 /// WARNING: The returned BloomFilterReader must not outlive the ParquetFileReader. 0176 /// Initialize GetBloomFilterReader() is not thread-safety. 0177 BloomFilterReader& GetBloomFilterReader(); 0178 0179 /// Pre-buffer the specified column indices in all row groups. 0180 /// 0181 /// Readers can optionally call this to cache the necessary slices 0182 /// of the file in-memory before deserialization. Arrow readers can 0183 /// automatically do this via an option. This is intended to 0184 /// increase performance when reading from high-latency filesystems 0185 /// (e.g. Amazon S3). 0186 /// 0187 /// After calling this, creating readers for row groups/column 0188 /// indices that were not buffered may fail. Creating multiple 0189 /// readers for the a subset of the buffered regions is 0190 /// acceptable. This may be called again to buffer a different set 0191 /// of row groups/columns. 0192 /// 0193 /// If memory usage is a concern, note that data will remain 0194 /// buffered in memory until either \a PreBuffer() is called again, 0195 /// or the reader itself is destructed. Reading - and buffering - 0196 /// only one row group at a time may be useful. 0197 /// 0198 /// This method may throw. 0199 void PreBuffer(const std::vector<int>& row_groups, 0200 const std::vector<int>& column_indices, 0201 const ::arrow::io::IOContext& ctx, 0202 const ::arrow::io::CacheOptions& options); 0203 0204 /// Retrieve the list of byte ranges that would need to be read to retrieve 0205 /// the data for the specified row groups and column indices. 0206 /// 0207 /// A reader can optionally call this if they wish to handle their own 0208 /// caching and management of file reads (or offload them to other readers). 0209 /// Unlike PreBuffer, this method will not perform any actual caching or 0210 /// reads, instead just using the file metadata to determine the byte ranges 0211 /// that would need to be read if you were to consume the entirety of the column 0212 /// chunks for the provided columns in the specified row groups. 0213 /// 0214 /// If row_groups or column_indices are empty, then the result of this will be empty. 0215 /// 0216 /// hole_size_limit represents the maximum distance, in bytes, between two 0217 /// consecutive ranges; beyond this value, ranges will not be combined. The default 0218 /// value is 1MB. 0219 /// 0220 /// range_size_limit is the maximum size in bytes of a combined range; if combining 0221 /// two consecutive ranges would produce a range larger than this, they are not 0222 /// combined. The default values is 64MB. This *must* be larger than hole_size_limit. 0223 /// 0224 /// This will not take into account page indexes or any other predicate push down 0225 /// benefits that may be available. 0226 ::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges( 0227 const std::vector<int>& row_groups, const std::vector<int>& column_indices, 0228 int64_t hole_size_limit = 1024 * 1024, int64_t range_size_limit = 64 * 1024 * 1024); 0229 0230 /// Wait for the specified row groups and column indices to be pre-buffered. 0231 /// 0232 /// After the returned Future completes, reading the specified row 0233 /// groups/columns will not block. 0234 /// 0235 /// PreBuffer must be called first. This method does not throw. 0236 ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups, 0237 const std::vector<int>& column_indices) const; 0238 0239 private: 0240 // Holds a pointer to an instance of Contents implementation 0241 std::unique_ptr<Contents> contents_; 0242 }; 0243 0244 // Read only Parquet file metadata 0245 std::shared_ptr<FileMetaData> PARQUET_EXPORT 0246 ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source); 0247 0248 /// \brief Scan all values in file. Useful for performance testing 0249 /// \param[in] columns the column numbers to scan. If empty scans all 0250 /// \param[in] column_batch_size number of values to read at a time when scanning column 0251 /// \param[in] reader a ParquetFileReader instance 0252 /// \return number of semantic rows in file 0253 PARQUET_EXPORT 0254 int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size, 0255 ParquetFileReader* reader); 0256 0257 } // namespace parquet
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|