Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:54

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cstdint>
0021 #include <memory>
0022 #include <string>
0023 #include <vector>
0024 
0025 #include "arrow/io/caching.h"
0026 #include "arrow/util/type_fwd.h"
0027 #include "parquet/metadata.h"  // IWYU pragma: keep
0028 #include "parquet/platform.h"
0029 #include "parquet/properties.h"
0030 
0031 namespace parquet {
0032 
0033 class ColumnReader;  // Forward declarations for types this header names below.
0034 class FileMetaData;
0035 class PageIndexReader;
0036 class BloomFilterReader;
0037 class PageReader;
0038 class RowGroupMetaData;
0039 
     // In the declarations visible in this header these types appear only behind
     // pointers, references, or smart pointers, so complete definitions are not
     // needed here.
0040 namespace internal {
0041 class RecordReader;
0042 }
0043 
0044 class PARQUET_EXPORT RowGroupReader {
     // Reader for one row group. It is constructed from a Contents implementation
     // and hands out the row group metadata plus per-column readers.
0045  public:
0046   // Declare an abstract class 'Contents' to aid dependency injection and more
0047   // easily create test fixtures
0048   // An implementation of the Contents class is defined in the .cc file
0049   struct Contents {
0050     virtual ~Contents() {}
0051     virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
0052     virtual const RowGroupMetaData* metadata() const = 0;
0053     virtual const ReaderProperties* properties() const = 0;
0054   };
0055 
       // Takes ownership of `contents` (it is passed as a std::unique_ptr).
0056   explicit RowGroupReader(std::unique_ptr<Contents> contents);
0057 
0058   // Returns the row group metadata
0059   const RowGroupMetaData* metadata() const;
0060 
0061   // Construct a ColumnReader for the indicated row group-relative
0062   // column. Ownership is shared with the RowGroupReader.
0063   std::shared_ptr<ColumnReader> Column(int i);
0064 
0065   // EXPERIMENTAL: Construct a RecordReader for the indicated column of the row group.
0066   // Ownership is shared with the RowGroupReader.
0067   std::shared_ptr<internal::RecordReader> RecordReader(int i,
0068                                                        bool read_dictionary = false);
0069 
0070   // Construct a ColumnReader, trying to enable exposed encoding.
0071   //
0072   // For dictionary encoding, currently we only support column chunks that are fully
0073   // dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded.
0074   // If a column chunk uses dictionary encoding but then falls back to plain encoding, the
0075   // encoding will not be exposed.
0076   //
0077   // The returned column reader provides an API GetExposedEncoding() for the
0078   // users to check the exposed encoding and determine how to read the batches.
0079   //
0080   // \note API EXPERIMENTAL
0081   std::shared_ptr<ColumnReader> ColumnWithExposeEncoding(
0082       int i, ExposedEncoding encoding_to_expose);
0083 
0084   // Construct a RecordReader, trying to enable exposed encoding.
0085   //
0086   // For dictionary encoding, currently we only support column chunks that are
0087   // fully dictionary encoded byte arrays. The caller should verify if the reader can read
0088   // and expose the dictionary by checking the reader's read_dictionary(). If a column
0089   // chunk uses dictionary encoding but then falls back to plain encoding, the returned
0090   // reader will read decoded data without exposing the dictionary.
0091   //
0092   // \note API EXPERIMENTAL
0093   std::shared_ptr<internal::RecordReader> RecordReaderWithExposeEncoding(
0094       int i, ExposedEncoding encoding_to_expose);
0095 
       // Returns a PageReader for column `i`; the std::unique_ptr return type hands
       // ownership of the reader to the caller.
       // NOTE(review): presumably forwards to Contents::GetColumnPageReader() - confirm
       // in the .cc file.
0096   std::unique_ptr<PageReader> GetColumnPageReader(int i);
0097 
0098  private:
0099   // Holds a pointer to an instance of Contents implementation
0100   std::unique_ptr<Contents> contents_;
0101 };
0102 
0103 class PARQUET_EXPORT ParquetFileReader {
     // File-level reader: gives access to the file metadata, row group readers,
     // the page index and bloom filter readers, and optional pre-buffering of
     // column chunk byte ranges.
0104  public:
0105   // Declare an abstract class 'Contents' to aid dependency injection and more
0106   // easily create test fixtures
0107   // An implementation of the Contents class is defined in the .cc file
0108   struct PARQUET_EXPORT Contents {
         // Create a Contents implementation for `source`.
         // NOTE(review): presumably a non-null `metadata` is used instead of parsing
         // the metadata from `source` again - confirm in the .cc file.
0109     static std::unique_ptr<Contents> Open(
0110         std::shared_ptr<::arrow::io::RandomAccessFile> source,
0111         const ReaderProperties& props = default_reader_properties(),
0112         std::shared_ptr<FileMetaData> metadata = NULLPTR);
0113 
         // Same parameters as Open(), but the Contents is delivered through the
         // returned Future.
0114     static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync(
0115         std::shared_ptr<::arrow::io::RandomAccessFile> source,
0116         const ReaderProperties& props = default_reader_properties(),
0117         std::shared_ptr<FileMetaData> metadata = NULLPTR);
0118 
0119     virtual ~Contents() = default;
0120     // Perform any cleanup associated with the file contents
0121     virtual void Close() = 0;
0122     virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
0123     virtual std::shared_ptr<FileMetaData> metadata() const = 0;
0124     virtual std::shared_ptr<PageIndexReader> GetPageIndexReader() = 0;
0125     virtual BloomFilterReader& GetBloomFilterReader() = 0;
0126   };
0127 
       // NOTE(review): presumably constructs a reader with no Contents attached; the
       // static factories below or Open(std::unique_ptr<Contents>) supply one - confirm.
0128   ParquetFileReader();
0129   ~ParquetFileReader();
0130 
0131   // Create a file reader instance from an Arrow file object. Thread-safety is
0132   // the responsibility of the file implementation
0133   static std::unique_ptr<ParquetFileReader> Open(
0134       std::shared_ptr<::arrow::io::RandomAccessFile> source,
0135       const ReaderProperties& props = default_reader_properties(),
0136       std::shared_ptr<FileMetaData> metadata = NULLPTR);
0137 
0138   // API Convenience to open a serialized Parquet file on disk, using Arrow IO
0139   // interfaces.
0140   static std::unique_ptr<ParquetFileReader> OpenFile(
0141       const std::string& path, bool memory_map = false,
0142       const ReaderProperties& props = default_reader_properties(),
0143       std::shared_ptr<FileMetaData> metadata = NULLPTR);
0144 
0145   // Asynchronously open a file reader from an Arrow file object.
0146   // Does not throw - all errors are reported through the Future.
0147   static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync(
0148       std::shared_ptr<::arrow::io::RandomAccessFile> source,
0149       const ReaderProperties& props = default_reader_properties(),
0150       std::shared_ptr<FileMetaData> metadata = NULLPTR);
0151 
       // Non-static overload: takes ownership of an already constructed Contents
       // implementation (it is passed as a std::unique_ptr).
0152   void Open(std::unique_ptr<Contents> contents);
       // NOTE(review): presumably forwards to Contents::Close() - confirm in the .cc file.
0153   void Close();
0154 
0155   // The RowGroupReader is owned by the FileReader
0156   std::shared_ptr<RowGroupReader> RowGroup(int i);
0157 
0158   // Returns the file metadata. Only one instance is ever created
0159   std::shared_ptr<FileMetaData> metadata() const;
0160 
0161   /// Returns the PageIndexReader. Only one instance is ever created.
0162   ///
0163   /// If the file does not have the page index, nullptr may be returned.
0164   /// Because checking for the existence of the page index in the file is
0165   /// costly, a non-null value may be returned even if the page index does
0166   /// not exist. It is the caller's responsibility to check the return
0167   /// value and follow-up calls to PageIndexReader.
0168   ///
0169   /// WARNING: The returned PageIndexReader must not outlive the ParquetFileReader.
0170   /// Initializing GetPageIndexReader() is not thread-safe.
0171   std::shared_ptr<PageIndexReader> GetPageIndexReader();
0172 
0173   /// Returns the BloomFilterReader. Only one instance is ever created.
0174   ///
0175   /// WARNING: The returned BloomFilterReader must not outlive the ParquetFileReader.
0176   /// Initializing GetBloomFilterReader() is not thread-safe.
0177   BloomFilterReader& GetBloomFilterReader();
0178 
0179   /// Pre-buffer the specified column indices in all row groups.
0180   ///
0181   /// Readers can optionally call this to cache the necessary slices
0182   /// of the file in-memory before deserialization. Arrow readers can
0183   /// automatically do this via an option. This is intended to
0184   /// increase performance when reading from high-latency filesystems
0185   /// (e.g. Amazon S3).
0186   ///
0187   /// After calling this, creating readers for row groups/column
0188   /// indices that were not buffered may fail. Creating multiple
0189   /// readers for a subset of the buffered regions is
0190   /// acceptable. This may be called again to buffer a different set
0191   /// of row groups/columns.
0192   ///
0193   /// If memory usage is a concern, note that data will remain
0194   /// buffered in memory until either \a PreBuffer() is called again,
0195   /// or the reader itself is destructed. Reading - and buffering -
0196   /// only one row group at a time may be useful.
0197   ///
0198   /// This method may throw.
0199   void PreBuffer(const std::vector<int>& row_groups,
0200                  const std::vector<int>& column_indices,
0201                  const ::arrow::io::IOContext& ctx,
0202                  const ::arrow::io::CacheOptions& options);
0203 
0204   /// Retrieve the list of byte ranges that would need to be read to retrieve
0205   /// the data for the specified row groups and column indices.
0206   ///
0207   /// A reader can optionally call this if they wish to handle their own
0208   /// caching and management of file reads (or offload them to other readers).
0209   /// Unlike PreBuffer, this method will not perform any actual caching or
0210   /// reads, instead just using the file metadata to determine the byte ranges
0211   /// that would need to be read if you were to consume the entirety of the column
0212   /// chunks for the provided columns in the specified row groups.
0213   ///
0214   /// If row_groups or column_indices are empty, then the result of this will be empty.
0215   ///
0216   /// hole_size_limit represents the maximum distance, in bytes, between two
0217   /// consecutive ranges; beyond this value, ranges will not be combined. The default
0218   /// value is 1MB.
0219   ///
0220   /// range_size_limit is the maximum size in bytes of a combined range; if combining
0221   /// two consecutive ranges would produce a range larger than this, they are not
0222   /// combined. The default value is 64MB. This *must* be larger than hole_size_limit.
0223   ///
0224   /// This will not take into account page indexes or any other predicate push down
0225   /// benefits that may be available.
0226   ::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
0227       const std::vector<int>& row_groups, const std::vector<int>& column_indices,
0228       int64_t hole_size_limit = 1024 * 1024, int64_t range_size_limit = 64 * 1024 * 1024);
0229 
0230   /// Wait for the specified row groups and column indices to be pre-buffered.
0231   ///
0232   /// After the returned Future completes, reading the specified row
0233   /// groups/columns will not block.
0234   ///
0235   /// PreBuffer must be called first. This method does not throw.
0236   ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
0237                                  const std::vector<int>& column_indices) const;
0238 
0239  private:
0240   // Holds a pointer to an instance of Contents implementation
0241   std::unique_ptr<Contents> contents_;
0242 };
0243 
0244 // Read only the Parquet file metadata from `source` and return it.
0245 std::shared_ptr<FileMetaData> PARQUET_EXPORT
0246 ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);
0247 
0248 /// \brief Scan all values in a file. Useful for performance testing.
0249 /// \param[in] columns the column numbers to scan. If empty, all columns are scanned
0250 /// \param[in] column_batch_size number of values to read at a time when scanning a column
0251 /// \param[in] reader a ParquetFileReader instance
0252 /// \return number of semantic rows in file
0253 PARQUET_EXPORT
0254 int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
0255                          ParquetFileReader* reader);
0256 
0257 }  // namespace parquet