Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:54

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include "arrow/io/interfaces.h"
0021 #include "parquet/encryption/type_fwd.h"
0022 #include "parquet/type_fwd.h"
0023 #include "parquet/types.h"
0024 
0025 #include <optional>
0026 #include <vector>
0027 
0028 namespace parquet {
0029 
0030 /// \brief ColumnIndex is a proxy around format::ColumnIndex.
     ///
     /// Abstract interface: every accessor below is pure virtual. Make() creates an
     /// instance from a serialized thrift message.
0031 class PARQUET_EXPORT ColumnIndex {
0032  public:
0033   /// \brief Create a ColumnIndex from a serialized thrift message.
       ///
       /// \param[in] descr descriptor of the column that the index describes.
       /// \param[in] serialized_index buffer holding the serialized thrift message.
       /// \param[in] index_len presumably the size in bytes of `serialized_index`
       ///            (TODO confirm against the implementation).
       /// \param[in] properties reader properties passed through to deserialization.
       /// \param[in] decryptor optional decryptor, defaults to NULLPTR; presumably only
       ///            needed when the serialized index is encrypted (TODO confirm).
       /// \returns the newly created ColumnIndex, owned by the caller.
0034   static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr,
0035                                            const void* serialized_index,
0036                                            uint32_t index_len,
0037                                            const ReaderProperties& properties,
0038                                            Decryptor* decryptor = NULLPTR);
0039
0040   virtual ~ColumnIndex() = default;
0041
0042   /// \brief A bitmap with a bit set for each data page that has only null values.
0043   ///
0044   /// The length of this vector is equal to the number of data pages in the column.
0045   virtual const std::vector<bool>& null_pages() const = 0;
0046
0047   /// \brief A vector of encoded lower bounds for each data page in this column.
0048   ///
0049   /// `null_pages` should be inspected first, as only pages with non-null values
0050   /// may have their lower bounds populated.
0051   virtual const std::vector<std::string>& encoded_min_values() const = 0;
0052
0053   /// \brief A vector of encoded upper bounds for each data page in this column.
0054   ///
0055   /// `null_pages` should be inspected first, as only pages with non-null values
0056   /// may have their upper bounds populated.
0057   virtual const std::vector<std::string>& encoded_max_values() const = 0;
0058
0059   /// \brief The ordering of lower and upper bounds.
0060   ///
0061   /// The boundary order applies across all lower bounds, and all upper bounds,
0062   /// respectively. However, the order between lower bounds and upper bounds
0063   /// cannot be derived from this.
0064   virtual BoundaryOrder::type boundary_order() const = 0;
0065
0066   /// \brief Whether per-page null count information is available.
0067   virtual bool has_null_counts() const = 0;
0068
0069   /// \brief An optional vector with the number of null values in each data page.
0070   ///
0071   /// `has_null_counts` should be called first to determine if this information is
0072   /// available.
0073   virtual const std::vector<int64_t>& null_counts() const = 0;
0074
0075   /// \brief A vector of page indices for non-null pages.
0076   virtual const std::vector<int32_t>& non_null_page_indices() const = 0;
0077
0078   /// \brief Whether definition level histograms are available.
0079   virtual bool has_definition_level_histograms() const = 0;
0080
0081   /// \brief Whether repetition level histograms are available.
0082   virtual bool has_repetition_level_histograms() const = 0;
0083
0084   /// \brief List of definition level histograms for each page, concatenated together.
0085   virtual const std::vector<int64_t>& definition_level_histograms() const = 0;
0086
0087   /// \brief List of repetition level histograms for each page, concatenated together.
0088   virtual const std::vector<int64_t>& repetition_level_histograms() const = 0;
0089 };
0090 
0091 /// \brief Typed implementation of ColumnIndex.
     ///
     /// Extends ColumnIndex with accessors that return the page bounds as decoded values
     /// of type `T` instead of encoded strings. The accessors are pure virtual, so this
     /// class is itself abstract.
0092 template <typename DType>
0093 class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex {
0094  public:
       /// The C++ value type associated with `DType`.
0095   using T = typename DType::c_type;
0096
0097   /// \brief A vector of lower bounds for each data page in this column.
0098   ///
0099   /// This is like `encoded_min_values`, but with the values decoded according to
0100   /// the column's physical type.
0101   /// `min_values` and `max_values` can be used together with `boundary_order`
0102   /// in order to prune some data pages when searching for specific values.
0103   virtual const std::vector<T>& min_values() const = 0;
0104
0105   /// \brief A vector of upper bounds for each data page in this column.
0106   ///
0107   /// Just like `min_values`, but for upper bounds instead of lower bounds.
0108   virtual const std::vector<T>& max_values() const = 0;
0109 };
0110 
     /// Convenience aliases for the TypedColumnIndex instantiations, one per data type
     /// listed below.
0111 using BoolColumnIndex = TypedColumnIndex<BooleanType>;
0112 using Int32ColumnIndex = TypedColumnIndex<Int32Type>;
0113 using Int64ColumnIndex = TypedColumnIndex<Int64Type>;
0114 using FloatColumnIndex = TypedColumnIndex<FloatType>;
0115 using DoubleColumnIndex = TypedColumnIndex<DoubleType>;
0116 using ByteArrayColumnIndex = TypedColumnIndex<ByteArrayType>;
0117 using FLBAColumnIndex = TypedColumnIndex<FLBAType>;
0118 
0119 /// \brief PageLocation is a proxy around format::PageLocation.
     ///
     /// NOTE: the members have no default initializers, so a default-initialized
     /// PageLocation holds indeterminate values. Value-initialize it (`PageLocation{}`)
     /// or assign every field before reading.
0120 struct PARQUET_EXPORT PageLocation {
0121   /// File offset of the data page.
0122   int64_t offset;
0123   /// Total compressed size of the data page and header.
0124   int32_t compressed_page_size;
0125   /// Row id of the first row in the page within the row group.
0126   int64_t first_row_index;
0127 };
0128 
0129 /// \brief OffsetIndex is a proxy around format::OffsetIndex.
     ///
     /// Abstract interface: the accessors below are pure virtual. Make() creates an
     /// instance from a serialized thrift message.
0130 class PARQUET_EXPORT OffsetIndex {
0131  public:
0132   /// \brief Create an OffsetIndex from a serialized thrift message.
       ///
       /// \param[in] serialized_index buffer holding the serialized thrift message.
       /// \param[in] index_len presumably the size in bytes of `serialized_index`
       ///            (TODO confirm against the implementation).
       /// \param[in] properties reader properties passed through to deserialization.
       /// \param[in] decryptor optional decryptor, defaults to NULLPTR; presumably only
       ///            needed when the serialized index is encrypted (TODO confirm).
       /// \returns the newly created OffsetIndex, owned by the caller.
0133   static std::unique_ptr<OffsetIndex> Make(const void* serialized_index,
0134                                            uint32_t index_len,
0135                                            const ReaderProperties& properties,
0136                                            Decryptor* decryptor = NULLPTR);
0137
0138   virtual ~OffsetIndex() = default;
0139
0140   /// \brief A vector of locations for each data page in this column.
0141   virtual const std::vector<PageLocation>& page_locations() const = 0;
0142
0143   /// \brief A vector of unencoded/uncompressed size of each page for BYTE_ARRAY types,
0144   /// or empty for other types.
0145   virtual const std::vector<int64_t>& unencoded_byte_array_data_bytes() const = 0;
0146 };
0147 
0148 /// \brief Interface for reading the page index for a Parquet row group.
0149 class PARQUET_EXPORT RowGroupPageIndexReader {
0150  public:
0151   virtual ~RowGroupPageIndexReader() = default;
0152
0153   /// \brief Read the column index of a column chunk.
0154   ///
0155   /// \param[in] i column ordinal of the column chunk.
0156   /// \returns column index of the column, or nullptr if it does not exist.
0157   /// \throws ParquetException if the index is out of bounds.
0158   virtual std::shared_ptr<ColumnIndex> GetColumnIndex(int32_t i) = 0;
0159
0160   /// \brief Read the offset index of a column chunk.
0161   ///
0162   /// \param[in] i column ordinal of the column chunk.
0163   /// \returns offset index of the column, or nullptr if it does not exist.
0164   /// \throws ParquetException if the index is out of bounds.
0165   virtual std::shared_ptr<OffsetIndex> GetOffsetIndex(int32_t i) = 0;
0166 };
0167 
     /// \brief Selects which kinds of page index to read.
     ///
     /// Both flags default to false, so a default-constructed selection requests
     /// neither the column index nor the offset index.
0168 struct PageIndexSelection {
0169   /// Specifies whether to read the column index.
0170   bool column_index = false;
0171   /// Specifies whether to read the offset index.
0172   bool offset_index = false;
0173 };
0174 
     /// \brief Stream insertion operator for PageIndexSelection.
     ///
     /// The output format is defined by the implementation, which is not part of this
     /// header.
0175 PARQUET_EXPORT
0176 std::ostream& operator<<(std::ostream& out, const PageIndexSelection& params);
0177 
     /// \brief Read ranges of the column index and offset index of a row group.
     ///
     /// Both members default to std::nullopt.
0178 struct RowGroupIndexReadRange {
0179   /// Base start and total size of column index of all column chunks in a row group.
0180   /// If none of the column chunks have column index, it is set to std::nullopt.
0181   std::optional<::arrow::io::ReadRange> column_index = std::nullopt;
0182   /// Base start and total size of offset index of all column chunks in a row group.
0183   /// If none of the column chunks have offset index, it is set to std::nullopt.
0184   std::optional<::arrow::io::ReadRange> offset_index = std::nullopt;
0185 };
0186 
0187 /// \brief Interface for reading the page index for a Parquet file.
0188 class PARQUET_EXPORT PageIndexReader {
0189  public:
0190   virtual ~PageIndexReader() = default;
0191
0192   /// \brief Create a PageIndexReader instance.
0193   /// \returns a PageIndexReader instance.
0194   /// WARNING: The returned PageIndexReader references all of the input parameters, so
0195   /// it must not outlive any of them. Usually these input parameters
0196   /// come from the same ParquetFileReader object, so it must not outlive the reader
0197   /// that creates this PageIndexReader.
0198   static std::shared_ptr<PageIndexReader> Make(
0199       ::arrow::io::RandomAccessFile* input, std::shared_ptr<FileMetaData> file_metadata,
0200       const ReaderProperties& properties,
0201       InternalFileDecryptor* file_decryptor = NULLPTR);
0202
0203   /// \brief Get the page index reader of a specific row group.
0204   /// \param[in] i row group ordinal to get page index reader.
0205   /// \returns RowGroupPageIndexReader of the specified row group. A nullptr may or may
0206   ///          not be returned if the page index for the row group is unavailable. It is
0207   ///          the caller's responsibility to check the return value of follow-up calls
0208   ///          to the RowGroupPageIndexReader.
0209   /// \throws ParquetException if the index is out of bounds.
0210   virtual std::shared_ptr<RowGroupPageIndexReader> RowGroup(int i) = 0;
0211
0212   /// \brief Advise the reader which part of page index will be read later.
0213   ///
0214   /// The PageIndexReader can optionally prefetch and cache page index that
0215   /// may be read later to get better performance.
0216   ///
0217   /// The contract of this function is as below:
0218   /// 1) If WillNeed() has not been called for a specific row group and the page index
0219   ///    exists, follow-up calls to get column index or offset index of all columns in
0220   ///    this row group SHOULD NOT FAIL, but the performance may not be optimal.
0221   /// 2) If WillNeed() has been called for a specific row group, follow-up calls to get
0222   ///    page index are limited to columns and index type requested by WillNeed().
0223   ///    So it MAY FAIL if columns that are not requested by WillNeed() are requested.
0224   /// 3) Later calls to WillNeed() MAY OVERRIDE previous calls of same row groups.
0225   /// For example,
0226   /// 1) If WillNeed() is not called for row group 0, then follow-up calls to read
0227   ///    column index and/or offset index of all columns of row group 0 should not
0228   ///    fail if its page index exists.
0229   /// 2) If WillNeed() is called for columns 0 and 1 for row group 0, then follow-up
0230   ///    call to read page index of column 2 for row group 0 MAY FAIL even if its
0231   ///    page index exists.
0232   /// 3) If WillNeed() is called for row group 0 with offset index only, then
0233   ///    follow-up call to read column index of row group 0 MAY FAIL even if
0234   ///    the column index of this column exists.
0235   /// 4) If WillNeed() is called for columns 0 and 1 for row group 0, and is later
0236   ///    called for columns 1 and 2 for row group 0, the later call overrides the
0237   ///    previous one and only columns 1 and 2 of row group 0 are allowed to be
0238   ///    accessed.
0239   ///
0240   /// \param[in] row_group_indices list of row group ordinals to read page index later.
0241   /// \param[in] column_indices list of column ordinals to read page index later. If it
0242   ///            is empty, it means all columns in the row group will be read.
0243   /// \param[in] selection which kind of page index is required later.
0244   virtual void WillNeed(const std::vector<int32_t>& row_group_indices,
0245                         const std::vector<int32_t>& column_indices,
0246                         const PageIndexSelection& selection) = 0;
0247
0248   /// \brief Advise the reader page index of these row groups will not be read anymore.
0249   ///
0250   /// The PageIndexReader implementation has the opportunity to cancel any prefetch or
0251   /// release resources that are related to these row groups.
0252   ///
0253   /// \param[in] row_group_indices list of row group ordinals whose page index will
0254   /// not be accessed anymore.
0255   virtual void WillNotNeed(const std::vector<int32_t>& row_group_indices) = 0;
0256
0257   /// \brief Determine the column index and offset index ranges for the given row group.
0258   ///
0259   /// \param[in] row_group_metadata row group metadata to get column chunk metadata.
0260   /// \param[in] columns list of column ordinals to get page index. If the list is empty,
0261   ///            it means all columns in the row group.
0262   /// \returns RowGroupIndexReadRange of the specified row group.
0263   /// \throws ParquetException if a selected column ordinal is out of bounds or the
0264   ///         page index metadata is corrupted.
0265   static RowGroupIndexReadRange DeterminePageIndexRangesInRowGroup(
0266       const RowGroupMetaData& row_group_metadata, const std::vector<int32_t>& columns);
0267 };
0268 
0269 /// \brief Interface for collecting column index of data pages in a column chunk.
     ///
     /// Expected call order, per the member documentation below: AddPage() for each
     /// data page, then Finish(), then WriteTo() and/or Build().
0270 class PARQUET_EXPORT ColumnIndexBuilder {
0271  public:
0272   /// \brief API convenience to create a ColumnIndexBuilder.
       ///
       /// \param descr descriptor of the column to build the index for.
0273   static std::unique_ptr<ColumnIndexBuilder> Make(const ColumnDescriptor* descr);
0274
0275   virtual ~ColumnIndexBuilder() = default;
0276
0277   /// \brief Add statistics of a data page.
0278   ///
0279   /// If the ColumnIndexBuilder has seen any corrupted statistics, it will
0280   /// not update statistics anymore.
0281   ///
0282   /// \param stats Page statistics in the encoded form.
0283   /// \param size_stats Size statistics of the page if available.
0284   virtual void AddPage(const EncodedStatistics& stats,
0285                        const SizeStatistics& size_stats) = 0;
0286
0287   /// \brief Complete the column index.
0288   ///
0289   /// Once called, AddPage() can no longer be called.
0290   /// WriteTo() and Build() can only be called after Finish() has been called.
0291   virtual void Finish() = 0;
0292
0293   /// \brief Serialize the column index thrift message.
0294   ///
0295   /// If the ColumnIndexBuilder has seen any corrupted statistics, it will
0296   /// not write any data to the sink.
0297   ///
0298   /// \param[out] sink output stream to write the serialized message.
0299   /// \param[in] encryptor encryptor to encrypt the serialized column index.
       ///            Defaults to NULLPTR.
0300   virtual void WriteTo(::arrow::io::OutputStream* sink,
0301                        Encryptor* encryptor = NULLPTR) const = 0;
0302
0303   /// \brief Create a ColumnIndex directly.
0304   ///
0305   /// \return If the ColumnIndexBuilder has seen any corrupted statistics, it simply
0306   /// returns nullptr. Otherwise the column index is built and returned.
0307   virtual std::unique_ptr<ColumnIndex> Build() const = 0;
0308 };
0309 
0310 /// \brief Interface for collecting offset index of data pages in a column chunk.
0311 class PARQUET_EXPORT OffsetIndexBuilder {
0312  public:
0313   /// \brief API convenience to create an OffsetIndexBuilder.
0314   static std::unique_ptr<OffsetIndexBuilder> Make();
0315
0316   virtual ~OffsetIndexBuilder() = default;
0317
0318   /// \brief Add page location and size stats of a data page.
       ///
       /// \param offset Offset of the data page. In the buffered row group mode this is
       ///        a relative offset that Finish() adjusts later.
       /// \param compressed_page_size Total compressed size of the data page and header.
       /// \param first_row_index Row id of the first row in the page within the row group.
       /// \param unencoded_byte_array_length Optional size value for the page; defaults
       ///        to an empty optional. Presumably feeds
       ///        OffsetIndex::unencoded_byte_array_data_bytes() (TODO confirm).
0319   virtual void AddPage(int64_t offset, int32_t compressed_page_size,
0320                        int64_t first_row_index,
0321                        std::optional<int64_t> unencoded_byte_array_length = {}) = 0;
0322
0323   /// \brief Add page location and size stats of a data page.
       ///
       /// Non-virtual overload that takes the location and size statistics as objects.
       /// Its definition is not part of this header; presumably it forwards to the
       /// virtual overload above (TODO confirm).
0324   void AddPage(const PageLocation& page_location, const SizeStatistics& size_stats);
0325
0326   /// \brief Complete the offset index.
0327   ///
0328   /// In the buffered row group mode, data pages are flushed into memory
0329   /// sink and the OffsetIndexBuilder has only collected the relative offset
0330   /// which requires adjustment once they are flushed to the file.
0331   ///
0332   /// \param final_position Final stream offset to add for page offset adjustment.
0333   virtual void Finish(int64_t final_position) = 0;
0334
0335   /// \brief Serialize the offset index thrift message.
0336   ///
0337   /// \param[out] sink output stream to write the serialized message.
0338   /// \param[in] encryptor encryptor to encrypt the serialized offset index.
       ///            Defaults to NULLPTR.
0339   virtual void WriteTo(::arrow::io::OutputStream* sink,
0340                        Encryptor* encryptor = NULLPTR) const = 0;
0341
0342   /// \brief Create an OffsetIndex directly.
0343   virtual std::unique_ptr<OffsetIndex> Build() const = 0;
0344 };
0345 
0346 /// \brief Interface for collecting page index of a parquet file.
0347 class PARQUET_EXPORT PageIndexBuilder {
0348  public:
0349   /// \brief API convenience to create a PageIndexBuilder.
       ///
       /// \param schema schema descriptor of the file being written.
       /// \param file_encryptor optional file encryptor; defaults to NULLPTR.
0350   static std::unique_ptr<PageIndexBuilder> Make(
0351       const SchemaDescriptor* schema, InternalFileEncryptor* file_encryptor = NULLPTR);
0352
0353   virtual ~PageIndexBuilder() = default;
0354
0355   /// \brief Start a new row group.
0356   virtual void AppendRowGroup() = 0;
0357
0358   /// \brief Get the ColumnIndexBuilder from column ordinal.
0359   ///
0360   /// \param i Column ordinal.
0361   /// \return ColumnIndexBuilder for the column. Its memory is owned by
0362   /// the PageIndexBuilder, so the caller must not delete it.
0363   virtual ColumnIndexBuilder* GetColumnIndexBuilder(int32_t i) = 0;
0364
0365   /// \brief Get the OffsetIndexBuilder from column ordinal.
0366   ///
0367   /// \param i Column ordinal.
0368   /// \return OffsetIndexBuilder for the column. Its memory is owned by
0369   /// the PageIndexBuilder, so the caller must not delete it.
0370   virtual OffsetIndexBuilder* GetOffsetIndexBuilder(int32_t i) = 0;
0371
0372   /// \brief Complete the page index builder; no more writes are allowed afterwards.
0373   virtual void Finish() = 0;
0374
0375   /// \brief Serialize the page index thrift message.
0376   ///
0377   /// Only valid column indexes and offset indexes are serialized and their locations
0378   /// are set.
0379   ///
0380   /// \param[out] sink The output stream to write the page index.
0381   /// \param[out] location The locations of all page indexes, relative to the start of sink.
0382   virtual void WriteTo(::arrow::io::OutputStream* sink,
0383                        PageIndexLocation* location) const = 0;
0384 };
0385 
0386 }  // namespace parquet