|
|
|||
File indexing completed on 2026-04-17 08:28:54
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include "arrow/io/interfaces.h" 0021 #include "parquet/encryption/type_fwd.h" 0022 #include "parquet/type_fwd.h" 0023 #include "parquet/types.h" 0024 0025 #include <optional> 0026 #include <vector> 0027 0028 namespace parquet { 0029 0030 /// \brief ColumnIndex is a proxy around format::ColumnIndex. 0031 class PARQUET_EXPORT ColumnIndex { 0032 public: 0033 /// \brief Create a ColumnIndex from a serialized thrift message. 0034 static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr, 0035 const void* serialized_index, 0036 uint32_t index_len, 0037 const ReaderProperties& properties, 0038 Decryptor* decryptor = NULLPTR); 0039 0040 virtual ~ColumnIndex() = default; 0041 0042 /// \brief A bitmap with a bit set for each data page that has only null values. 0043 /// 0044 /// The length of this vector is equal to the number of data pages in the column. 0045 virtual const std::vector<bool>& null_pages() const = 0; 0046 0047 /// \brief A vector of encoded lower bounds for each data page in this column. 0048 /// 0049 /// `null_pages` should be inspected first, as only pages with non-null values 0050 /// may have their lower bounds populated. 0051 virtual const std::vector<std::string>& encoded_min_values() const = 0; 0052 0053 /// \brief A vector of encoded upper bounds for each data page in this column. 0054 /// 0055 /// `null_pages` should be inspected first, as only pages with non-null values 0056 /// may have their upper bounds populated. 0057 virtual const std::vector<std::string>& encoded_max_values() const = 0; 0058 0059 /// \brief The ordering of lower and upper bounds. 0060 /// 0061 /// The boundary order applies across all lower bounds, and all upper bounds, 0062 /// respectively. However, the order between lower bounds and upper bounds 0063 /// cannot be derived from this. 0064 virtual BoundaryOrder::type boundary_order() const = 0; 0065 0066 /// \brief Whether per-page null count information is available. 0067 virtual bool has_null_counts() const = 0; 0068 0069 /// \brief An optional vector with the number of null values in each data page. 0070 /// 0071 /// `has_null_counts` should be called first to determine if this information is 0072 /// available. 0073 virtual const std::vector<int64_t>& null_counts() const = 0; 0074 0075 /// \brief A vector of page indices for non-null pages. 0076 virtual const std::vector<int32_t>& non_null_page_indices() const = 0; 0077 0078 /// \brief Whether definition level histogram is available. 0079 virtual bool has_definition_level_histograms() const = 0; 0080 0081 /// \brief Whether repetition level histogram is available. 0082 virtual bool has_repetition_level_histograms() const = 0; 0083 0084 /// \brief List of definition level histograms for each page concatenated together. 0085 virtual const std::vector<int64_t>& definition_level_histograms() const = 0; 0086 0087 /// \brief List of repetition level histograms for each page concatenated together. 0088 virtual const std::vector<int64_t>& repetition_level_histograms() const = 0; 0089 }; 0090 0091 /// \brief Typed implementation of ColumnIndex. 0092 template <typename DType> 0093 class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex { 0094 public: 0095 using T = typename DType::c_type; 0096 0097 /// \brief A vector of lower bounds for each data page in this column. 0098 /// 0099 /// This is like `encoded_min_values`, but with the values decoded according to 0100 /// the column's physical type. 0101 /// `min_values` and `max_values` can be used together with `boundary_order` 0102 /// in order to prune some data pages when searching for specific values. 0103 virtual const std::vector<T>& min_values() const = 0; 0104 0105 /// \brief A vector of upper bounds for each data page in this column. 0106 /// 0107 /// Just like `min_values`, but for upper bounds instead of lower bounds. 0108 virtual const std::vector<T>& max_values() const = 0; 0109 }; 0110 0111 using BoolColumnIndex = TypedColumnIndex<BooleanType>; 0112 using Int32ColumnIndex = TypedColumnIndex<Int32Type>; 0113 using Int64ColumnIndex = TypedColumnIndex<Int64Type>; 0114 using FloatColumnIndex = TypedColumnIndex<FloatType>; 0115 using DoubleColumnIndex = TypedColumnIndex<DoubleType>; 0116 using ByteArrayColumnIndex = TypedColumnIndex<ByteArrayType>; 0117 using FLBAColumnIndex = TypedColumnIndex<FLBAType>; 0118 0119 /// \brief PageLocation is a proxy around format::PageLocation. 0120 struct PARQUET_EXPORT PageLocation { 0121 /// File offset of the data page. 0122 int64_t offset; 0123 /// Total compressed size of the data page and header. 0124 int32_t compressed_page_size; 0125 /// Row id of the first row in the page within the row group. 0126 int64_t first_row_index; 0127 }; 0128 0129 /// \brief OffsetIndex is a proxy around format::OffsetIndex. 0130 class PARQUET_EXPORT OffsetIndex { 0131 public: 0132 /// \brief Create a OffsetIndex from a serialized thrift message. 0133 static std::unique_ptr<OffsetIndex> Make(const void* serialized_index, 0134 uint32_t index_len, 0135 const ReaderProperties& properties, 0136 Decryptor* decryptor = NULLPTR); 0137 0138 virtual ~OffsetIndex() = default; 0139 0140 /// \brief A vector of locations for each data page in this column. 0141 virtual const std::vector<PageLocation>& page_locations() const = 0; 0142 0143 /// \brief A vector of unencoded/uncompressed size of each page for BYTE_ARRAY types, 0144 /// or empty for other types. 0145 virtual const std::vector<int64_t>& unencoded_byte_array_data_bytes() const = 0; 0146 }; 0147 0148 /// \brief Interface for reading the page index for a Parquet row group. 0149 class PARQUET_EXPORT RowGroupPageIndexReader { 0150 public: 0151 virtual ~RowGroupPageIndexReader() = default; 0152 0153 /// \brief Read column index of a column chunk. 0154 /// 0155 /// \param[in] i column ordinal of the column chunk. 0156 /// \returns column index of the column or nullptr if it does not exist. 0157 /// \throws ParquetException if the index is out of bound. 0158 virtual std::shared_ptr<ColumnIndex> GetColumnIndex(int32_t i) = 0; 0159 0160 /// \brief Read offset index of a column chunk. 0161 /// 0162 /// \param[in] i column ordinal of the column chunk. 0163 /// \returns offset index of the column or nullptr if it does not exist. 0164 /// \throws ParquetException if the index is out of bound. 0165 virtual std::shared_ptr<OffsetIndex> GetOffsetIndex(int32_t i) = 0; 0166 }; 0167 0168 struct PageIndexSelection { 0169 /// Specifies whether to read the column index. 0170 bool column_index = false; 0171 /// Specifies whether to read the offset index. 0172 bool offset_index = false; 0173 }; 0174 0175 PARQUET_EXPORT 0176 std::ostream& operator<<(std::ostream& out, const PageIndexSelection& params); 0177 0178 struct RowGroupIndexReadRange { 0179 /// Base start and total size of column index of all column chunks in a row group. 0180 /// If none of the column chunks have column index, it is set to std::nullopt. 0181 std::optional<::arrow::io::ReadRange> column_index = std::nullopt; 0182 /// Base start and total size of offset index of all column chunks in a row group. 0183 /// If none of the column chunks have offset index, it is set to std::nullopt. 0184 std::optional<::arrow::io::ReadRange> offset_index = std::nullopt; 0185 }; 0186 0187 /// \brief Interface for reading the page index for a Parquet file. 0188 class PARQUET_EXPORT PageIndexReader { 0189 public: 0190 virtual ~PageIndexReader() = default; 0191 0192 /// \brief Create a PageIndexReader instance. 0193 /// \returns a PageIndexReader instance. 0194 /// WARNING: The returned PageIndexReader references to all the input parameters, so 0195 /// it must not outlive all of the input parameters. Usually these input parameters 0196 /// come from the same ParquetFileReader object, so it must not outlive the reader 0197 /// that creates this PageIndexReader. 0198 static std::shared_ptr<PageIndexReader> Make( 0199 ::arrow::io::RandomAccessFile* input, std::shared_ptr<FileMetaData> file_metadata, 0200 const ReaderProperties& properties, 0201 InternalFileDecryptor* file_decryptor = NULLPTR); 0202 0203 /// \brief Get the page index reader of a specific row group. 0204 /// \param[in] i row group ordinal to get page index reader. 0205 /// \returns RowGroupPageIndexReader of the specified row group. A nullptr may or may 0206 /// not be returned if the page index for the row group is unavailable. It is 0207 /// the caller's responsibility to check the return value of follow-up calls 0208 /// to the RowGroupPageIndexReader. 0209 /// \throws ParquetException if the index is out of bound. 0210 virtual std::shared_ptr<RowGroupPageIndexReader> RowGroup(int i) = 0; 0211 0212 /// \brief Advise the reader which part of page index will be read later. 0213 /// 0214 /// The PageIndexReader can optionally prefetch and cache page index that 0215 /// may be read later to get better performance. 0216 /// 0217 /// The contract of this function is as below: 0218 /// 1) If WillNeed() has not been called for a specific row group and the page index 0219 /// exists, follow-up calls to get column index or offset index of all columns in 0220 /// this row group SHOULD NOT FAIL, but the performance may not be optimal. 0221 /// 2) If WillNeed() has been called for a specific row group, follow-up calls to get 0222 /// page index are limited to columns and index type requested by WillNeed(). 0223 /// So it MAY FAIL if columns that are not requested by WillNeed() are requested. 0224 /// 3) Later calls to WillNeed() MAY OVERRIDE previous calls of same row groups. 0225 /// For example, 0226 /// 1) If WillNeed() is not called for row group 0, then follow-up calls to read 0227 /// column index and/or offset index of all columns of row group 0 should not 0228 /// fail if its page index exists. 0229 /// 2) If WillNeed() is called for columns 0 and 1 for row group 0, then follow-up 0230 /// call to read page index of column 2 for row group 0 MAY FAIL even if its 0231 /// page index exists. 0232 /// 3) If WillNeed() is called for row group 0 with offset index only, then 0233 /// follow-up call to read column index of row group 0 MAY FAIL even if 0234 /// the column index of this column exists. 0235 /// 4) If WillNeed() is called for columns 0 and 1 for row group 0, then later 0236 /// call to WillNeed() for columns 1 and 2 for row group 0. The later one 0237 /// overrides previous call and only columns 1 and 2 of row group 0 are allowed 0238 /// to access. 0239 /// 0240 /// \param[in] row_group_indices list of row group ordinal to read page index later. 0241 /// \param[in] column_indices list of column ordinal to read page index later. If it is 0242 /// empty, it means all columns in the row group will be read. 0243 /// \param[in] selection which kind of page index is required later. 0244 virtual void WillNeed(const std::vector<int32_t>& row_group_indices, 0245 const std::vector<int32_t>& column_indices, 0246 const PageIndexSelection& selection) = 0; 0247 0248 /// \brief Advise the reader page index of these row groups will not be read anymore. 0249 /// 0250 /// The PageIndexReader implementation has the opportunity to cancel any prefetch or 0251 /// release resource that are related to these row groups. 0252 /// 0253 /// \param[in] row_group_indices list of row group ordinal that whose page index will 0254 /// not be accessed anymore. 0255 virtual void WillNotNeed(const std::vector<int32_t>& row_group_indices) = 0; 0256 0257 /// \brief Determine the column index and offset index ranges for the given row group. 0258 /// 0259 /// \param[in] row_group_metadata row group metadata to get column chunk metadata. 0260 /// \param[in] columns list of column ordinals to get page index. If the list is empty, 0261 /// it means all columns in the row group. 0262 /// \returns RowGroupIndexReadRange of the specified row group. Throws ParquetException 0263 /// if the selected column ordinal is out of bound or metadata of page index 0264 /// is corrupted. 0265 static RowGroupIndexReadRange DeterminePageIndexRangesInRowGroup( 0266 const RowGroupMetaData& row_group_metadata, const std::vector<int32_t>& columns); 0267 }; 0268 0269 /// \brief Interface for collecting column index of data pages in a column chunk. 0270 class PARQUET_EXPORT ColumnIndexBuilder { 0271 public: 0272 /// \brief API convenience to create a ColumnIndexBuilder. 0273 static std::unique_ptr<ColumnIndexBuilder> Make(const ColumnDescriptor* descr); 0274 0275 virtual ~ColumnIndexBuilder() = default; 0276 0277 /// \brief Add statistics of a data page. 0278 /// 0279 /// If the ColumnIndexBuilder has seen any corrupted statistics, it will 0280 /// not update statistics anymore. 0281 /// 0282 /// \param stats Page statistics in the encoded form. 0283 /// \param size_stats Size statistics of the page if available. 0284 virtual void AddPage(const EncodedStatistics& stats, 0285 const SizeStatistics& size_stats) = 0; 0286 0287 /// \brief Complete the column index. 0288 /// 0289 /// Once called, AddPage() can no longer be called. 0290 /// WriteTo() and Build() can only called after Finish() has been called. 0291 virtual void Finish() = 0; 0292 0293 /// \brief Serialize the column index thrift message. 0294 /// 0295 /// If the ColumnIndexBuilder has seen any corrupted statistics, it will 0296 /// not write any data to the sink. 0297 /// 0298 /// \param[out] sink output stream to write the serialized message. 0299 /// \param[in] encryptor encryptor to encrypt the serialized column index. 0300 virtual void WriteTo(::arrow::io::OutputStream* sink, 0301 Encryptor* encryptor = NULLPTR) const = 0; 0302 0303 /// \brief Create a ColumnIndex directly. 0304 /// 0305 /// \return If the ColumnIndexBuilder has seen any corrupted statistics, it simply 0306 /// returns nullptr. Otherwise the column index is built and returned. 0307 virtual std::unique_ptr<ColumnIndex> Build() const = 0; 0308 }; 0309 0310 /// \brief Interface for collecting offset index of data pages in a column chunk. 0311 class PARQUET_EXPORT OffsetIndexBuilder { 0312 public: 0313 /// \brief API convenience to create a OffsetIndexBuilder. 0314 static std::unique_ptr<OffsetIndexBuilder> Make(); 0315 0316 virtual ~OffsetIndexBuilder() = default; 0317 0318 /// \brief Add page location and size stats of a data page. 0319 virtual void AddPage(int64_t offset, int32_t compressed_page_size, 0320 int64_t first_row_index, 0321 std::optional<int64_t> unencoded_byte_array_length = {}) = 0; 0322 0323 /// \brief Add page location and size stats of a data page. 0324 void AddPage(const PageLocation& page_location, const SizeStatistics& size_stats); 0325 0326 /// \brief Complete the offset index. 0327 /// 0328 /// In the buffered row group mode, data pages are flushed into memory 0329 /// sink and the OffsetIndexBuilder has only collected the relative offset 0330 /// which requires adjustment once they are flushed to the file. 0331 /// 0332 /// \param final_position Final stream offset to add for page offset adjustment. 0333 virtual void Finish(int64_t final_position) = 0; 0334 0335 /// \brief Serialize the offset index thrift message. 0336 /// 0337 /// \param[out] sink output stream to write the serialized message. 0338 /// \param[in] encryptor encryptor to encrypt the serialized offset index. 0339 virtual void WriteTo(::arrow::io::OutputStream* sink, 0340 Encryptor* encryptor = NULLPTR) const = 0; 0341 0342 /// \brief Create an OffsetIndex directly. 0343 virtual std::unique_ptr<OffsetIndex> Build() const = 0; 0344 }; 0345 0346 /// \brief Interface for collecting page index of a parquet file. 0347 class PARQUET_EXPORT PageIndexBuilder { 0348 public: 0349 /// \brief API convenience to create a PageIndexBuilder. 0350 static std::unique_ptr<PageIndexBuilder> Make( 0351 const SchemaDescriptor* schema, InternalFileEncryptor* file_encryptor = NULLPTR); 0352 0353 virtual ~PageIndexBuilder() = default; 0354 0355 /// \brief Start a new row group. 0356 virtual void AppendRowGroup() = 0; 0357 0358 /// \brief Get the ColumnIndexBuilder from column ordinal. 0359 /// 0360 /// \param i Column ordinal. 0361 /// \return ColumnIndexBuilder for the column and its memory ownership belongs to 0362 /// the PageIndexBuilder. 0363 virtual ColumnIndexBuilder* GetColumnIndexBuilder(int32_t i) = 0; 0364 0365 /// \brief Get the OffsetIndexBuilder from column ordinal. 0366 /// 0367 /// \param i Column ordinal. 0368 /// \return OffsetIndexBuilder for the column and its memory ownership belongs to 0369 /// the PageIndexBuilder. 0370 virtual OffsetIndexBuilder* GetOffsetIndexBuilder(int32_t i) = 0; 0371 0372 /// \brief Complete the page index builder and no more write is allowed. 0373 virtual void Finish() = 0; 0374 0375 /// \brief Serialize the page index thrift message. 0376 /// 0377 /// Only valid column indexes and offset indexes are serialized and their locations 0378 /// are set. 0379 /// 0380 /// \param[out] sink The output stream to write the page index. 0381 /// \param[out] location The location of all page index to the start of sink. 0382 virtual void WriteTo(::arrow::io::OutputStream* sink, 0383 PageIndexLocation* location) const = 0; 0384 }; 0385 0386 } // namespace parquet
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|