Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:53

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 // This module defines an abstract interface for iterating through pages in a
0019 // Parquet column chunk within a row group. It could be extended in the future
0020 // to iterate through all data pages in all chunks in a file.
0021 
0022 #pragma once
0023 
0024 #include <cstdint>
0025 #include <memory>
0026 #include <optional>
0027 #include <string>
0028 
0029 #include "parquet/size_statistics.h"
0030 #include "parquet/statistics.h"
0031 #include "parquet/types.h"
0032 
0033 namespace parquet {
0034 
0035 // TODO: Parallel processing is not yet safe because of memory-ownership
0036 // semantics (the PageReader may or may not own the memory referenced by a
0037 // page)
0038 //
0039 // TODO(wesm): In the future Parquet implementations may store the crc code
0040 // in format::PageHeader. parquet-mr currently does not, so we also skip it
0041 // here, both on the read and write path
0042 class Page {
0043  public:
0044   Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
0045       : buffer_(buffer), type_(type) {}
0046 
0047   PageType::type type() const { return type_; }
0048 
0049   std::shared_ptr<Buffer> buffer() const { return buffer_; }
0050 
0051   // @returns: a pointer to the page's data
0052   const uint8_t* data() const { return buffer_->data(); }
0053 
0054   // @returns: the total size in bytes of the page's data buffer
0055   int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
0056 
0057  private:
0058   std::shared_ptr<Buffer> buffer_;
0059   PageType::type type_;
0060 };
0061 
0062 /// \brief Base type for DataPageV1 and DataPageV2 including common attributes
0063 class DataPage : public Page {
0064  public:
0065   int32_t num_values() const { return num_values_; }
0066   Encoding::type encoding() const { return encoding_; }
0067   int64_t uncompressed_size() const { return uncompressed_size_; }
0068   const EncodedStatistics& statistics() const { return statistics_; }
0069   /// Return the row ordinal within the row group to the first row in the data page.
0070   /// Currently it is only present from data pages created by ColumnWriter in order
0071   /// to collect page index.
0072   std::optional<int64_t> first_row_index() const { return first_row_index_; }
0073   const SizeStatistics& size_statistics() const { return size_statistics_; }
0074 
0075   virtual ~DataPage() = default;
0076 
0077  protected:
0078   DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
0079            Encoding::type encoding, int64_t uncompressed_size,
0080            EncodedStatistics statistics, std::optional<int64_t> first_row_index,
0081            SizeStatistics size_statistics)
0082       : Page(buffer, type),
0083         num_values_(num_values),
0084         encoding_(encoding),
0085         uncompressed_size_(uncompressed_size),
0086         statistics_(std::move(statistics)),
0087         first_row_index_(std::move(first_row_index)),
0088         size_statistics_(std::move(size_statistics)) {}
0089 
0090   int32_t num_values_;
0091   Encoding::type encoding_;
0092   int64_t uncompressed_size_;
0093   EncodedStatistics statistics_;
0094   /// Row ordinal within the row group to the first row in the data page.
0095   std::optional<int64_t> first_row_index_;
0096   SizeStatistics size_statistics_;
0097 };
0098 
0099 class DataPageV1 : public DataPage {
0100  public:
0101   DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
0102              Encoding::type encoding, Encoding::type definition_level_encoding,
0103              Encoding::type repetition_level_encoding, int64_t uncompressed_size,
0104              EncodedStatistics statistics = EncodedStatistics(),
0105              std::optional<int64_t> first_row_index = std::nullopt,
0106              SizeStatistics size_statistics = SizeStatistics())
0107       : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
0108                  std::move(statistics), std::move(first_row_index),
0109                  std::move(size_statistics)),
0110         definition_level_encoding_(definition_level_encoding),
0111         repetition_level_encoding_(repetition_level_encoding) {}
0112 
0113   Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
0114 
0115   Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
0116 
0117  private:
0118   Encoding::type definition_level_encoding_;
0119   Encoding::type repetition_level_encoding_;
0120 };
0121 
0122 class DataPageV2 : public DataPage {
0123  public:
0124   DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
0125              int32_t num_rows, Encoding::type encoding,
0126              int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
0127              int64_t uncompressed_size, bool is_compressed = false,
0128              EncodedStatistics statistics = EncodedStatistics(),
0129              std::optional<int64_t> first_row_index = std::nullopt,
0130              SizeStatistics size_statistics = SizeStatistics())
0131       : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
0132                  std::move(statistics), std::move(first_row_index),
0133                  std::move(size_statistics)),
0134         num_nulls_(num_nulls),
0135         num_rows_(num_rows),
0136         definition_levels_byte_length_(definition_levels_byte_length),
0137         repetition_levels_byte_length_(repetition_levels_byte_length),
0138         is_compressed_(is_compressed) {}
0139 
0140   int32_t num_nulls() const { return num_nulls_; }
0141 
0142   int32_t num_rows() const { return num_rows_; }
0143 
0144   int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
0145 
0146   int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
0147 
0148   bool is_compressed() const { return is_compressed_; }
0149 
0150  private:
0151   int32_t num_nulls_;
0152   int32_t num_rows_;
0153   int32_t definition_levels_byte_length_;
0154   int32_t repetition_levels_byte_length_;
0155   bool is_compressed_;
0156 };
0157 
0158 class DictionaryPage : public Page {
0159  public:
0160   DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
0161                  Encoding::type encoding, bool is_sorted = false)
0162       : Page(buffer, PageType::DICTIONARY_PAGE),
0163         num_values_(num_values),
0164         encoding_(encoding),
0165         is_sorted_(is_sorted) {}
0166 
0167   int32_t num_values() const { return num_values_; }
0168 
0169   Encoding::type encoding() const { return encoding_; }
0170 
0171   bool is_sorted() const { return is_sorted_; }
0172 
0173  private:
0174   int32_t num_values_;
0175   Encoding::type encoding_;
0176   bool is_sorted_;
0177 };
0178 
0179 }  // namespace parquet