Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:55

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <algorithm>
0021 #include <cstddef>
0022 #include <cstdint>
0023 #include <memory>
0024 #include <string>
0025 #include <utility>
0026 
0027 #include "parquet/platform.h"
0028 #include "parquet/types.h"
0029 
0030 namespace arrow {
0031 
0032 class Array;
0033 class BinaryArray;
0034 
0035 }  // namespace arrow
0036 
0037 namespace parquet {
0038 
0039 class ColumnDescriptor;
0040 
0041 // ----------------------------------------------------------------------
0042 // Value comparator interfaces
0043 
0044 /// \brief Base class for value comparators. Generally used with
0045 /// TypedComparator<T>
0046 class PARQUET_EXPORT Comparator {
0047  public:
0048   virtual ~Comparator() {}
0049 
0050   /// \brief Create a comparator explicitly from physical type and
0051   /// sort order
0052   /// \param[in] physical_type the physical type for the typed
0053   /// comparator
0054   /// \param[in] sort_order either SortOrder::SIGNED or
0055   /// SortOrder::UNSIGNED
0056   /// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only
0057   static std::shared_ptr<Comparator> Make(Type::type physical_type,
0058                                           SortOrder::type sort_order,
0059                                           int type_length = -1);
0060 
0061   /// \brief Create typed comparator inferring default sort order from
0062   /// ColumnDescriptor
0063   /// \param[in] descr the Parquet column schema
0064   static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr);
0065 };
0066 
0067 /// \brief Interface for comparison of physical types according to the
0068 /// semantics of a particular logical type.
0069 template <typename DType>
0070 class TypedComparator : public Comparator {
0071  public:
0072   using T = typename DType::c_type;
0073 
0074   /// \brief Scalar comparison of two elements, return true if first
0075   /// is strictly less than the second
0076   virtual bool Compare(const T& a, const T& b) const = 0;
0077 
0078   /// \brief Compute maximum and minimum elements in a batch of
0079   /// elements without any nulls
0080   virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) const = 0;
0081 
0082   /// \brief Compute minimum and maximum elements from an Arrow array. Only
0083   /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY
0084   /// / arrow::BinaryArray
0085   virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) const = 0;
0086 
0087   /// \brief Compute maximum and minimum elements in a batch of
0088   /// elements with accompanying bitmap indicating which elements are
0089   /// included (bit set) and excluded (bit not set)
0090   ///
0091   /// \param[in] values the sequence of values
0092   /// \param[in] length the length of the sequence
0093   /// \param[in] valid_bits a bitmap indicating which elements are
0094   /// included (1) or excluded (0)
0095   /// \param[in] valid_bits_offset the bit offset into the bitmap of
0096   /// the first element in the sequence
0097   virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
0098                                           const uint8_t* valid_bits,
0099                                           int64_t valid_bits_offset) const = 0;
0100 };
0101 
0102 /// \brief Typed version of Comparator::Make
0103 template <typename DType>
0104 std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type,
0105                                                        SortOrder::type sort_order,
0106                                                        int type_length = -1) {
0107   return std::static_pointer_cast<TypedComparator<DType>>(
0108       Comparator::Make(physical_type, sort_order, type_length));
0109 }
0110 
0111 /// \brief Typed version of Comparator::Make
0112 template <typename DType>
0113 std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) {
0114   return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
0115 }
0116 
0117 // ----------------------------------------------------------------------
0118 
/// \brief Structure representing encoded statistics to be written to
/// and read from Parquet serialized metadata.
///
/// The encoded min/max strings are private and reached through min()/max()
/// and set_min()/set_max(); the counts, the `has_*` presence flags and the
/// exactness flags are plain public members.
class PARQUET_EXPORT EncodedStatistics {
  std::string max_, min_;
  bool is_signed_ = false;

 public:
  EncodedStatistics() = default;

  /// \brief The encoded maximum value (empty when cleared or never set)
  const std::string& max() const { return max_; }
  /// \brief The encoded minimum value (empty when cleared or never set)
  const std::string& min() const { return min_; }

  // Exactness flags for the stored max/min; std::nullopt means "not set".
  std::optional<bool> is_max_value_exact;
  std::optional<bool> is_min_value_exact;

  int64_t null_count = 0;
  int64_t distinct_count = 0;

  // Presence flags: each one is raised by the matching set_*() call.
  bool has_min = false;
  bool has_max = false;
  bool has_null_count = false;
  bool has_distinct_count = false;

  // When all values in the statistics are null, it is set to true.
  // Otherwise, at least one value is not null, or we are not sure at all.
  // Page index requires this information to decide whether a data page
  // is a null page or not.
  bool all_null_value = false;

  // From parquet-mr
  // Don't write stats larger than the max size rather than truncating. The
  // rationale is that some engines may use the minimum value in the page as
  // the true minimum for aggregations and there is no way to mark that a
  // value has been truncated and is a lower bound and not in the page.
  //
  // A min or max whose encoded length exceeds `length` is dropped entirely:
  // its presence flag is lowered, the string is emptied and the matching
  // exactness flag is reset. Min and max are checked independently.
  void ApplyStatSizeLimits(size_t length) {
    if (max_.length() > length) {
      has_max = false;
      max_.clear();
      is_max_value_exact = std::nullopt;
    }
    if (min_.length() > length) {
      has_min = false;
      min_.clear();
      is_min_value_exact = std::nullopt;
    }
  }

  // Clear Min Max.
  // The exactness flags are reset as well, mirroring ApplyStatSizeLimits():
  // a flag describing a value that is no longer stored would be stale.
  void ClearMinMax() {
    has_max = false;
    max_.clear();
    is_max_value_exact = std::nullopt;
    has_min = false;
    min_.clear();
    is_min_value_exact = std::nullopt;
  }

  /// \brief Return true if at least one of min, max, null count or distinct
  /// count is present
  bool is_set() const {
    return has_min || has_max || has_null_count || has_distinct_count;
  }

  bool is_signed() const { return is_signed_; }

  void set_is_signed(bool is_signed) { is_signed_ = is_signed; }

  /// \brief Store the encoded maximum and mark it as present
  /// \return *this, so that setters can be chained
  EncodedStatistics& set_max(std::string value) {
    max_ = std::move(value);
    has_max = true;
    return *this;
  }

  /// \brief Store the encoded minimum and mark it as present
  /// \return *this, so that setters can be chained
  EncodedStatistics& set_min(std::string value) {
    min_ = std::move(value);
    has_min = true;
    return *this;
  }

  /// \brief Store the null count and mark it as present
  /// \return *this, so that setters can be chained
  EncodedStatistics& set_null_count(int64_t value) {
    null_count = value;
    has_null_count = true;
    return *this;
  }

  /// \brief Store the distinct count and mark it as present
  /// \return *this, so that setters can be chained
  EncodedStatistics& set_distinct_count(int64_t value) {
    distinct_count = value;
    has_distinct_count = true;
    return *this;
  }
};
0206 
0207 /// \brief Base type for computing column statistics while writing a file
0208 class PARQUET_EXPORT Statistics {
0209  public:
0210   virtual ~Statistics() {}
0211 
0212   /// \brief Create a new statistics instance given a column schema
0213   /// definition
0214   /// \param[in] descr the column schema
0215   /// \param[in] pool a memory pool to use for any memory allocations, optional
0216   static std::shared_ptr<Statistics> Make(
0217       const ColumnDescriptor* descr,
0218       ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
0219 
0220   /// \brief Create a new statistics instance given a column schema
0221   /// definition and preexisting state
0222   /// \param[in] descr the column schema
0223   /// \param[in] encoded_min the encoded minimum value
0224   /// \param[in] encoded_max the encoded maximum value
0225   /// \param[in] num_values total number of values
0226   /// \param[in] null_count number of null values
0227   /// \param[in] distinct_count number of distinct values
0228   /// \param[in] has_min_max whether the min/max statistics are set
0229   /// \param[in] has_null_count whether the null_count statistics are set
0230   /// \param[in] has_distinct_count whether the distinct_count statistics are set
0231   /// \param[in] pool a memory pool to use for any memory allocations, optional
0232   static std::shared_ptr<Statistics> Make(
0233       const ColumnDescriptor* descr, const std::string& encoded_min,
0234       const std::string& encoded_max, int64_t num_values, int64_t null_count,
0235       int64_t distinct_count, bool has_min_max, bool has_null_count,
0236       bool has_distinct_count,
0237       ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
0238 
0239   /// \brief Create a new statistics instance given a column schema
0240   /// definition and preexisting state
0241   /// \param[in] descr the column schema
0242   /// \param[in] encoded_min the encoded minimum value
0243   /// \param[in] encoded_max the encoded maximum value
0244   /// \param[in] num_values total number of values
0245   /// \param[in] null_count number of null values
0246   /// \param[in] distinct_count number of distinct values
0247   /// \param[in] has_min_max whether the min/max statistics are set
0248   /// \param[in] has_null_count whether the null_count statistics are set
0249   /// \param[in] has_distinct_count whether the distinct_count statistics are set
0250   /// \param[in] is_min_value_exact whether the min value is exact
0251   /// \param[in] is_max_value_exact whether the max value is exact
0252   /// \param[in] pool a memory pool to use for any memory allocations, optional
0253   static std::shared_ptr<Statistics> Make(
0254       const ColumnDescriptor* descr, const std::string& encoded_min,
0255       const std::string& encoded_max, int64_t num_values, int64_t null_count,
0256       int64_t distinct_count, bool has_min_max, bool has_null_count,
0257       bool has_distinct_count, std::optional<bool> is_min_value_exact,
0258       std::optional<bool> is_max_value_exact,
0259       ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
0260 
0261   // Helper function to convert EncodedStatistics to Statistics.
0262   // EncodedStatistics does not contain number of non-null values, and it can be
0263   // passed using the num_values parameter.
0264   static std::shared_ptr<Statistics> Make(
0265       const ColumnDescriptor* descr, const EncodedStatistics* encoded_statistics,
0266       int64_t num_values = -1,
0267       ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
0268 
0269   /// \brief Return true if the count of null values is set
0270   virtual bool HasNullCount() const = 0;
0271 
0272   /// \brief The number of null values, may not be set
0273   virtual int64_t null_count() const = 0;
0274 
0275   /// \brief Return true if the count of distinct values is set
0276   virtual bool HasDistinctCount() const = 0;
0277 
0278   /// \brief The number of distinct values, may not be set
0279   virtual int64_t distinct_count() const = 0;
0280 
0281   /// \brief The number of non-null values in the column
0282   virtual int64_t num_values() const = 0;
0283 
0284   /// \brief Return true if both min and max statistics are set. Obtain
0285   /// with TypedStatistics<T>::min and max
0286   virtual bool HasMinMax() const = 0;
0287 
0288   /// \brief Reset state of object to initial (no data observed) state
0289   virtual void Reset() = 0;
0290 
0291   /// \brief Plain-encoded minimum value
0292   virtual std::string EncodeMin() const = 0;
0293 
0294   /// \brief Plain-encoded maximum value
0295   virtual std::string EncodeMax() const = 0;
0296 
0297   /// \brief Return the minimum value exact flag if set.
0298   /// It will be true if there was no truncation.
0299   virtual std::optional<bool> is_min_value_exact() const = 0;
0300 
0301   /// \brief Return the maximum value exact flag if set.
0302   /// It will be true if there was no truncation.
0303   virtual std::optional<bool> is_max_value_exact() const = 0;
0304 
0305   /// \brief The finalized encoded form of the statistics for transport
0306   virtual EncodedStatistics Encode() = 0;
0307 
0308   /// \brief The physical type of the column schema
0309   virtual Type::type physical_type() const = 0;
0310 
0311   /// \brief The full type descriptor from the column schema
0312   virtual const ColumnDescriptor* descr() const = 0;
0313 
0314   /// \brief Check two Statistics for equality
0315   virtual bool Equals(const Statistics& other) const = 0;
0316 
0317  protected:
0318   static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min,
0319                                           const void* max, int64_t num_values,
0320                                           int64_t null_count, int64_t distinct_count);
0321 };
0322 
0323 /// \brief A typed implementation of Statistics
0324 template <typename DType>
0325 class TypedStatistics : public Statistics {
0326  public:
0327   using T = typename DType::c_type;
0328 
0329   /// \brief The current minimum value
0330   virtual const T& min() const = 0;
0331 
0332   /// \brief The current maximum value
0333   virtual const T& max() const = 0;
0334 
0335   /// \brief Update state with state of another Statistics object
0336   virtual void Merge(const TypedStatistics<DType>& other) = 0;
0337 
0338   /// \brief Batch statistics update
0339   virtual void Update(const T* values, int64_t num_values, int64_t null_count) = 0;
0340 
0341   /// \brief Batch statistics update with supplied validity bitmap
0342   /// \param[in] values pointer to column values
0343   /// \param[in] valid_bits Pointer to bitmap representing if values are non-null.
0344   /// \param[in] valid_bits_offset Offset offset into valid_bits where the slice of
0345   ///                              data begins.
0346   /// \param[in] num_spaced_values The length of values in values/valid_bits to inspect
0347   ///                              when calculating statistics. This can be smaller than
0348   ///                              num_values+null_count as null_count can include nulls
0349   ///                              from parents while num_spaced_values does not.
0350   /// \param[in] num_values Number of values that are not null.
0351   /// \param[in] null_count Number of values that are null.
0352   virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
0353                             int64_t valid_bits_offset, int64_t num_spaced_values,
0354                             int64_t num_values, int64_t null_count) = 0;
0355 
0356   /// \brief EXPERIMENTAL: Update statistics with an Arrow array without
0357   /// conversion to a primitive Parquet C type. Only implemented for certain
0358   /// Parquet type / Arrow type combinations like BYTE_ARRAY /
0359   /// arrow::BinaryArray
0360   ///
0361   /// If update_counts is true then the null_count and num_values will be updated
0362   /// based on the null_count of values.  Set to false if these are updated
0363   /// elsewhere (e.g. when updating a dictionary where the counts are taken from
0364   /// the indices and not the values)
0365   virtual void Update(const ::arrow::Array& values, bool update_counts = true) = 0;
0366 
0367   /// \brief Set min and max values to particular values
0368   virtual void SetMinMax(const T& min, const T& max) = 0;
0369 
0370   /// \brief Increments the null count directly
0371   /// Use Update to extract the null count from data.  Use this if you determine
0372   /// the null count through some other means (e.g. dictionary arrays where the
0373   /// null count is determined from the indices)
0374   virtual void IncrementNullCount(int64_t n) = 0;
0375 
0376   /// \brief Increments the number of values directly
0377   /// The same note on IncrementNullCount applies here
0378   virtual void IncrementNumValues(int64_t n) = 0;
0379 };
0380 
0381 using BoolStatistics = TypedStatistics<BooleanType>;
0382 using Int32Statistics = TypedStatistics<Int32Type>;
0383 using Int64Statistics = TypedStatistics<Int64Type>;
0384 using FloatStatistics = TypedStatistics<FloatType>;
0385 using DoubleStatistics = TypedStatistics<DoubleType>;
0386 using ByteArrayStatistics = TypedStatistics<ByteArrayType>;
0387 using FLBAStatistics = TypedStatistics<FLBAType>;
0388 
0389 /// \brief Typed version of Statistics::Make
0390 template <typename DType>
0391 std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
0392     const ColumnDescriptor* descr,
0393     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
0394   return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool));
0395 }
0396 
0397 /// \brief Create Statistics initialized to a particular state
0398 /// \param[in] min the minimum value
0399 /// \param[in] max the minimum value
0400 /// \param[in] num_values number of values
0401 /// \param[in] null_count number of null values
0402 /// \param[in] distinct_count number of distinct values
0403 template <typename DType>
0404 std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min,
0405                                                        const typename DType::c_type& max,
0406                                                        int64_t num_values,
0407                                                        int64_t null_count,
0408                                                        int64_t distinct_count) {
0409   return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
0410       DType::type_num, &min, &max, num_values, null_count, distinct_count));
0411 }
0412 
0413 /// \brief Typed version of Statistics::Make
0414 template <typename DType>
0415 std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
0416     const ColumnDescriptor* descr, const std::string& encoded_min,
0417     const std::string& encoded_max, int64_t num_values, int64_t null_count,
0418     int64_t distinct_count, bool has_min_max, bool has_null_count,
0419     bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
0420   return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
0421       descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
0422       has_min_max, has_null_count, has_distinct_count,
0423       /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool));
0424 }
0425 
0426 /// \brief Typed version of Statistics::Make
0427 template <typename DType>
0428 std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
0429     const ColumnDescriptor* descr, const std::string& encoded_min,
0430     const std::string& encoded_max, int64_t num_values, int64_t null_count,
0431     int64_t distinct_count, bool has_min_max, bool has_null_count,
0432     bool has_distinct_count, std::optional<bool> is_min_value_exact,
0433     std::optional<bool> is_max_value_exact,
0434     ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
0435   return std::static_pointer_cast<TypedStatistics<DType>>(
0436       Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count,
0437                        distinct_count, has_min_max, has_null_count, has_distinct_count,
0438                        is_min_value_exact, is_max_value_exact, pool));
0439 }
0440 
0441 }  // namespace parquet