include/parquet/size_statistics.h

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017
0018 #pragma once
0019
#include <cstdint>
#include <iosfwd>
#include <memory>
#include <optional>
#include <vector>

#include "arrow/util/span.h"
#include "parquet/platform.h"
#include "parquet/type_fwd.h"
0028
0029 namespace parquet {
0030
0031 /// A structure for capturing metadata for estimating the unencoded,
0032 /// uncompressed size of data written. This is useful for readers to estimate
0033 /// how much memory is needed to reconstruct data in their memory model and for
0034 /// fine-grained filter push down on nested structures (the histograms contained
0035 /// in this structure can help determine the number of nulls at a particular
0036 /// nesting level and maximum length of lists).
0037 struct PARQUET_EXPORT SizeStatistics {
0038   /// When present, there is expected to be one element corresponding to each
0039   /// definition (i.e. size=max definition+1) where each element
0040   /// represents the number of times the definition level was observed in the
0041   /// data.
0042   ///
0043   /// This field may be omitted (a.k.a. zero-length vector) if max_definition_level
0044   /// is 0 without loss of information.
0045   std::vector<int64_t> definition_level_histogram;
0046
0047   /// Same as definition_level_histogram except for repetition levels.
0048   ///
0049   /// This field may be omitted (a.k.a. zero-length vector) if max_repetition_level
0050   /// is 0 without loss of information.
0051   std::vector<int64_t> repetition_level_histogram;
0052
0053   /// The number of physical bytes stored for BYTE_ARRAY data values assuming
0054   /// no encoding. This is exclusive of the bytes needed to store the length of
0055   /// each byte array. In other words, this field is equivalent to the `(size
0056   /// of PLAIN-ENCODING the byte array values) - (4 bytes * number of values
0057   /// written)`. To determine unencoded sizes of other types readers can use
0058   /// schema information multiplied by the number of non-null and null values.
0059   /// The number of null/non-null values can be inferred from the histograms
0060   /// below.
0061   ///
0062   /// For example, if a column chunk is dictionary-encoded with dictionary
0063   /// ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2],
0064   /// then this value for that data page should be 7 (1 + 1 + 2 + 3).
0065   ///
0066   /// This field should only be set for types that use BYTE_ARRAY as their
0067   /// physical type.
0068   std::optional<int64_t> unencoded_byte_array_data_bytes;
0069
0070   /// \brief Check if the SizeStatistics is set.
0071   bool is_set() const {
0072     return !repetition_level_histogram.empty() || !definition_level_histogram.empty() ||
0073            unencoded_byte_array_data_bytes.has_value();
0074   }
0075
0076   /// \brief Increment the unencoded byte array data bytes.
0077   void IncrementUnencodedByteArrayDataBytes(int64_t value);
0078
0079   /// \brief Merge two SizeStatistics.
0080   /// \throws ParquetException if SizeStatistics to merge is not compatible.
0081   void Merge(const SizeStatistics& other);
0082
0083   /// \brief Validate the SizeStatistics
0084   /// \throws ParquetException if the histograms don't have the right length,
0085   /// or if unencoded_byte_array_data_bytes is present for a non-BYTE_ARRAY column.
0086   void Validate(const ColumnDescriptor* descr) const;
0087
0088   /// \brief Reset the SizeStatistics to be empty.
0089   void Reset();
0090
0091   /// \brief Make an empty SizeStatistics object for specific type.
0092   static std::unique_ptr<SizeStatistics> Make(const ColumnDescriptor* descr);
0093 };
0094
0095 PARQUET_EXPORT
0096 std::ostream& operator<<(std::ostream&, const SizeStatistics&);
0097
0098 PARQUET_EXPORT
0099 void UpdateLevelHistogram(::arrow::util::span<const int16_t> levels,
0100                           ::arrow::util::span<int64_t> histogram);
0101
0102 }  // namespace parquet