|
|
|||
File indexing completed on 2026-04-17 08:28:55
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cstdint> 0021 #include <iosfwd> 0022 #include <optional> 0023 #include <vector> 0024 0025 #include "arrow/util/span.h" 0026 #include "parquet/platform.h" 0027 #include "parquet/type_fwd.h" 0028 0029 namespace parquet { 0030 0031 /// A structure for capturing metadata for estimating the unencoded, 0032 /// uncompressed size of data written. This is useful for readers to estimate 0033 /// how much memory is needed to reconstruct data in their memory model and for 0034 /// fine-grained filter push down on nested structures (the histograms contained 0035 /// in this structure can help determine the number of nulls at a particular 0036 /// nesting level and maximum length of lists). 0037 struct PARQUET_EXPORT SizeStatistics { 0038 /// When present, there is expected to be one element corresponding to each 0039 /// definition (i.e. size=max definition+1) where each element 0040 /// represents the number of times the definition level was observed in the 0041 /// data. 0042 /// 0043 /// This field may be omitted (a.k.a. zero-length vector) if max_definition_level 0044 /// is 0 without loss of information. 0045 std::vector<int64_t> definition_level_histogram; 0046 0047 /// Same as definition_level_histogram except for repetition levels. 0048 /// 0049 /// This field may be omitted (a.k.a. zero-length vector) if max_repetition_level 0050 /// is 0 without loss of information. 0051 std::vector<int64_t> repetition_level_histogram; 0052 0053 /// The number of physical bytes stored for BYTE_ARRAY data values assuming 0054 /// no encoding. This is exclusive of the bytes needed to store the length of 0055 /// each byte array. In other words, this field is equivalent to the `(size 0056 /// of PLAIN-ENCODING the byte array values) - (4 bytes * number of values 0057 /// written)`. To determine unencoded sizes of other types readers can use 0058 /// schema information multiplied by the number of non-null and null values. 0059 /// The number of null/non-null values can be inferred from the histograms 0060 /// below. 0061 /// 0062 /// For example, if a column chunk is dictionary-encoded with dictionary 0063 /// ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2], 0064 /// then this value for that data page should be 7 (1 + 1 + 2 + 3). 0065 /// 0066 /// This field should only be set for types that use BYTE_ARRAY as their 0067 /// physical type. 0068 std::optional<int64_t> unencoded_byte_array_data_bytes; 0069 0070 /// \brief Check if the SizeStatistics is set. 0071 bool is_set() const { 0072 return !repetition_level_histogram.empty() || !definition_level_histogram.empty() || 0073 unencoded_byte_array_data_bytes.has_value(); 0074 } 0075 0076 /// \brief Increment the unencoded byte array data bytes. 0077 void IncrementUnencodedByteArrayDataBytes(int64_t value); 0078 0079 /// \brief Merge two SizeStatistics. 0080 /// \throws ParquetException if SizeStatistics to merge is not compatible. 0081 void Merge(const SizeStatistics& other); 0082 0083 /// \brief Validate the SizeStatistics 0084 /// \throws ParquetException if the histograms don't have the right length, 0085 /// or if unencoded_byte_array_data_bytes is present for a non-BYTE_ARRAY column. 0086 void Validate(const ColumnDescriptor* descr) const; 0087 0088 /// \brief Reset the SizeStatistics to be empty. 0089 void Reset(); 0090 0091 /// \brief Make an empty SizeStatistics object for specific type. 0092 static std::unique_ptr<SizeStatistics> Make(const ColumnDescriptor* descr); 0093 }; 0094 0095 PARQUET_EXPORT 0096 std::ostream& operator<<(std::ostream&, const SizeStatistics&); 0097 0098 PARQUET_EXPORT 0099 void UpdateLevelHistogram(::arrow::util::span<const int16_t> levels, 0100 ::arrow::util::span<int64_t> histogram); 0101 0102 } // namespace parquet
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|