|
|
|||
File indexing completed on 2026-04-17 08:28:53
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cstdint> 0021 #include <memory> 0022 #include <optional> 0023 0024 #include "parquet/platform.h" 0025 #include "parquet/types.h" 0026 0027 namespace parquet::geospatial { 0028 0029 /// \brief The maximum number of dimensions represented by a geospatial type 0030 /// (i.e., X, Y, Z, and M) 0031 inline constexpr int kMaxDimensions = 4; 0032 0033 /// \brief NaN, used to represent bounds for which predicate pushdown cannnot 0034 /// be applied (e.g., because a writer did not provide bounds for a given dimension) 0035 inline constexpr double kNaN = std::numeric_limits<double>::quiet_NaN(); 0036 0037 /// \brief Structure represented encoded statistics to be written to and read from Parquet 0038 /// serialized metadata. 0039 /// 0040 /// See the Parquet Thrift definition and GeoStatistics for the specific definition 0041 /// of field values. 0042 struct PARQUET_EXPORT EncodedGeoStatistics { 0043 bool xy_bounds_present{false}; 0044 double xmin{kNaN}; 0045 double xmax{kNaN}; 0046 double ymin{kNaN}; 0047 double ymax{kNaN}; 0048 0049 bool z_bounds_present{false}; 0050 double zmin{kNaN}; 0051 double zmax{kNaN}; 0052 0053 bool m_bounds_present{false}; 0054 double mmin{kNaN}; 0055 double mmax{kNaN}; 0056 0057 bool geospatial_types_present() const { return !geospatial_types.empty(); } 0058 std::vector<int32_t> geospatial_types; 0059 }; 0060 0061 class GeoStatisticsImpl; 0062 0063 /// \brief Base type for computing geospatial column statistics while writing a file 0064 /// or representing them when reading a file 0065 /// 0066 /// These statistics track the minimum and maximum value (omitting NaN values) of the 0067 /// four possible dimensions (X, Y, Z, and M) and the distinct set of geometry 0068 /// type/dimension combinations (e.g., point XY, linestring XYZM) present in the data. 0069 /// Any of these individual components may be "invalid": for example, when reading a 0070 /// Parquet file, information about individual components obtained from the column 0071 /// chunk metadata may have been missing or deemed unusable. Orthogonally, 0072 /// any of these individual components may be "empty": for example, when using 0073 /// GeoStatistics to accumulate bounds whilst writing, if all geometries in a column chunk 0074 /// are null, all ranges (X, Y, Z, and M) will be empty. If all geometries in a column 0075 /// chunk contain only XY coordinates (the most common case), the Z and M ranges will 0076 /// be empty but the X and Y ranges will contain finite bounds. Empty ranges are 0077 /// considered "valid" because they are known to represent exactly zero values (in 0078 /// contrast to an invalid range, whose contents is completely unknown). These concepts 0079 /// are all necessary for this object to accurately represent (1) accumulated or partially 0080 /// accumulated statistics during the writing process and (2) deserialized statistics read 0081 /// from the column chunk metadata during the reading process. 0082 /// 0083 /// EXPERIMENTAL 0084 class PARQUET_EXPORT GeoStatistics { 0085 public: 0086 GeoStatistics(); 0087 explicit GeoStatistics(const EncodedGeoStatistics& encoded); 0088 0089 ~GeoStatistics(); 0090 0091 /// \brief Return true if bounds, geometry types, and validity are identical 0092 bool Equals(const GeoStatistics& other) const; 0093 0094 /// \brief Update these statistics based on previously calculated or decoded statistics 0095 /// 0096 /// Merging statistics with wraparound X values is not currently supported. Merging 0097 /// two GeoStatistics where one or both has a wraparound X range will result in these 0098 /// statistics having an X dimension marked as invalid. 0099 void Merge(const GeoStatistics& other); 0100 0101 /// \brief Update these statistics based on values 0102 void Update(const ByteArray* values, int64_t num_values); 0103 0104 /// \brief Update these statistics based on the non-null elements of values 0105 void UpdateSpaced(const ByteArray* values, const uint8_t* valid_bits, 0106 int64_t valid_bits_offset, int64_t num_spaced_values, 0107 int64_t num_values); 0108 0109 /// \brief Update these statistics based on the non-null elements of values 0110 /// 0111 /// Currently, BinaryArray and LargeBinaryArray input is supported. 0112 void Update(const ::arrow::Array& values); 0113 0114 /// \brief Return these statistics to an empty state 0115 void Reset(); 0116 0117 /// \brief Encode the statistics for serializing to Thrift 0118 /// 0119 /// If invalid WKB was encountered or if the statistics contain NaN 0120 /// for any reason, Encode() will return nullopt to indicate that 0121 /// statistics should not be written to thrift. 0122 std::optional<EncodedGeoStatistics> Encode() const; 0123 0124 /// \brief Returns false if invalid WKB was encountered 0125 bool is_valid() const; 0126 0127 /// \brief Reset existing statistics and populate them from previously-encoded ones 0128 void Decode(const EncodedGeoStatistics& encoded); 0129 0130 /// \brief Minimum values in XYZM order 0131 /// 0132 /// For dimensions where dimension_valid() is false, the value will be NaN. For 0133 /// dimensions where dimension_empty() is true, the value will be +Inf. 0134 /// 0135 /// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this 0136 /// case, these bounds represent the union of the intervals [xmax, Inf] and [-Inf, 0137 /// xmin]. This implementation does not yet generate these types of bounds but they may 0138 /// be encountered in statistics when reading a Parquet file. 0139 std::array<double, kMaxDimensions> lower_bound() const; 0140 0141 /// \brief Maximum values in XYZM order 0142 /// 0143 /// For dimensions where dimension_valid() is false, the value will be NaN. For 0144 /// dimensions where dimension_empty() is true, the value will be -Inf. 0145 /// 0146 /// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this 0147 /// case, these bounds represent the union of the intervals [xmax, Inf] and [-Inf, 0148 /// xmin]. This implementation does not yet generate these types of bounds but they may 0149 /// be encountered in statistics when reading a Parquet file. 0150 std::array<double, kMaxDimensions> upper_bound() const; 0151 0152 /// \brief Dimension emptiness in XYZM order 0153 /// 0154 /// True for a given dimension if and only if zero non-NaN values were encountered 0155 /// in that dimension and dimension_valid() is true for that dimension. 0156 /// 0157 /// When calculating statistics, zero or more of these values may be true because 0158 /// this implementation calculates bounds for all dimensions; however, it may be 0159 /// true that zero coordinates were encountered in a given dimension. For example, 0160 /// dimension_empty() will return four true values if Update() was not called 0161 /// or if Update() was called with only null values. If Update() was provided 0162 /// one or more geometries with X and Y dimensions but not Z or M dimensions, 0163 /// dimension_empty() will return true, true, false, false. 0164 /// 0165 /// For statistics read from a Parquet file, dimension_empty() will always contain 0166 /// false values because there is no mechanism to communicate an empty interval 0167 /// in the Thrift metadata. 0168 std::array<bool, kMaxDimensions> dimension_empty() const; 0169 0170 /// \brief Dimension validity (i.e. presence) in XYZM order 0171 /// 0172 /// When calculating statistics, this will always be true because this implementation 0173 /// calculates statistics for all dimensions. When reading a Parquet file, one or more 0174 /// of these values may be false because the file may not have provided bounds for all 0175 /// dimensions. 0176 /// 0177 /// See documentation for dimension_empty(), lower_bound(), and/or upper_bound() for the 0178 /// canonical values of those outputs for the dimensions where dimension_valid() is 0179 /// false. 0180 std::array<bool, kMaxDimensions> dimension_valid() const; 0181 0182 /// \brief Return the geometry type codes 0183 /// 0184 /// This implementation always returns sorted output with no duplicates. When 0185 /// calculating statistics, a value will always be returned (although the returned 0186 /// vector may be empty if Update() was never called or was only called with null 0187 /// values). When reading a Parquet file, std::nullopt may be returned because 0188 /// the file may not have provided this information. 0189 std::optional<std::vector<int32_t>> geometry_types() const; 0190 0191 /// \brief Return a string representation of these statistics 0192 std::string ToString() const; 0193 0194 private: 0195 std::unique_ptr<GeoStatisticsImpl> impl_; 0196 }; 0197 0198 } // namespace parquet::geospatial
| [ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
|
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
|