Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:53

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
#include <array>
#include <cstdint>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "parquet/platform.h"
#include "parquet/types.h"
0026 
0027 namespace parquet::geospatial {
0028 
/// \brief Upper limit on the number of coordinate dimensions a geospatial type
/// can carry (i.e., X, Y, Z, and M)
inline constexpr int kMaxDimensions{4};

/// \brief Quiet NaN sentinel marking a bound for which predicate pushdown cannot
/// be applied (e.g., because a writer did not provide bounds for a given dimension)
inline constexpr double kNaN{std::numeric_limits<double>::quiet_NaN()};
0036 
0037 /// \brief Structure represented encoded statistics to be written to and read from Parquet
0038 /// serialized metadata.
0039 ///
0040 /// See the Parquet Thrift definition and GeoStatistics for the specific definition
0041 /// of field values.
0042 struct PARQUET_EXPORT EncodedGeoStatistics {
0043   bool xy_bounds_present{false};
0044   double xmin{kNaN};
0045   double xmax{kNaN};
0046   double ymin{kNaN};
0047   double ymax{kNaN};
0048 
0049   bool z_bounds_present{false};
0050   double zmin{kNaN};
0051   double zmax{kNaN};
0052 
0053   bool m_bounds_present{false};
0054   double mmin{kNaN};
0055   double mmax{kNaN};
0056 
0057   bool geospatial_types_present() const { return !geospatial_types.empty(); }
0058   std::vector<int32_t> geospatial_types;
0059 };
0060 
0061 class GeoStatisticsImpl;
0062 
0063 /// \brief Base type for computing geospatial column statistics while writing a file
0064 /// or representing them when reading a file
0065 ///
0066 /// These statistics track the minimum and maximum value (omitting NaN values) of the
0067 /// four possible dimensions (X, Y, Z, and M) and the distinct set of geometry
0068 /// type/dimension combinations (e.g., point XY, linestring XYZM) present in the data.
0069 /// Any of these individual components may be "invalid": for example, when reading a
0070 /// Parquet file, information about individual components obtained from the column
0071 /// chunk metadata may have been missing or deemed unusable. Orthogonally,
0072 /// any of these individual components may be "empty": for example, when using
0073 /// GeoStatistics to accumulate bounds whilst writing, if all geometries in a column chunk
0074 /// are null, all ranges (X, Y, Z, and M) will be empty. If all geometries in a column
0075 /// chunk contain only XY coordinates (the most common case), the Z and M ranges will
0076 /// be empty but the X and Y ranges will contain finite bounds. Empty ranges are
0077 /// considered "valid" because they are known to represent exactly zero values (in
0078 /// contrast to an invalid range, whose contents is completely unknown). These concepts
0079 /// are all necessary for this object to accurately represent (1) accumulated or partially
0080 /// accumulated statistics during the writing process and (2) deserialized statistics read
0081 /// from the column chunk metadata during the reading process.
0082 ///
0083 /// EXPERIMENTAL
0084 class PARQUET_EXPORT GeoStatistics {
0085  public:
0086   GeoStatistics();
0087   explicit GeoStatistics(const EncodedGeoStatistics& encoded);
0088 
0089   ~GeoStatistics();
0090 
0091   /// \brief Return true if bounds, geometry types, and validity are identical
0092   bool Equals(const GeoStatistics& other) const;
0093 
0094   /// \brief Update these statistics based on previously calculated or decoded statistics
0095   ///
0096   /// Merging statistics with wraparound X values is not currently supported. Merging
0097   /// two GeoStatistics where one or both has a wraparound X range will result in these
0098   /// statistics having an X dimension marked as invalid.
0099   void Merge(const GeoStatistics& other);
0100 
0101   /// \brief Update these statistics based on values
0102   void Update(const ByteArray* values, int64_t num_values);
0103 
0104   /// \brief Update these statistics based on the non-null elements of values
0105   void UpdateSpaced(const ByteArray* values, const uint8_t* valid_bits,
0106                     int64_t valid_bits_offset, int64_t num_spaced_values,
0107                     int64_t num_values);
0108 
0109   /// \brief Update these statistics based on the non-null elements of values
0110   ///
0111   /// Currently, BinaryArray and LargeBinaryArray input is supported.
0112   void Update(const ::arrow::Array& values);
0113 
0114   /// \brief Return these statistics to an empty state
0115   void Reset();
0116 
0117   /// \brief Encode the statistics for serializing to Thrift
0118   ///
0119   /// If invalid WKB was encountered or if the statistics contain NaN
0120   /// for any reason, Encode() will return nullopt to indicate that
0121   /// statistics should not be written to thrift.
0122   std::optional<EncodedGeoStatistics> Encode() const;
0123 
0124   /// \brief Returns false if invalid WKB was encountered
0125   bool is_valid() const;
0126 
0127   /// \brief Reset existing statistics and populate them from previously-encoded ones
0128   void Decode(const EncodedGeoStatistics& encoded);
0129 
0130   /// \brief Minimum values in XYZM order
0131   ///
0132   /// For dimensions where dimension_valid() is false, the value will be NaN. For
0133   /// dimensions where dimension_empty() is true, the value will be +Inf.
0134   ///
0135   /// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this
0136   /// case, these bounds represent the union of the intervals [xmax, Inf] and [-Inf,
0137   /// xmin]. This implementation does not yet generate these types of bounds but they may
0138   /// be encountered in statistics when reading a Parquet file.
0139   std::array<double, kMaxDimensions> lower_bound() const;
0140 
0141   /// \brief Maximum values in XYZM order
0142   ///
0143   /// For dimensions where dimension_valid() is false, the value will be NaN. For
0144   /// dimensions where dimension_empty() is true, the value will be -Inf.
0145   ///
0146   /// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this
0147   /// case, these bounds represent the union of the intervals [xmax, Inf] and [-Inf,
0148   /// xmin]. This implementation does not yet generate these types of bounds but they may
0149   /// be encountered in statistics when reading a Parquet file.
0150   std::array<double, kMaxDimensions> upper_bound() const;
0151 
0152   /// \brief Dimension emptiness in XYZM order
0153   ///
0154   /// True for a given dimension if and only if zero non-NaN values were encountered
0155   /// in that dimension and dimension_valid() is true for that dimension.
0156   ///
0157   /// When calculating statistics, zero or more of these values may be true because
0158   /// this implementation calculates bounds for all dimensions; however, it may be
0159   /// true that zero coordinates were encountered in a given dimension. For example,
0160   /// dimension_empty() will return four true values if Update() was not called
0161   /// or if Update() was called with only null values. If Update() was provided
0162   /// one or more geometries with X and Y dimensions but not Z or M dimensions,
0163   /// dimension_empty() will return true, true, false, false.
0164   ///
0165   /// For statistics read from a Parquet file, dimension_empty() will always contain
0166   /// false values because there is no mechanism to communicate an empty interval
0167   /// in the Thrift metadata.
0168   std::array<bool, kMaxDimensions> dimension_empty() const;
0169 
0170   /// \brief Dimension validity (i.e. presence) in XYZM order
0171   ///
0172   /// When calculating statistics, this will always be true because this implementation
0173   /// calculates statistics for all dimensions. When reading a Parquet file, one or more
0174   /// of these values may be false because the file may not have provided bounds for all
0175   /// dimensions.
0176   ///
0177   /// See documentation for dimension_empty(), lower_bound(), and/or upper_bound() for the
0178   /// canonical values of those outputs for the dimensions where dimension_valid() is
0179   /// false.
0180   std::array<bool, kMaxDimensions> dimension_valid() const;
0181 
0182   /// \brief Return the geometry type codes
0183   ///
0184   /// This implementation always returns sorted output with no duplicates. When
0185   /// calculating statistics, a value will always be returned (although the returned
0186   /// vector may be empty if Update() was never called or was only called with null
0187   /// values). When reading a Parquet file, std::nullopt may be returned because
0188   /// the file may not have provided this information.
0189   std::optional<std::vector<int32_t>> geometry_types() const;
0190 
0191   /// \brief Return a string representation of these statistics
0192   std::string ToString() const;
0193 
0194  private:
0195   std::unique_ptr<GeoStatisticsImpl> impl_;
0196 };
0197 
0198 }  // namespace parquet::geospatial