Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:55

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 // This module contains the logical parquet-cpp types (independent of Thrift
0019 // structures), schema nodes, and related type tools
0020 
0021 #pragma once
0022 
0023 #include <cstdint>
0024 #include <memory>
0025 #include <ostream>
0026 #include <string>
0027 #include <unordered_map>
0028 #include <utility>
0029 #include <vector>
0030 
0031 #include "parquet/platform.h"
0032 #include "parquet/types.h"
0033 #include "parquet/windows_fixup.h"  // for OPTIONAL
0034 
0035 namespace parquet {
0036 
0037 class SchemaDescriptor;
0038 
0039 namespace schema {
0040 
0041 class Node;
0042 
// List encodings: using the terminology from Impala to define different styles
// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
// the converted type named in the Parquet metadata is ConvertedType::LIST we
// use that terminology here. It also helps distinguish from the *_ARRAY
// primitive types.
//
// One-level encoding: Only allows required lists with required cells
//   repeated value_type name
//
// Two-level encoding: Enables optional lists with only required cells
//   <required/optional> group list
//     repeated value_type item
//
// Three-level encoding: Enables optional lists with optional cells
//   <required/optional> group bag
//     repeated group list
//       <required/optional> value_type item
//
// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
// the non-repeated nodes set to required.
//
// The "official" encoding recommended in the Parquet spec is the 3-level, and
// we use that as the default when creating list types. For semantic completeness
// we allow the other two. Since all types of encodings will occur "in the
// wild" we need to be able to interpret the associated definition levels in
// the context of the actual encoding used in the file.
//
// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
// SchemaElement, which could make things challenging if we are trying to infer
// that a sequence of nodes semantically represents an array according to one
// of these encodings (versus a struct containing an array). We should refuse
// the temptation to guess, as they say.
struct ListEncoding {
  // Unscoped enum nested in a struct, so values are spelled
  // ListEncoding::ONE_LEVEL etc. and the type is ListEncoding::type.
  // The enumerators take the implicit values 0, 1 and 2.
  enum type {
    ONE_LEVEL,   // repeated value only
    TWO_LEVEL,   // group wrapping a repeated value
    THREE_LEVEL  // group wrapping a repeated group wrapping the value
  };
};
0078 
0079 class PARQUET_EXPORT ColumnPath {
0080  public:
0081   ColumnPath() : path_() {}
0082   explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
0083   explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
0084 
0085   static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
0086   static std::shared_ptr<ColumnPath> FromNode(const Node& node);
0087 
0088   std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
0089   std::string ToDotString() const;
0090   const std::vector<std::string>& ToDotVector() const;
0091 
0092  protected:
0093   std::vector<std::string> path_;
0094 };
0095 
0096 // Base class for logical schema types. A type has a name, repetition level,
0097 // and optionally a logical type (ConvertedType in Parquet metadata parlance)
0098 class PARQUET_EXPORT Node {
0099  public:
0100   enum type { PRIMITIVE, GROUP };
0101 
0102   virtual ~Node() {}
0103 
0104   bool is_primitive() const { return type_ == Node::PRIMITIVE; }
0105 
0106   bool is_group() const { return type_ == Node::GROUP; }
0107 
0108   bool is_optional() const { return repetition_ == Repetition::OPTIONAL; }
0109 
0110   bool is_repeated() const { return repetition_ == Repetition::REPEATED; }
0111 
0112   bool is_required() const { return repetition_ == Repetition::REQUIRED; }
0113 
0114   virtual bool Equals(const Node* other) const = 0;
0115 
0116   const std::string& name() const { return name_; }
0117 
0118   Node::type node_type() const { return type_; }
0119 
0120   Repetition::type repetition() const { return repetition_; }
0121 
0122   ConvertedType::type converted_type() const { return converted_type_; }
0123 
0124   const std::shared_ptr<const LogicalType>& logical_type() const { return logical_type_; }
0125 
0126   /// \brief The field_id value for the serialized SchemaElement. If the
0127   /// field_id is less than 0 (e.g. -1), it will not be set when serialized to
0128   /// Thrift.
0129   int field_id() const { return field_id_; }
0130 
0131   const Node* parent() const { return parent_; }
0132 
0133   const std::shared_ptr<ColumnPath> path() const;
0134 
0135   virtual void ToParquet(void* element) const = 0;
0136 
0137   // Node::Visitor abstract class for walking schemas with the visitor pattern
0138   class Visitor {
0139    public:
0140     virtual ~Visitor() {}
0141 
0142     virtual void Visit(Node* node) = 0;
0143   };
0144   class ConstVisitor {
0145    public:
0146     virtual ~ConstVisitor() {}
0147 
0148     virtual void Visit(const Node* node) = 0;
0149   };
0150 
0151   virtual void Visit(Visitor* visitor) = 0;
0152   virtual void VisitConst(ConstVisitor* visitor) const = 0;
0153 
0154  protected:
0155   friend class GroupNode;
0156 
0157   Node(Node::type type, const std::string& name, Repetition::type repetition,
0158        ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1)
0159       : type_(type),
0160         name_(name),
0161         repetition_(repetition),
0162         converted_type_(converted_type),
0163         field_id_(field_id),
0164         parent_(NULLPTR) {}
0165 
0166   Node(Node::type type, const std::string& name, Repetition::type repetition,
0167        std::shared_ptr<const LogicalType> logical_type, int field_id = -1)
0168       : type_(type),
0169         name_(name),
0170         repetition_(repetition),
0171         logical_type_(std::move(logical_type)),
0172         field_id_(field_id),
0173         parent_(NULLPTR) {}
0174 
0175   Node::type type_;
0176   std::string name_;
0177   Repetition::type repetition_;
0178   ConvertedType::type converted_type_{ConvertedType::NONE};
0179   std::shared_ptr<const LogicalType> logical_type_;
0180   int field_id_;
0181   // Nodes should not be shared, they have a single parent.
0182   const Node* parent_;
0183 
0184   bool EqualsInternal(const Node* other) const;
0185   void SetParent(const Node* p_parent);
0186 
0187  private:
0188   PARQUET_DISALLOW_COPY_AND_ASSIGN(Node);
0189 };
0190 
0191 // Save our breath all over the place with these typedefs
0192 using NodePtr = std::shared_ptr<Node>;
0193 using NodeVector = std::vector<NodePtr>;
0194 
0195 // A type that is one of the primitive Parquet storage types. In addition to
0196 // the other type metadata (name, repetition level, logical type), also has the
0197 // physical storage type and their type-specific metadata (byte width, decimal
0198 // parameters)
0199 class PARQUET_EXPORT PrimitiveNode : public Node {
0200  public:
0201   static std::unique_ptr<Node> FromParquet(const void* opaque_element);
0202 
0203   // A field_id -1 (or any negative value) will be serialized as null in Thrift
0204   static inline NodePtr Make(const std::string& name, Repetition::type repetition,
0205                              Type::type type,
0206                              ConvertedType::type converted_type = ConvertedType::NONE,
0207                              int length = -1, int precision = -1, int scale = -1,
0208                              int field_id = -1) {
0209     return NodePtr(new PrimitiveNode(name, repetition, type, converted_type, length,
0210                                      precision, scale, field_id));
0211   }
0212 
0213   // If no logical type, pass LogicalType::None() or nullptr
0214   // A field_id -1 (or any negative value) will be serialized as null in Thrift
0215   static inline NodePtr Make(const std::string& name, Repetition::type repetition,
0216                              std::shared_ptr<const LogicalType> logical_type,
0217                              Type::type primitive_type, int primitive_length = -1,
0218                              int field_id = -1) {
0219     return NodePtr(new PrimitiveNode(name, repetition, std::move(logical_type),
0220                                      primitive_type, primitive_length, field_id));
0221   }
0222 
0223   bool Equals(const Node* other) const override;
0224 
0225   Type::type physical_type() const { return physical_type_; }
0226 
0227   ColumnOrder column_order() const { return column_order_; }
0228 
0229   void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; }
0230 
0231   int32_t type_length() const { return type_length_; }
0232 
0233   const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }
0234 
0235   void ToParquet(void* element) const override;
0236   void Visit(Visitor* visitor) override;
0237   void VisitConst(ConstVisitor* visitor) const override;
0238 
0239  private:
0240   PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type,
0241                 ConvertedType::type converted_type = ConvertedType::NONE, int length = -1,
0242                 int precision = -1, int scale = -1, int field_id = -1);
0243 
0244   PrimitiveNode(const std::string& name, Repetition::type repetition,
0245                 std::shared_ptr<const LogicalType> logical_type,
0246                 Type::type primitive_type, int primitive_length = -1, int field_id = -1);
0247 
0248   Type::type physical_type_;
0249   int32_t type_length_;
0250   DecimalMetadata decimal_metadata_;
0251   ColumnOrder column_order_;
0252 
0253   // For FIXED_LEN_BYTE_ARRAY
0254   void SetTypeLength(int32_t length) { type_length_ = length; }
0255 
0256   bool EqualsInternal(const PrimitiveNode* other) const;
0257 
0258   FRIEND_TEST(TestPrimitiveNode, Attrs);
0259   FRIEND_TEST(TestPrimitiveNode, Equals);
0260   FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
0261   FRIEND_TEST(TestPrimitiveNode, FromParquet);
0262 };
0263 
0264 class PARQUET_EXPORT GroupNode : public Node {
0265  public:
0266   static std::unique_ptr<Node> FromParquet(const void* opaque_element,
0267                                            NodeVector fields = {});
0268 
0269   // A field_id -1 (or any negative value) will be serialized as null in Thrift
0270   static inline NodePtr Make(const std::string& name, Repetition::type repetition,
0271                              const NodeVector& fields,
0272                              ConvertedType::type converted_type = ConvertedType::NONE,
0273                              int field_id = -1) {
0274     return NodePtr(new GroupNode(name, repetition, fields, converted_type, field_id));
0275   }
0276 
0277   // If no logical type, pass nullptr
0278   // A field_id -1 (or any negative value) will be serialized as null in Thrift
0279   static inline NodePtr Make(const std::string& name, Repetition::type repetition,
0280                              const NodeVector& fields,
0281                              std::shared_ptr<const LogicalType> logical_type,
0282                              int field_id = -1) {
0283     return NodePtr(
0284         new GroupNode(name, repetition, fields, std::move(logical_type), field_id));
0285   }
0286 
0287   bool Equals(const Node* other) const override;
0288 
0289   const NodePtr& field(int i) const { return fields_[i]; }
0290   // Get the index of a field by its name, or negative value if not found.
0291   // If several fields share the same name, it is unspecified which one
0292   // is returned.
0293   int FieldIndex(const std::string& name) const;
0294   // Get the index of a field by its node, or negative value if not found.
0295   int FieldIndex(const Node& node) const;
0296 
0297   int field_count() const { return static_cast<int>(fields_.size()); }
0298 
0299   void ToParquet(void* element) const override;
0300   void Visit(Visitor* visitor) override;
0301   void VisitConst(ConstVisitor* visitor) const override;
0302 
0303   /// \brief Return true if this node or any child node has REPEATED repetition
0304   /// type
0305   bool HasRepeatedFields() const;
0306 
0307  private:
0308   GroupNode(const std::string& name, Repetition::type repetition,
0309             const NodeVector& fields,
0310             ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1);
0311 
0312   GroupNode(const std::string& name, Repetition::type repetition,
0313             const NodeVector& fields, std::shared_ptr<const LogicalType> logical_type,
0314             int field_id = -1);
0315 
0316   NodeVector fields_;
0317   bool EqualsInternal(const GroupNode* other) const;
0318 
0319   // Mapping between field name to the field index
0320   std::unordered_multimap<std::string, int> field_name_to_idx_;
0321 
0322   FRIEND_TEST(TestGroupNode, Attrs);
0323   FRIEND_TEST(TestGroupNode, Equals);
0324   FRIEND_TEST(TestGroupNode, FieldIndex);
0325   FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName);
0326 };
0327 
0328 // ----------------------------------------------------------------------
0329 // Convenience primitive type factory functions
0330 
0331 #define PRIMITIVE_FACTORY(FuncName, TYPE)                                                \
0332   static inline NodePtr FuncName(const std::string& name,                                \
0333                                  Repetition::type repetition = Repetition::OPTIONAL,     \
0334                                  int field_id = -1) {                                    \
0335     return PrimitiveNode::Make(name, repetition, Type::TYPE, ConvertedType::NONE,        \
0336                                /*length=*/-1, /*precision=*/-1, /*scale=*/-1, field_id); \
0337   }
0338 
0339 PRIMITIVE_FACTORY(Boolean, BOOLEAN)
0340 PRIMITIVE_FACTORY(Int32, INT32)
0341 PRIMITIVE_FACTORY(Int64, INT64)
0342 PRIMITIVE_FACTORY(Int96, INT96)
0343 PRIMITIVE_FACTORY(Float, FLOAT)
0344 PRIMITIVE_FACTORY(Double, DOUBLE)
0345 PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY)
0346 
0347 void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream,
0348                                 int indent_width = 2);
0349 
0350 }  // namespace schema
0351 
0352 // The ColumnDescriptor encapsulates information necessary to interpret
0353 // primitive column data in the context of a particular schema. We have to
0354 // examine the node structure of a column's path to the root in the schema tree
0355 // to be able to reassemble the nested structure from the repetition and
0356 // definition levels.
0357 class PARQUET_EXPORT ColumnDescriptor {
0358  public:
0359   ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
0360                    int16_t max_repetition_level,
0361                    const SchemaDescriptor* schema_descr = NULLPTR);
0362 
0363   bool Equals(const ColumnDescriptor& other) const;
0364 
0365   int16_t max_definition_level() const { return max_definition_level_; }
0366 
0367   int16_t max_repetition_level() const { return max_repetition_level_; }
0368 
0369   Type::type physical_type() const { return primitive_node_->physical_type(); }
0370 
0371   ConvertedType::type converted_type() const { return primitive_node_->converted_type(); }
0372 
0373   const std::shared_ptr<const LogicalType>& logical_type() const {
0374     return primitive_node_->logical_type();
0375   }
0376 
0377   ColumnOrder column_order() const { return primitive_node_->column_order(); }
0378 
0379   SortOrder::type sort_order() const {
0380     const auto& la = logical_type();
0381     auto pt = physical_type();
0382     return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt);
0383   }
0384 
0385   const std::string& name() const { return primitive_node_->name(); }
0386 
0387   const std::shared_ptr<schema::ColumnPath> path() const;
0388 
0389   const schema::NodePtr& schema_node() const { return node_; }
0390 
0391   std::string ToString() const;
0392 
0393   int type_length() const;
0394 
0395   int type_precision() const;
0396 
0397   int type_scale() const;
0398 
0399  private:
0400   schema::NodePtr node_;
0401   const schema::PrimitiveNode* primitive_node_;
0402 
0403   int16_t max_definition_level_;
0404   int16_t max_repetition_level_;
0405 };
0406 
0407 // Container for the converted Parquet schema with a computed information from
0408 // the schema analysis needed for file reading
0409 //
0410 // * Column index to Node
0411 // * Max repetition / definition levels for each primitive node
0412 //
0413 // The ColumnDescriptor objects produced by this class can be used to assist in
0414 // the reconstruction of fully materialized data structures from the
0415 // repetition-definition level encoding of nested data
0416 //
0417 // TODO(wesm): this object can be recomputed from a Schema
0418 class PARQUET_EXPORT SchemaDescriptor {
0419  public:
0420   SchemaDescriptor() = default;
0421   ~SchemaDescriptor() = default;
0422 
0423   // Analyze the schema
0424   void Init(std::unique_ptr<schema::Node> schema);
0425   void Init(schema::NodePtr schema);
0426 
0427   const ColumnDescriptor* Column(int i) const;
0428 
0429   // Get the index of a column by its dotstring path, or negative value if not found.
0430   // If several columns share the same dotstring path, it is unspecified which one
0431   // is returned.
0432   int ColumnIndex(const std::string& node_path) const;
0433   // Get the index of a column by its node, or negative value if not found.
0434   int ColumnIndex(const schema::Node& node) const;
0435 
0436   bool Equals(const SchemaDescriptor& other, std::ostream* diff_output = NULLPTR) const;
0437 
0438   // The number of physical columns appearing in the file
0439   int num_columns() const { return static_cast<int>(leaves_.size()); }
0440 
0441   const schema::NodePtr& schema_root() const { return schema_; }
0442 
0443   const schema::GroupNode* group_node() const { return group_node_; }
0444 
0445   // Returns the root (child of the schema root) node of the leaf(column) node
0446   const schema::Node* GetColumnRoot(int i) const;
0447 
0448   const std::string& name() const { return group_node_->name(); }
0449 
0450   std::string ToString() const;
0451 
0452   void updateColumnOrders(const std::vector<ColumnOrder>& column_orders);
0453 
0454   /// \brief Return column index corresponding to a particular
0455   /// PrimitiveNode. Returns -1 if not found
0456   int GetColumnIndex(const schema::PrimitiveNode& node) const;
0457 
0458   /// \brief Return true if any field or their children have REPEATED repetition
0459   /// type
0460   bool HasRepeatedFields() const;
0461 
0462  private:
0463   friend class ColumnDescriptor;
0464 
0465   // Root Node
0466   schema::NodePtr schema_;
0467   // Root Node
0468   // Would never be NULLPTR.
0469   const schema::GroupNode* group_node_;
0470 
0471   void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
0472                  int16_t max_rep_level, const schema::NodePtr& base);
0473 
0474   // Result of leaf node / tree analysis
0475   std::vector<ColumnDescriptor> leaves_;
0476 
0477   std::unordered_map<const schema::PrimitiveNode*, int> node_to_leaf_index_;
0478 
0479   // Mapping between leaf nodes and root group of leaf (first node
0480   // below the schema's root group)
0481   //
0482   // For example, the leaf `a.b.c.d` would have a link back to `a`
0483   //
0484   // -- a  <------
0485   // -- -- b     |
0486   // -- -- -- c  |
0487   // -- -- -- -- d
0488   std::unordered_map<int, schema::NodePtr> leaf_to_base_;
0489 
0490   // Mapping between ColumnPath DotString to the leaf index
0491   std::unordered_multimap<std::string, int> leaf_to_idx_;
0492 };
0493 
0494 }  // namespace parquet