File indexing completed on 2026-04-17 08:28:55
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021 #pragma once
0022
0023 #include <cstdint>
0024 #include <memory>
0025 #include <ostream>
0026 #include <string>
0027 #include <unordered_map>
0028 #include <utility>
0029 #include <vector>
0030
0031 #include "parquet/platform.h"
0032 #include "parquet/types.h"
0033 #include "parquet/windows_fixup.h" // for OPTIONAL
0034
0035 namespace parquet {
0036
0037 class SchemaDescriptor;
0038
0039 namespace schema {
0040
0041 class Node;
0042
0043
0044
0045
0046
0047
0048
0049
0050
0051
0052
0053
0054
0055
0056
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066
0067
0068
0069
0070
0071
0072
0073
0074
/// \brief Scoping struct for the list-encoding enumeration.
///
/// The enumerators are referred to as ListEncoding::ONE_LEVEL etc., and the
/// enumeration type itself as ListEncoding::type.
/// NOTE(review): presumably the names refer to the number of nesting levels
/// used to represent a list in the schema -- confirm against the users of
/// this enum.
struct ListEncoding {
  enum type {
    ONE_LEVEL,
    TWO_LEVEL,
    THREE_LEVEL
  };
};
0078
0079 class PARQUET_EXPORT ColumnPath {
0080 public:
0081 ColumnPath() : path_() {}
0082 explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
0083 explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
0084
0085 static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
0086 static std::shared_ptr<ColumnPath> FromNode(const Node& node);
0087
0088 std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
0089 std::string ToDotString() const;
0090 const std::vector<std::string>& ToDotVector() const;
0091
0092 protected:
0093 std::vector<std::string> path_;
0094 };
0095
0096
0097
0098 class PARQUET_EXPORT Node {
0099 public:
0100 enum type { PRIMITIVE, GROUP };
0101
0102 virtual ~Node() {}
0103
0104 bool is_primitive() const { return type_ == Node::PRIMITIVE; }
0105
0106 bool is_group() const { return type_ == Node::GROUP; }
0107
0108 bool is_optional() const { return repetition_ == Repetition::OPTIONAL; }
0109
0110 bool is_repeated() const { return repetition_ == Repetition::REPEATED; }
0111
0112 bool is_required() const { return repetition_ == Repetition::REQUIRED; }
0113
0114 virtual bool Equals(const Node* other) const = 0;
0115
0116 const std::string& name() const { return name_; }
0117
0118 Node::type node_type() const { return type_; }
0119
0120 Repetition::type repetition() const { return repetition_; }
0121
0122 ConvertedType::type converted_type() const { return converted_type_; }
0123
0124 const std::shared_ptr<const LogicalType>& logical_type() const { return logical_type_; }
0125
0126
0127
0128
0129 int field_id() const { return field_id_; }
0130
0131 const Node* parent() const { return parent_; }
0132
0133 const std::shared_ptr<ColumnPath> path() const;
0134
0135 virtual void ToParquet(void* element) const = 0;
0136
0137
0138 class Visitor {
0139 public:
0140 virtual ~Visitor() {}
0141
0142 virtual void Visit(Node* node) = 0;
0143 };
0144 class ConstVisitor {
0145 public:
0146 virtual ~ConstVisitor() {}
0147
0148 virtual void Visit(const Node* node) = 0;
0149 };
0150
0151 virtual void Visit(Visitor* visitor) = 0;
0152 virtual void VisitConst(ConstVisitor* visitor) const = 0;
0153
0154 protected:
0155 friend class GroupNode;
0156
0157 Node(Node::type type, const std::string& name, Repetition::type repetition,
0158 ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1)
0159 : type_(type),
0160 name_(name),
0161 repetition_(repetition),
0162 converted_type_(converted_type),
0163 field_id_(field_id),
0164 parent_(NULLPTR) {}
0165
0166 Node(Node::type type, const std::string& name, Repetition::type repetition,
0167 std::shared_ptr<const LogicalType> logical_type, int field_id = -1)
0168 : type_(type),
0169 name_(name),
0170 repetition_(repetition),
0171 logical_type_(std::move(logical_type)),
0172 field_id_(field_id),
0173 parent_(NULLPTR) {}
0174
0175 Node::type type_;
0176 std::string name_;
0177 Repetition::type repetition_;
0178 ConvertedType::type converted_type_{ConvertedType::NONE};
0179 std::shared_ptr<const LogicalType> logical_type_;
0180 int field_id_;
0181
0182 const Node* parent_;
0183
0184 bool EqualsInternal(const Node* other) const;
0185 void SetParent(const Node* p_parent);
0186
0187 private:
0188 PARQUET_DISALLOW_COPY_AND_ASSIGN(Node);
0189 };
0190
0191
0192 using NodePtr = std::shared_ptr<Node>;
0193 using NodeVector = std::vector<NodePtr>;
0194
0195
0196
0197
0198
0199 class PARQUET_EXPORT PrimitiveNode : public Node {
0200 public:
0201 static std::unique_ptr<Node> FromParquet(const void* opaque_element);
0202
0203
0204 static inline NodePtr Make(const std::string& name, Repetition::type repetition,
0205 Type::type type,
0206 ConvertedType::type converted_type = ConvertedType::NONE,
0207 int length = -1, int precision = -1, int scale = -1,
0208 int field_id = -1) {
0209 return NodePtr(new PrimitiveNode(name, repetition, type, converted_type, length,
0210 precision, scale, field_id));
0211 }
0212
0213
0214
0215 static inline NodePtr Make(const std::string& name, Repetition::type repetition,
0216 std::shared_ptr<const LogicalType> logical_type,
0217 Type::type primitive_type, int primitive_length = -1,
0218 int field_id = -1) {
0219 return NodePtr(new PrimitiveNode(name, repetition, std::move(logical_type),
0220 primitive_type, primitive_length, field_id));
0221 }
0222
0223 bool Equals(const Node* other) const override;
0224
0225 Type::type physical_type() const { return physical_type_; }
0226
0227 ColumnOrder column_order() const { return column_order_; }
0228
0229 void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; }
0230
0231 int32_t type_length() const { return type_length_; }
0232
0233 const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }
0234
0235 void ToParquet(void* element) const override;
0236 void Visit(Visitor* visitor) override;
0237 void VisitConst(ConstVisitor* visitor) const override;
0238
0239 private:
0240 PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type,
0241 ConvertedType::type converted_type = ConvertedType::NONE, int length = -1,
0242 int precision = -1, int scale = -1, int field_id = -1);
0243
0244 PrimitiveNode(const std::string& name, Repetition::type repetition,
0245 std::shared_ptr<const LogicalType> logical_type,
0246 Type::type primitive_type, int primitive_length = -1, int field_id = -1);
0247
0248 Type::type physical_type_;
0249 int32_t type_length_;
0250 DecimalMetadata decimal_metadata_;
0251 ColumnOrder column_order_;
0252
0253
0254 void SetTypeLength(int32_t length) { type_length_ = length; }
0255
0256 bool EqualsInternal(const PrimitiveNode* other) const;
0257
0258 FRIEND_TEST(TestPrimitiveNode, Attrs);
0259 FRIEND_TEST(TestPrimitiveNode, Equals);
0260 FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
0261 FRIEND_TEST(TestPrimitiveNode, FromParquet);
0262 };
0263
0264 class PARQUET_EXPORT GroupNode : public Node {
0265 public:
0266 static std::unique_ptr<Node> FromParquet(const void* opaque_element,
0267 NodeVector fields = {});
0268
0269
0270 static inline NodePtr Make(const std::string& name, Repetition::type repetition,
0271 const NodeVector& fields,
0272 ConvertedType::type converted_type = ConvertedType::NONE,
0273 int field_id = -1) {
0274 return NodePtr(new GroupNode(name, repetition, fields, converted_type, field_id));
0275 }
0276
0277
0278
0279 static inline NodePtr Make(const std::string& name, Repetition::type repetition,
0280 const NodeVector& fields,
0281 std::shared_ptr<const LogicalType> logical_type,
0282 int field_id = -1) {
0283 return NodePtr(
0284 new GroupNode(name, repetition, fields, std::move(logical_type), field_id));
0285 }
0286
0287 bool Equals(const Node* other) const override;
0288
0289 const NodePtr& field(int i) const { return fields_[i]; }
0290
0291
0292
0293 int FieldIndex(const std::string& name) const;
0294
0295 int FieldIndex(const Node& node) const;
0296
0297 int field_count() const { return static_cast<int>(fields_.size()); }
0298
0299 void ToParquet(void* element) const override;
0300 void Visit(Visitor* visitor) override;
0301 void VisitConst(ConstVisitor* visitor) const override;
0302
0303
0304
0305 bool HasRepeatedFields() const;
0306
0307 private:
0308 GroupNode(const std::string& name, Repetition::type repetition,
0309 const NodeVector& fields,
0310 ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1);
0311
0312 GroupNode(const std::string& name, Repetition::type repetition,
0313 const NodeVector& fields, std::shared_ptr<const LogicalType> logical_type,
0314 int field_id = -1);
0315
0316 NodeVector fields_;
0317 bool EqualsInternal(const GroupNode* other) const;
0318
0319
0320 std::unordered_multimap<std::string, int> field_name_to_idx_;
0321
0322 FRIEND_TEST(TestGroupNode, Attrs);
0323 FRIEND_TEST(TestGroupNode, Equals);
0324 FRIEND_TEST(TestGroupNode, FieldIndex);
0325 FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName);
0326 };
0327
0328
0329
0330
0331 #define PRIMITIVE_FACTORY(FuncName, TYPE) \
0332 static inline NodePtr FuncName(const std::string& name, \
0333 Repetition::type repetition = Repetition::OPTIONAL, \
0334 int field_id = -1) { \
0335 return PrimitiveNode::Make(name, repetition, Type::TYPE, ConvertedType::NONE, \
0336 -1, -1, -1, field_id); \
0337 }
0338
0339 PRIMITIVE_FACTORY(Boolean, BOOLEAN)
0340 PRIMITIVE_FACTORY(Int32, INT32)
0341 PRIMITIVE_FACTORY(Int64, INT64)
0342 PRIMITIVE_FACTORY(Int96, INT96)
0343 PRIMITIVE_FACTORY(Float, FLOAT)
0344 PRIMITIVE_FACTORY(Double, DOUBLE)
0345 PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY)
0346
0347 void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream,
0348 int indent_width = 2);
0349
0350 }
0351
0352
0353
0354
0355
0356
0357 class PARQUET_EXPORT ColumnDescriptor {
0358 public:
0359 ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
0360 int16_t max_repetition_level,
0361 const SchemaDescriptor* schema_descr = NULLPTR);
0362
0363 bool Equals(const ColumnDescriptor& other) const;
0364
0365 int16_t max_definition_level() const { return max_definition_level_; }
0366
0367 int16_t max_repetition_level() const { return max_repetition_level_; }
0368
0369 Type::type physical_type() const { return primitive_node_->physical_type(); }
0370
0371 ConvertedType::type converted_type() const { return primitive_node_->converted_type(); }
0372
0373 const std::shared_ptr<const LogicalType>& logical_type() const {
0374 return primitive_node_->logical_type();
0375 }
0376
0377 ColumnOrder column_order() const { return primitive_node_->column_order(); }
0378
0379 SortOrder::type sort_order() const {
0380 const auto& la = logical_type();
0381 auto pt = physical_type();
0382 return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt);
0383 }
0384
0385 const std::string& name() const { return primitive_node_->name(); }
0386
0387 const std::shared_ptr<schema::ColumnPath> path() const;
0388
0389 const schema::NodePtr& schema_node() const { return node_; }
0390
0391 std::string ToString() const;
0392
0393 int type_length() const;
0394
0395 int type_precision() const;
0396
0397 int type_scale() const;
0398
0399 private:
0400 schema::NodePtr node_;
0401 const schema::PrimitiveNode* primitive_node_;
0402
0403 int16_t max_definition_level_;
0404 int16_t max_repetition_level_;
0405 };
0406
0407
0408
0409
0410
0411
0412
0413
0414
0415
0416
0417
0418 class PARQUET_EXPORT SchemaDescriptor {
0419 public:
0420 SchemaDescriptor() = default;
0421 ~SchemaDescriptor() = default;
0422
0423
0424 void Init(std::unique_ptr<schema::Node> schema);
0425 void Init(schema::NodePtr schema);
0426
0427 const ColumnDescriptor* Column(int i) const;
0428
0429
0430
0431
0432 int ColumnIndex(const std::string& node_path) const;
0433
0434 int ColumnIndex(const schema::Node& node) const;
0435
0436 bool Equals(const SchemaDescriptor& other, std::ostream* diff_output = NULLPTR) const;
0437
0438
0439 int num_columns() const { return static_cast<int>(leaves_.size()); }
0440
0441 const schema::NodePtr& schema_root() const { return schema_; }
0442
0443 const schema::GroupNode* group_node() const { return group_node_; }
0444
0445
0446 const schema::Node* GetColumnRoot(int i) const;
0447
0448 const std::string& name() const { return group_node_->name(); }
0449
0450 std::string ToString() const;
0451
0452 void updateColumnOrders(const std::vector<ColumnOrder>& column_orders);
0453
0454
0455
0456 int GetColumnIndex(const schema::PrimitiveNode& node) const;
0457
0458
0459
0460 bool HasRepeatedFields() const;
0461
0462 private:
0463 friend class ColumnDescriptor;
0464
0465
0466 schema::NodePtr schema_;
0467
0468
0469 const schema::GroupNode* group_node_;
0470
0471 void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
0472 int16_t max_rep_level, const schema::NodePtr& base);
0473
0474
0475 std::vector<ColumnDescriptor> leaves_;
0476
0477 std::unordered_map<const schema::PrimitiveNode*, int> node_to_leaf_index_;
0478
0479
0480
0481
0482
0483
0484
0485
0486
0487
0488 std::unordered_map<int, schema::NodePtr> leaf_to_base_;
0489
0490
0491 std::unordered_multimap<std::string, int> leaf_to_idx_;
0492 };
0493
0494 }