File indexing completed on 2026-04-17 08:28:53
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #pragma once
0019
0020 #include <cassert>
0021 #include <memory>
0022 #include <unordered_map>
0023 #include <unordered_set>
0024 #include <vector>
0025
0026 #include "arrow/result.h"
0027 #include "arrow/status.h"
0028 #include "arrow/type.h"
0029 #include "arrow/type_fwd.h"
0030
0031 #include "parquet/level_conversion.h"
0032 #include "parquet/platform.h"
0033 #include "parquet/schema.h"
0034
0035 namespace parquet {
0036
// Forward declarations of property classes. This header only names them by
// const reference in the function signatures below, so their full
// definitions are not needed here.
0037 class ArrowReaderProperties;
0038 class ArrowWriterProperties;
0039 class WriterProperties;
0040
0041 namespace arrow {
0042
0043
0044
0045
0046
0047
/// \brief Declaration of the Arrow-field-to-Parquet-node conversion entry point.
///
/// Only the prototype is visible in this header; the definition lives elsewhere.
/// NOTE(review): the description below is derived from the name and signature --
/// confirm details against the implementation.
///
/// \param[in] field Arrow field handed to the conversion
/// \param[in] properties Parquet writer properties
/// \param[in] arrow_properties Arrow-specific writer properties
/// \param[out] out pointer to a schema::NodePtr; presumably receives the
///   resulting node on success
/// \return ::arrow::Status reporting the outcome
0048 PARQUET_EXPORT
0049 ::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
0050 const WriterProperties& properties,
0051 const ArrowWriterProperties& arrow_properties,
0052 schema::NodePtr* out);
0053
/// \brief Declaration of the Arrow-schema-to-Parquet-schema conversion that takes
/// both Parquet and Arrow writer properties.
///
/// Only the prototype is visible in this header; the definition lives elsewhere.
/// NOTE(review): the description below is derived from the name and signature --
/// confirm details against the implementation.
///
/// \param[in] arrow_schema Arrow schema passed as a raw (nullable) pointer
/// \param[in] properties Parquet writer properties
/// \param[in] arrow_properties Arrow-specific writer properties
/// \param[out] out pointer to a shared_ptr<SchemaDescriptor>; presumably receives
///   the resulting descriptor on success
/// \return ::arrow::Status reporting the outcome
0054 PARQUET_EXPORT
0055 ::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
0056 const WriterProperties& properties,
0057 const ArrowWriterProperties& arrow_properties,
0058 std::shared_ptr<SchemaDescriptor>* out);
0059
/// \brief Overload of ToParquetSchema that takes no ArrowWriterProperties.
///
/// Only the prototype is visible in this header; the definition lives elsewhere.
/// NOTE(review): presumably falls back to default Arrow writer properties --
/// confirm against the implementation.
///
/// \param[in] arrow_schema Arrow schema passed as a raw (nullable) pointer
/// \param[in] properties Parquet writer properties
/// \param[out] out pointer to a shared_ptr<SchemaDescriptor>; presumably receives
///   the resulting descriptor on success
/// \return ::arrow::Status reporting the outcome
0060 PARQUET_EXPORT
0061 ::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
0062 const WriterProperties& properties,
0063 std::shared_ptr<SchemaDescriptor>* out);
0064
0065
0066
0067
0068
0069
0070
0071
/// \brief Declaration of the Parquet-schema-to-Arrow-schema conversion that takes
/// reader properties and key/value metadata.
///
/// Only the prototype is visible in this header; the definition lives elsewhere.
/// NOTE(review): the description below is derived from the name and signature --
/// confirm details against the implementation.
///
/// \param[in] parquet_schema Parquet schema descriptor passed as a raw pointer
/// \param[in] properties Arrow reader properties
/// \param[in] key_value_metadata shared pointer to key/value metadata
///   (whether null is accepted is not visible here)
/// \param[out] out pointer to a shared_ptr<::arrow::Schema>; presumably receives
///   the resulting schema on success
/// \return ::arrow::Status reporting the outcome
0072 PARQUET_EXPORT
0073 ::arrow::Status FromParquetSchema(
0074 const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
0075 const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
0076 std::shared_ptr<::arrow::Schema>* out);
0077
/// \brief Overload of FromParquetSchema that takes no key/value metadata.
///
/// Only the prototype is visible in this header; the definition lives elsewhere.
/// NOTE(review): presumably behaves like the metadata-taking overload with no
/// metadata supplied -- confirm against the implementation.
///
/// \param[in] parquet_schema Parquet schema descriptor passed as a raw pointer
/// \param[in] properties Arrow reader properties
/// \param[out] out pointer to a shared_ptr<::arrow::Schema>; presumably receives
///   the resulting schema on success
/// \return ::arrow::Status reporting the outcome
0078 PARQUET_EXPORT
0079 ::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
0080 const ArrowReaderProperties& properties,
0081 std::shared_ptr<::arrow::Schema>* out);
0082
/// \brief Overload of FromParquetSchema that takes only the Parquet schema.
///
/// Only the prototype is visible in this header; the definition lives elsewhere.
/// NOTE(review): presumably uses default reader properties and no key/value
/// metadata -- confirm against the implementation.
///
/// \param[in] parquet_schema Parquet schema descriptor passed as a raw pointer
/// \param[out] out pointer to a shared_ptr<::arrow::Schema>; presumably receives
///   the resulting schema on success
/// \return ::arrow::Status reporting the outcome
0083 PARQUET_EXPORT
0084 ::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
0085 std::shared_ptr<::arrow::Schema>* out);
0086
0087
0088
0089
0090 struct PARQUET_EXPORT SchemaField {
0091 std::shared_ptr<::arrow::Field> field;
0092 std::vector<SchemaField> children;
0093
0094
0095 int column_index = -1;
0096
0097 parquet::internal::LevelInfo level_info;
0098
0099 bool is_leaf() const { return column_index != -1; }
0100 };
0101
0102
0103
0104
0105
0106 struct PARQUET_EXPORT SchemaManifest {
0107 static ::arrow::Status Make(
0108 const SchemaDescriptor* schema,
0109 const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
0110 const ArrowReaderProperties& properties, SchemaManifest* manifest);
0111
0112 const SchemaDescriptor* descr;
0113 std::shared_ptr<::arrow::Schema> origin_schema;
0114 std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
0115 std::vector<SchemaField> schema_fields;
0116
0117 std::unordered_map<int, const SchemaField*> column_index_to_field;
0118 std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;
0119
0120 ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
0121 auto it = column_index_to_field.find(column_index);
0122 if (it == column_index_to_field.end()) {
0123 return ::arrow::Status::KeyError("Column index ", column_index,
0124 " not found in schema manifest, may be malformed");
0125 }
0126 *out = it->second;
0127 return ::arrow::Status::OK();
0128 }
0129
0130 const SchemaField* GetParent(const SchemaField* field) const {
0131
0132 auto it = child_to_parent.find(field);
0133 if (it == child_to_parent.end()) {
0134 return NULLPTR;
0135 }
0136 return it->second;
0137 }
0138
0139
0140
0141
0142
0143
0144
0145
0146
0147
0148
0149
0150
0151
0152
0153
0154
0155
0156
0157
0158 ::arrow::Result<std::vector<int>> GetFieldIndices(
0159 const std::vector<int>& column_indices) const {
0160 const schema::GroupNode* group = descr->group_node();
0161 std::unordered_set<int> already_added;
0162
0163 std::vector<int> out;
0164 for (int column_idx : column_indices) {
0165 if (column_idx < 0 || column_idx >= descr->num_columns()) {
0166 return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
0167 }
0168
0169 auto field_node = descr->GetColumnRoot(column_idx);
0170 auto field_idx = group->FieldIndex(*field_node);
0171 if (field_idx == -1) {
0172 return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
0173 }
0174
0175 if (already_added.insert(field_idx).second) {
0176 out.push_back(field_idx);
0177 }
0178 }
0179 return out;
0180 }
0181 };
0182
0183 }
0184 }