Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-04-17 08:28:53

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cassert>
0021 #include <memory>
0022 #include <unordered_map>
0023 #include <unordered_set>
0024 #include <vector>
0025 
0026 #include "arrow/result.h"
0027 #include "arrow/status.h"
0028 #include "arrow/type.h"
0029 #include "arrow/type_fwd.h"
0030 
0031 #include "parquet/level_conversion.h"
0032 #include "parquet/platform.h"
0033 #include "parquet/schema.h"
0034 
0035 namespace parquet {
0036 
0037 class ArrowReaderProperties;
0038 class ArrowWriterProperties;
0039 class WriterProperties;
0040 
0041 namespace arrow {
0042 
0043 /// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
0044 /// schema into a Parquet schema.
0045 ///
0046 /// @{
0047 
/// \brief Convert a single Arrow field into a Parquet schema node.
///
/// \param[in] field the Arrow field to convert
/// \param[in] properties Parquet writer properties handed to the conversion
/// \param[in] arrow_properties Arrow-specific writer properties handed to the
///   conversion
/// \param[out] out receives the resulting schema node
///   (NOTE(review): presumably only written when the returned Status is OK --
///   confirm against the implementation)
/// \return a Status describing the outcome of the conversion
0048 PARQUET_EXPORT
0049 ::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
0050                             const WriterProperties& properties,
0051                             const ArrowWriterProperties& arrow_properties,
0052                             schema::NodePtr* out);
0053 
/// \brief Convert an Arrow schema into a Parquet SchemaDescriptor.
///
/// \param[in] arrow_schema the Arrow schema to convert
/// \param[in] properties Parquet writer properties handed to the conversion
/// \param[in] arrow_properties Arrow-specific writer properties handed to the
///   conversion
/// \param[out] out receives a shared pointer to the resulting SchemaDescriptor
///   (NOTE(review): presumably only written when the returned Status is OK --
///   confirm against the implementation)
/// \return a Status describing the outcome of the conversion
0054 PARQUET_EXPORT
0055 ::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
0056                                 const WriterProperties& properties,
0057                                 const ArrowWriterProperties& arrow_properties,
0058                                 std::shared_ptr<SchemaDescriptor>* out);
0059 
/// \brief Convert an Arrow schema into a Parquet SchemaDescriptor without
/// passing explicit Arrow writer properties.
///
/// NOTE(review): presumably equivalent to the overload taking
/// ArrowWriterProperties with default-constructed Arrow properties -- confirm
/// against the implementation.
///
/// \param[in] arrow_schema the Arrow schema to convert
/// \param[in] properties Parquet writer properties handed to the conversion
/// \param[out] out receives a shared pointer to the resulting SchemaDescriptor
/// \return a Status describing the outcome of the conversion
0060 PARQUET_EXPORT
0061 ::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
0062                                 const WriterProperties& properties,
0063                                 std::shared_ptr<SchemaDescriptor>* out);
0064 
0065 /// @}
0066 
0067 /// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
0068 /// schema into an Arrow schema.
0069 ///
0070 /// @{
0071 
/// \brief Convert a Parquet schema into an Arrow schema, supplying reader
/// properties and key-value metadata.
///
/// \param[in] parquet_schema the Parquet schema descriptor to convert
/// \param[in] properties Arrow reader properties handed to the conversion
/// \param[in] key_value_metadata key-value metadata handed to the conversion
///   (NOTE(review): presumably the metadata stored alongside the Parquet
///   schema; whether a null pointer is accepted is not visible here -- confirm)
/// \param[out] out receives a shared pointer to the resulting Arrow schema
/// \return a Status describing the outcome of the conversion
0072 PARQUET_EXPORT
0073 ::arrow::Status FromParquetSchema(
0074     const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
0075     const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
0076     std::shared_ptr<::arrow::Schema>* out);
0077 
/// \brief Convert a Parquet schema into an Arrow schema, supplying reader
/// properties but no key-value metadata.
///
/// NOTE(review): presumably forwards to the overload taking
/// key_value_metadata with a null metadata pointer -- confirm against the
/// implementation.
///
/// \param[in] parquet_schema the Parquet schema descriptor to convert
/// \param[in] properties Arrow reader properties handed to the conversion
/// \param[out] out receives a shared pointer to the resulting Arrow schema
/// \return a Status describing the outcome of the conversion
0078 PARQUET_EXPORT
0079 ::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
0080                                   const ArrowReaderProperties& properties,
0081                                   std::shared_ptr<::arrow::Schema>* out);
0082 
/// \brief Convert a Parquet schema into an Arrow schema without passing
/// reader properties or key-value metadata.
///
/// NOTE(review): presumably uses default reader properties -- confirm against
/// the implementation.
///
/// \param[in] parquet_schema the Parquet schema descriptor to convert
/// \param[out] out receives a shared pointer to the resulting Arrow schema
/// \return a Status describing the outcome of the conversion
0083 PARQUET_EXPORT
0084 ::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
0085                                   std::shared_ptr<::arrow::Schema>* out);
0086 
0087 /// @}
0088 
0089 /// \brief Bridge between an arrow::Field and parquet column indices.
0090 struct PARQUET_EXPORT SchemaField {
0091   std::shared_ptr<::arrow::Field> field;
0092   std::vector<SchemaField> children;
0093 
0094   // Only set for leaf nodes
0095   int column_index = -1;
0096 
0097   parquet::internal::LevelInfo level_info;
0098 
0099   bool is_leaf() const { return column_index != -1; }
0100 };
0101 
0102 /// \brief Bridge between a parquet Schema and an arrow Schema.
0103 ///
0104 /// Expose parquet columns as a tree structure. Useful traverse and link
0105 /// between arrow's Schema and parquet's Schema.
0106 struct PARQUET_EXPORT SchemaManifest {
0107   static ::arrow::Status Make(
0108       const SchemaDescriptor* schema,
0109       const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
0110       const ArrowReaderProperties& properties, SchemaManifest* manifest);
0111 
0112   const SchemaDescriptor* descr;
0113   std::shared_ptr<::arrow::Schema> origin_schema;
0114   std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
0115   std::vector<SchemaField> schema_fields;
0116 
0117   std::unordered_map<int, const SchemaField*> column_index_to_field;
0118   std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;
0119 
0120   ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
0121     auto it = column_index_to_field.find(column_index);
0122     if (it == column_index_to_field.end()) {
0123       return ::arrow::Status::KeyError("Column index ", column_index,
0124                                        " not found in schema manifest, may be malformed");
0125     }
0126     *out = it->second;
0127     return ::arrow::Status::OK();
0128   }
0129 
0130   const SchemaField* GetParent(const SchemaField* field) const {
0131     // Returns nullptr also if not found
0132     auto it = child_to_parent.find(field);
0133     if (it == child_to_parent.end()) {
0134       return NULLPTR;
0135     }
0136     return it->second;
0137   }
0138 
0139   /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
0140   /// correspond to the column root (first node below the parquet schema's root group) of
0141   /// each leaf referenced in column_indices.
0142   ///
0143   /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
0144   /// the roots are `a` and `i` (return=[0,2]).
0145   ///
0146   /// root
0147   /// -- a  <------
0148   /// -- -- b  |  |
0149   /// -- -- -- c  |
0150   /// -- -- -- d  |
0151   /// -- -- -- -- e
0152   /// -- f
0153   /// -- -- g
0154   /// -- -- -- h
0155   /// -- i  <---
0156   /// -- -- j  |
0157   /// -- -- -- k
0158   ::arrow::Result<std::vector<int>> GetFieldIndices(
0159       const std::vector<int>& column_indices) const {
0160     const schema::GroupNode* group = descr->group_node();
0161     std::unordered_set<int> already_added;
0162 
0163     std::vector<int> out;
0164     for (int column_idx : column_indices) {
0165       if (column_idx < 0 || column_idx >= descr->num_columns()) {
0166         return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
0167       }
0168 
0169       auto field_node = descr->GetColumnRoot(column_idx);
0170       auto field_idx = group->FieldIndex(*field_node);
0171       if (field_idx == -1) {
0172         return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
0173       }
0174 
0175       if (already_added.insert(field_idx).second) {
0176         out.push_back(field_idx);
0177       }
0178     }
0179     return out;
0180   }
0181 };
0182 
0183 }  // namespace arrow
0184 }  // namespace parquet