![]() |
|
|||
File indexing completed on 2025-08-28 08:26:59
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 // Tools for dictionaries in IPC context 0019 0020 #pragma once 0021 0022 #include <cstdint> 0023 #include <memory> 0024 #include <utility> 0025 #include <vector> 0026 0027 #include "arrow/result.h" 0028 #include "arrow/status.h" 0029 #include "arrow/type_fwd.h" 0030 #include "arrow/util/macros.h" 0031 #include "arrow/util/visibility.h" 0032 0033 namespace arrow { 0034 namespace ipc { 0035 0036 namespace internal { 0037 0038 class FieldPosition { 0039 public: 0040 FieldPosition() : parent_(NULLPTR), index_(-1), depth_(0) {} 0041 0042 FieldPosition child(int index) const { return {this, index}; } 0043 0044 std::vector<int> path() const { 0045 std::vector<int> path(depth_); 0046 const FieldPosition* cur = this; 0047 for (int i = depth_ - 1; i >= 0; --i) { 0048 path[i] = cur->index_; 0049 cur = cur->parent_; 0050 } 0051 return path; 0052 } 0053 0054 protected: 0055 FieldPosition(const FieldPosition* parent, int index) 0056 : parent_(parent), index_(index), depth_(parent->depth_ + 1) {} 0057 0058 const FieldPosition* parent_; 0059 int index_; 0060 int depth_; 0061 }; 0062 0063 } // namespace internal 0064 0065 /// \brief Map fields in a schema to dictionary ids 0066 /// 0067 /// The mapping is structural, i.e. the field path (as a vector of indices) 0068 /// is associated to the dictionary id. A dictionary id may be associated 0069 /// to multiple fields. 0070 class ARROW_EXPORT DictionaryFieldMapper { 0071 public: 0072 DictionaryFieldMapper(); 0073 explicit DictionaryFieldMapper(const Schema& schema); 0074 ~DictionaryFieldMapper(); 0075 0076 Status AddSchemaFields(const Schema& schema); 0077 Status AddField(int64_t id, std::vector<int> field_path); 0078 0079 Result<int64_t> GetFieldId(std::vector<int> field_path) const; 0080 0081 int num_fields() const; 0082 0083 /// \brief Returns number of unique dictionaries, taking into 0084 /// account that different fields can share the same dictionary. 0085 int num_dicts() const; 0086 0087 private: 0088 struct Impl; 0089 std::unique_ptr<Impl> impl_; 0090 }; 0091 0092 using DictionaryVector = std::vector<std::pair<int64_t, std::shared_ptr<Array>>>; 0093 0094 /// \brief Memoization data structure for reading dictionaries from IPC streams 0095 /// 0096 /// This structure tracks the following associations: 0097 /// - field position (structural) -> dictionary id 0098 /// - dictionary id -> value type 0099 /// - dictionary id -> dictionary (value) data 0100 /// 0101 /// Together, they allow resolving dictionary data when reading an IPC stream, 0102 /// using metadata recorded in the schema message and data recorded in the 0103 /// dictionary batch messages (see ResolveDictionaries). 0104 /// 0105 /// This structure isn't useful for writing an IPC stream, where only 0106 /// DictionaryFieldMapper is necessary. 0107 class ARROW_EXPORT DictionaryMemo { 0108 public: 0109 DictionaryMemo(); 0110 ~DictionaryMemo(); 0111 0112 DictionaryFieldMapper& fields(); 0113 const DictionaryFieldMapper& fields() const; 0114 0115 /// \brief Return current dictionary corresponding to a particular 0116 /// id. Returns KeyError if id not found 0117 Result<std::shared_ptr<ArrayData>> GetDictionary(int64_t id, MemoryPool* pool) const; 0118 0119 /// \brief Return dictionary value type corresponding to a 0120 /// particular dictionary id. 0121 Result<std::shared_ptr<DataType>> GetDictionaryType(int64_t id) const; 0122 0123 /// \brief Return true if we have a dictionary for the input id 0124 bool HasDictionary(int64_t id) const; 0125 0126 /// \brief Add a dictionary value type to the memo with a particular id. 0127 /// Returns KeyError if a different type is already registered with the same id. 0128 Status AddDictionaryType(int64_t id, const std::shared_ptr<DataType>& type); 0129 0130 /// \brief Add a dictionary to the memo with a particular id. Returns 0131 /// KeyError if that dictionary already exists 0132 Status AddDictionary(int64_t id, const std::shared_ptr<ArrayData>& dictionary); 0133 0134 /// \brief Append a dictionary delta to the memo with a particular id. Returns 0135 /// KeyError if that dictionary does not exists 0136 Status AddDictionaryDelta(int64_t id, const std::shared_ptr<ArrayData>& dictionary); 0137 0138 /// \brief Add a dictionary to the memo if it does not have one with the id, 0139 /// otherwise, replace the dictionary with the new one. 0140 /// 0141 /// Return true if the dictionary was added, false if replaced. 0142 Result<bool> AddOrReplaceDictionary(int64_t id, 0143 const std::shared_ptr<ArrayData>& dictionary); 0144 0145 private: 0146 struct Impl; 0147 std::unique_ptr<Impl> impl_; 0148 }; 0149 0150 // For writing: collect dictionary entries to write to the IPC stream, in order 0151 // (i.e. inner dictionaries before dependent outer dictionaries). 0152 ARROW_EXPORT 0153 Result<DictionaryVector> CollectDictionaries(const RecordBatch& batch, 0154 const DictionaryFieldMapper& mapper); 0155 0156 // For reading: resolve all dictionaries in columns, according to the field 0157 // mapping and dictionary arrays stored in memo. 0158 // Columns may be sparse, i.e. some entries may be left null 0159 // (e.g. if an inclusion mask was used). 0160 ARROW_EXPORT 0161 Status ResolveDictionaries(const ArrayDataVector& columns, const DictionaryMemo& memo, 0162 MemoryPool* pool); 0163 0164 namespace internal { 0165 0166 // Like CollectDictionaries above, but uses the memo's DictionaryFieldMapper 0167 // and all collected dictionaries are added to the memo using AddDictionary. 0168 // 0169 // This is used as a shortcut in some roundtripping tests (to avoid emitting 0170 // any actual dictionary batches). 0171 ARROW_EXPORT 0172 Status CollectDictionaries(const RecordBatch& batch, DictionaryMemo* memo); 0173 0174 } // namespace internal 0175 0176 } // namespace ipc 0177 } // namespace arrow
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |