Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:26:59

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 // Tools for dictionaries in IPC context
0019 
0020 #pragma once
0021 
0022 #include <cstdint>
0023 #include <memory>
0024 #include <utility>
0025 #include <vector>
0026 
0027 #include "arrow/result.h"
0028 #include "arrow/status.h"
0029 #include "arrow/type_fwd.h"
0030 #include "arrow/util/macros.h"
0031 #include "arrow/util/visibility.h"
0032 
0033 namespace arrow {
0034 namespace ipc {
0035 
0036 namespace internal {
0037 
/// \brief Position of a field in a nested structure, tracked as a chain of
/// child indices.
///
/// The root position has no parent and depth 0; every call to child() yields
/// a position one level deeper.  A child only stores a raw pointer to the
/// position it was created from, so that position must stay alive for as long
/// as the child (or anything derived from it) is used.
class FieldPosition {
 public:
  /// Construct the root position (no parent, index -1, depth 0).
  FieldPosition() : parent_(NULLPTR), index_(-1), depth_(0) {}

  /// Return the position of the child numbered `index` below this position.
  FieldPosition child(int index) const { return FieldPosition(this, index); }

  /// Return the child indices leading from the root down to this position,
  /// outermost first.  The root itself yields an empty vector.
  std::vector<int> path() const {
    std::vector<int> indices(depth_);
    // Climb towards the root, filling the result from its last slot backwards.
    const FieldPosition* node = this;
    for (auto slot = indices.rbegin(); slot != indices.rend(); ++slot) {
      *slot = node->index_;
      node = node->parent_;
    }
    return indices;
  }

 protected:
  /// Construct a position one level below `parent`, which is dereferenced
  /// and therefore must not be null.
  FieldPosition(const FieldPosition* parent, int index)
      : parent_(parent), index_(index), depth_(parent->depth_ + 1) {}

  const FieldPosition* parent_;  // enclosing position; null for the root
  int index_;                    // index below the parent; -1 for the root
  int depth_;                    // number of ancestors, i.e. path().size()
};
0062 
0063 }  // namespace internal
0064 
0065 /// \brief Map fields in a schema to dictionary ids
0066 ///
0067 /// The mapping is structural, i.e. the field path (as a vector of indices)
0068 /// is associated to the dictionary id.  A dictionary id may be associated
0069 /// to multiple fields.
0070 class ARROW_EXPORT DictionaryFieldMapper {
0071  public:
0072   DictionaryFieldMapper();
0073   explicit DictionaryFieldMapper(const Schema& schema);
0074   ~DictionaryFieldMapper();
0075 
0076   Status AddSchemaFields(const Schema& schema);
0077   Status AddField(int64_t id, std::vector<int> field_path);
0078 
0079   Result<int64_t> GetFieldId(std::vector<int> field_path) const;
0080 
0081   int num_fields() const;
0082 
0083   /// \brief Returns number of unique dictionaries, taking into
0084   /// account that different fields can share the same dictionary.
0085   int num_dicts() const;
0086 
0087  private:
0088   struct Impl;
0089   std::unique_ptr<Impl> impl_;
0090 };
0091 
0092 using DictionaryVector = std::vector<std::pair<int64_t, std::shared_ptr<Array>>>;
0093 
0094 /// \brief Memoization data structure for reading dictionaries from IPC streams
0095 ///
0096 /// This structure tracks the following associations:
0097 /// - field position (structural) -> dictionary id
0098 /// - dictionary id -> value type
0099 /// - dictionary id -> dictionary (value) data
0100 ///
0101 /// Together, they allow resolving dictionary data when reading an IPC stream,
0102 /// using metadata recorded in the schema message and data recorded in the
0103 /// dictionary batch messages (see ResolveDictionaries).
0104 ///
0105 /// This structure isn't useful for writing an IPC stream, where only
0106 /// DictionaryFieldMapper is necessary.
0107 class ARROW_EXPORT DictionaryMemo {
0108  public:
0109   DictionaryMemo();
0110   ~DictionaryMemo();
0111 
0112   DictionaryFieldMapper& fields();
0113   const DictionaryFieldMapper& fields() const;
0114 
0115   /// \brief Return current dictionary corresponding to a particular
0116   /// id. Returns KeyError if id not found
0117   Result<std::shared_ptr<ArrayData>> GetDictionary(int64_t id, MemoryPool* pool) const;
0118 
0119   /// \brief Return dictionary value type corresponding to a
0120   /// particular dictionary id.
0121   Result<std::shared_ptr<DataType>> GetDictionaryType(int64_t id) const;
0122 
0123   /// \brief Return true if we have a dictionary for the input id
0124   bool HasDictionary(int64_t id) const;
0125 
0126   /// \brief Add a dictionary value type to the memo with a particular id.
0127   /// Returns KeyError if a different type is already registered with the same id.
0128   Status AddDictionaryType(int64_t id, const std::shared_ptr<DataType>& type);
0129 
0130   /// \brief Add a dictionary to the memo with a particular id. Returns
0131   /// KeyError if that dictionary already exists
0132   Status AddDictionary(int64_t id, const std::shared_ptr<ArrayData>& dictionary);
0133 
0134   /// \brief Append a dictionary delta to the memo with a particular id. Returns
0135   /// KeyError if that dictionary does not exists
0136   Status AddDictionaryDelta(int64_t id, const std::shared_ptr<ArrayData>& dictionary);
0137 
0138   /// \brief Add a dictionary to the memo if it does not have one with the id,
0139   /// otherwise, replace the dictionary with the new one.
0140   ///
0141   /// Return true if the dictionary was added, false if replaced.
0142   Result<bool> AddOrReplaceDictionary(int64_t id,
0143                                       const std::shared_ptr<ArrayData>& dictionary);
0144 
0145  private:
0146   struct Impl;
0147   std::unique_ptr<Impl> impl_;
0148 };
0149 
0150 // For writing: collect dictionary entries to write to the IPC stream, in order
0151 // (i.e. inner dictionaries before dependent outer dictionaries).
0152 ARROW_EXPORT
0153 Result<DictionaryVector> CollectDictionaries(const RecordBatch& batch,
0154                                              const DictionaryFieldMapper& mapper);
0155 
0156 // For reading: resolve all dictionaries in columns, according to the field
0157 // mapping and dictionary arrays stored in memo.
0158 // Columns may be sparse, i.e. some entries may be left null
0159 // (e.g. if an inclusion mask was used).
0160 ARROW_EXPORT
0161 Status ResolveDictionaries(const ArrayDataVector& columns, const DictionaryMemo& memo,
0162                            MemoryPool* pool);
0163 
0164 namespace internal {
0165 
0166 // Like CollectDictionaries above, but uses the memo's DictionaryFieldMapper
0167 // and all collected dictionaries are added to the memo using AddDictionary.
0168 //
0169 // This is used as a shortcut in some roundtripping tests (to avoid emitting
0170 // any actual dictionary batches).
0171 ARROW_EXPORT
0172 Status CollectDictionaries(const RecordBatch& batch, DictionaryMemo* memo);
0173 
0174 }  // namespace internal
0175 
0176 }  // namespace ipc
0177 }  // namespace arrow