![]() |
|
|||
File indexing completed on 2025-08-28 08:26:53
0001 // Licensed to the Apache Software Foundation (ASF) under one 0002 // or more contributor license agreements. See the NOTICE file 0003 // distributed with this work for additional information 0004 // regarding copyright ownership. The ASF licenses this file 0005 // to you under the Apache License, Version 2.0 (the 0006 // "License"); you may not use this file except in compliance 0007 // with the License. You may obtain a copy of the License at 0008 // 0009 // http://www.apache.org/licenses/LICENSE-2.0 0010 // 0011 // Unless required by applicable law or agreed to in writing, 0012 // software distributed under the License is distributed on an 0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 0014 // KIND, either express or implied. See the License for the 0015 // specific language governing permissions and limitations 0016 // under the License. 0017 0018 #pragma once 0019 0020 #include <cstdint> 0021 #include <memory> 0022 0023 #include "arrow/array/array_base.h" 0024 #include "arrow/array/data.h" 0025 #include "arrow/result.h" 0026 #include "arrow/status.h" 0027 #include "arrow/type.h" 0028 #include "arrow/util/macros.h" 0029 #include "arrow/util/visibility.h" 0030 0031 namespace arrow { 0032 0033 // ---------------------------------------------------------------------- 0034 // DictionaryArray 0035 0036 /// \brief Array type for dictionary-encoded data with a 0037 /// data-dependent dictionary 0038 /// 0039 /// A dictionary array contains an array of non-negative integers (the 0040 /// "dictionary indices") along with a data type containing a "dictionary" 0041 /// corresponding to the distinct values represented in the data. 0042 /// 0043 /// For example, the array 0044 /// 0045 /// ["foo", "bar", "foo", "bar", "foo", "bar"] 0046 /// 0047 /// with dictionary ["bar", "foo"], would have dictionary array representation 0048 /// 0049 /// indices: [1, 0, 1, 0, 1, 0] 0050 /// dictionary: ["bar", "foo"] 0051 /// 0052 /// The indices in principle may be any integer type. 0053 class ARROW_EXPORT DictionaryArray : public Array { 0054 public: 0055 using TypeClass = DictionaryType; 0056 0057 explicit DictionaryArray(const std::shared_ptr<ArrayData>& data); 0058 0059 DictionaryArray(const std::shared_ptr<DataType>& type, 0060 const std::shared_ptr<Array>& indices, 0061 const std::shared_ptr<Array>& dictionary); 0062 0063 /// \brief Construct DictionaryArray from dictionary and indices 0064 /// array and validate 0065 /// 0066 /// This function does the validation of the indices and input type. It checks if 0067 /// all indices are non-negative and smaller than the size of the dictionary. 0068 /// 0069 /// \param[in] type a dictionary type 0070 /// \param[in] dictionary the dictionary with same value type as the 0071 /// type object 0072 /// \param[in] indices an array of non-negative integers smaller than the 0073 /// size of the dictionary 0074 static Result<std::shared_ptr<Array>> FromArrays( 0075 const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices, 0076 const std::shared_ptr<Array>& dictionary); 0077 0078 static Result<std::shared_ptr<Array>> FromArrays( 0079 const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) { 0080 return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices, 0081 dictionary); 0082 } 0083 0084 /// \brief Transpose this DictionaryArray 0085 /// 0086 /// This method constructs a new dictionary array with the given dictionary 0087 /// type, transposing indices using the transpose map. The type and the 0088 /// transpose map are typically computed using DictionaryUnifier. 0089 /// 0090 /// \param[in] type the new type object 0091 /// \param[in] dictionary the new dictionary 0092 /// \param[in] transpose_map transposition array of this array's indices 0093 /// into the target array's indices 0094 /// \param[in] pool a pool to allocate the array data from 0095 Result<std::shared_ptr<Array>> Transpose( 0096 const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary, 0097 const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const; 0098 0099 Result<std::shared_ptr<Array>> Compact(MemoryPool* pool = default_memory_pool()) const; 0100 0101 /// \brief Determine whether dictionary arrays may be compared without unification 0102 bool CanCompareIndices(const DictionaryArray& other) const; 0103 0104 /// \brief Return the dictionary for this array, which is stored as 0105 /// a member of the ArrayData internal structure 0106 const std::shared_ptr<Array>& dictionary() const; 0107 const std::shared_ptr<Array>& indices() const; 0108 0109 /// \brief Return the ith value of indices, cast to int64_t. Not recommended 0110 /// for use in performance-sensitive code. Does not validate whether the 0111 /// value is null or out-of-bounds. 0112 int64_t GetValueIndex(int64_t i) const; 0113 0114 const DictionaryType* dict_type() const { return dict_type_; } 0115 0116 private: 0117 void SetData(const std::shared_ptr<ArrayData>& data); 0118 const DictionaryType* dict_type_; 0119 std::shared_ptr<Array> indices_; 0120 0121 // Lazily initialized when invoking dictionary() 0122 mutable std::shared_ptr<Array> dictionary_; 0123 }; 0124 0125 /// \brief Helper class for incremental dictionary unification 0126 class ARROW_EXPORT DictionaryUnifier { 0127 public: 0128 virtual ~DictionaryUnifier() = default; 0129 0130 /// \brief Construct a DictionaryUnifier 0131 /// \param[in] value_type the data type of the dictionaries 0132 /// \param[in] pool MemoryPool to use for memory allocations 0133 static Result<std::unique_ptr<DictionaryUnifier>> Make( 0134 std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool()); 0135 0136 /// \brief Unify dictionaries across array chunks 0137 /// 0138 /// The dictionaries in the array chunks will be unified, their indices 0139 /// accordingly transposed. 0140 /// 0141 /// Only dictionaries with a primitive value type are currently supported. 0142 /// However, dictionaries nested inside a more complex type are correctly unified. 0143 static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray( 0144 const std::shared_ptr<ChunkedArray>& array, 0145 MemoryPool* pool = default_memory_pool()); 0146 0147 /// \brief Unify dictionaries across the chunks of each table column 0148 /// 0149 /// The dictionaries in each table column will be unified, their indices 0150 /// accordingly transposed. 0151 /// 0152 /// Only dictionaries with a primitive value type are currently supported. 0153 /// However, dictionaries nested inside a more complex type are correctly unified. 0154 static Result<std::shared_ptr<Table>> UnifyTable( 0155 const Table& table, MemoryPool* pool = default_memory_pool()); 0156 0157 /// \brief Append dictionary to the internal memo 0158 virtual Status Unify(const Array& dictionary) = 0; 0159 0160 /// \brief Append dictionary and compute transpose indices 0161 /// \param[in] dictionary the dictionary values to unify 0162 /// \param[out] out_transpose a Buffer containing computed transpose indices 0163 /// as int32_t values equal in length to the passed dictionary. The value in 0164 /// each slot corresponds to the new index value for each original index 0165 /// for a DictionaryArray with the old dictionary 0166 virtual Status Unify(const Array& dictionary, 0167 std::shared_ptr<Buffer>* out_transpose) = 0; 0168 0169 /// \brief Return a result DictionaryType with the smallest possible index 0170 /// type to accommodate the unified dictionary. The unifier cannot be used 0171 /// after this is called 0172 virtual Status GetResult(std::shared_ptr<DataType>* out_type, 0173 std::shared_ptr<Array>* out_dict) = 0; 0174 0175 /// \brief Return a unified dictionary with the given index type. If 0176 /// the index type is not large enough then an invalid status will be returned. 0177 /// The unifier cannot be used after this is called 0178 virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type, 0179 std::shared_ptr<Array>* out_dict) = 0; 0180 }; 0181 0182 } // namespace arrow
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |
![]() ![]() |