Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:26:53

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 #pragma once
0019 
0020 #include <cstdint>
0021 #include <memory>
0022 
0023 #include "arrow/array/array_base.h"
0024 #include "arrow/array/data.h"
0025 #include "arrow/result.h"
0026 #include "arrow/status.h"
0027 #include "arrow/type.h"
0028 #include "arrow/util/macros.h"
0029 #include "arrow/util/visibility.h"
0030 
0031 namespace arrow {
0032 
0033 // ----------------------------------------------------------------------
0034 // DictionaryArray
0035 
0036 /// \brief Array type for dictionary-encoded data with a
0037 /// data-dependent dictionary
0038 ///
0039 /// A dictionary array contains an array of non-negative integers (the
0040 /// "dictionary indices") along with a data type containing a "dictionary"
0041 /// corresponding to the distinct values represented in the data.
0042 ///
0043 /// For example, the array
0044 ///
0045 ///   ["foo", "bar", "foo", "bar", "foo", "bar"]
0046 ///
0047 /// with dictionary ["bar", "foo"], would have dictionary array representation
0048 ///
0049 ///   indices: [1, 0, 1, 0, 1, 0]
0050 ///   dictionary: ["bar", "foo"]
0051 ///
0052 /// The indices in principle may be any integer type.
0053 class ARROW_EXPORT DictionaryArray : public Array {
0054  public:
0055   using TypeClass = DictionaryType;
0056 
0057   explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);
0058 
0059   DictionaryArray(const std::shared_ptr<DataType>& type,
0060                   const std::shared_ptr<Array>& indices,
0061                   const std::shared_ptr<Array>& dictionary);
0062 
0063   /// \brief Construct DictionaryArray from dictionary and indices
0064   /// array and validate
0065   ///
0066   /// This function does the validation of the indices and input type. It checks if
0067   /// all indices are non-negative and smaller than the size of the dictionary.
0068   ///
0069   /// \param[in] type a dictionary type
0070   /// \param[in] dictionary the dictionary with same value type as the
0071   /// type object
0072   /// \param[in] indices an array of non-negative integers smaller than the
0073   /// size of the dictionary
0074   static Result<std::shared_ptr<Array>> FromArrays(
0075       const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
0076       const std::shared_ptr<Array>& dictionary);
0077 
0078   static Result<std::shared_ptr<Array>> FromArrays(
0079       const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
0080     return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices,
0081                       dictionary);
0082   }
0083 
0084   /// \brief Transpose this DictionaryArray
0085   ///
0086   /// This method constructs a new dictionary array with the given dictionary
0087   /// type, transposing indices using the transpose map.  The type and the
0088   /// transpose map are typically computed using DictionaryUnifier.
0089   ///
0090   /// \param[in] type the new type object
0091   /// \param[in] dictionary the new dictionary
0092   /// \param[in] transpose_map transposition array of this array's indices
0093   ///   into the target array's indices
0094   /// \param[in] pool a pool to allocate the array data from
0095   Result<std::shared_ptr<Array>> Transpose(
0096       const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
0097       const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;
0098 
0099   Result<std::shared_ptr<Array>> Compact(MemoryPool* pool = default_memory_pool()) const;
0100 
0101   /// \brief Determine whether dictionary arrays may be compared without unification
0102   bool CanCompareIndices(const DictionaryArray& other) const;
0103 
0104   /// \brief Return the dictionary for this array, which is stored as
0105   /// a member of the ArrayData internal structure
0106   const std::shared_ptr<Array>& dictionary() const;
0107   const std::shared_ptr<Array>& indices() const;
0108 
0109   /// \brief Return the ith value of indices, cast to int64_t. Not recommended
0110   /// for use in performance-sensitive code. Does not validate whether the
0111   /// value is null or out-of-bounds.
0112   int64_t GetValueIndex(int64_t i) const;
0113 
0114   const DictionaryType* dict_type() const { return dict_type_; }
0115 
0116  private:
0117   void SetData(const std::shared_ptr<ArrayData>& data);
0118   const DictionaryType* dict_type_;
0119   std::shared_ptr<Array> indices_;
0120 
0121   // Lazily initialized when invoking dictionary()
0122   mutable std::shared_ptr<Array> dictionary_;
0123 };
0124 
0125 /// \brief Helper class for incremental dictionary unification
0126 class ARROW_EXPORT DictionaryUnifier {
0127  public:
0128   virtual ~DictionaryUnifier() = default;
0129 
0130   /// \brief Construct a DictionaryUnifier
0131   /// \param[in] value_type the data type of the dictionaries
0132   /// \param[in] pool MemoryPool to use for memory allocations
0133   static Result<std::unique_ptr<DictionaryUnifier>> Make(
0134       std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
0135 
0136   /// \brief Unify dictionaries across array chunks
0137   ///
0138   /// The dictionaries in the array chunks will be unified, their indices
0139   /// accordingly transposed.
0140   ///
0141   /// Only dictionaries with a primitive value type are currently supported.
0142   /// However, dictionaries nested inside a more complex type are correctly unified.
0143   static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
0144       const std::shared_ptr<ChunkedArray>& array,
0145       MemoryPool* pool = default_memory_pool());
0146 
0147   /// \brief Unify dictionaries across the chunks of each table column
0148   ///
0149   /// The dictionaries in each table column will be unified, their indices
0150   /// accordingly transposed.
0151   ///
0152   /// Only dictionaries with a primitive value type are currently supported.
0153   /// However, dictionaries nested inside a more complex type are correctly unified.
0154   static Result<std::shared_ptr<Table>> UnifyTable(
0155       const Table& table, MemoryPool* pool = default_memory_pool());
0156 
0157   /// \brief Append dictionary to the internal memo
0158   virtual Status Unify(const Array& dictionary) = 0;
0159 
0160   /// \brief Append dictionary and compute transpose indices
0161   /// \param[in] dictionary the dictionary values to unify
0162   /// \param[out] out_transpose a Buffer containing computed transpose indices
0163   /// as int32_t values equal in length to the passed dictionary. The value in
0164   /// each slot corresponds to the new index value for each original index
0165   /// for a DictionaryArray with the old dictionary
0166   virtual Status Unify(const Array& dictionary,
0167                        std::shared_ptr<Buffer>* out_transpose) = 0;
0168 
0169   /// \brief Return a result DictionaryType with the smallest possible index
0170   /// type to accommodate the unified dictionary. The unifier cannot be used
0171   /// after this is called
0172   virtual Status GetResult(std::shared_ptr<DataType>* out_type,
0173                            std::shared_ptr<Array>* out_dict) = 0;
0174 
0175   /// \brief Return a unified dictionary with the given index type.  If
0176   /// the index type is not large enough then an invalid status will be returned.
0177   /// The unifier cannot be used after this is called
0178   virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
0179                                         std::shared_ptr<Array>* out_dict) = 0;
0180 };
0181 
0182 }  // namespace arrow