Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-08-28 08:26:53

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017 
0018 // Array accessor classes for Binary, LargeBinary, String, LargeString,
0019 // FixedSizeBinary
0020 
0021 #pragma once
0022 
0023 #include <cstdint>
0024 #include <memory>
0025 #include <optional>
0026 #include <string>
0027 #include <string_view>
0028 #include <vector>
0029 
0030 #include "arrow/array/array_base.h"
0031 #include "arrow/array/data.h"
0032 #include "arrow/buffer.h"
0033 #include "arrow/stl_iterator.h"
0034 #include "arrow/type.h"
0035 #include "arrow/util/checked_cast.h"
0036 #include "arrow/util/macros.h"
0037 #include "arrow/util/visibility.h"
0038 
0039 namespace arrow {
0040 
0041 /// \addtogroup binary-arrays
0042 ///
0043 /// @{
0044 
0045 // ----------------------------------------------------------------------
0046 // Binary and String
0047 
0048 /// Base class for variable-sized binary arrays, regardless of offset size
0049 /// and logical interpretation.
0050 template <typename TYPE>
0051 class BaseBinaryArray : public FlatArray {
0052  public:
0053   using TypeClass = TYPE;
0054   using offset_type = typename TypeClass::offset_type;
0055   using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
0056 
0057   /// Return the pointer to the given elements bytes
0058   // XXX should GetValue(int64_t i) return a string_view?
0059   const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
0060     const offset_type pos = raw_value_offsets_[i];
0061     *out_length = raw_value_offsets_[i + 1] - pos;
0062     return raw_data_ + pos;
0063   }
0064 
0065   /// \brief Get binary value as a string_view
0066   ///
0067   /// \param i the value index
0068   /// \return the view over the selected value
0069   std::string_view GetView(int64_t i) const {
0070     const offset_type pos = raw_value_offsets_[i];
0071     return std::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
0072                             raw_value_offsets_[i + 1] - pos);
0073   }
0074 
0075   std::optional<std::string_view> operator[](int64_t i) const {
0076     return *IteratorType(*this, i);
0077   }
0078 
0079   /// \brief Get binary value as a string_view
0080   /// Provided for consistency with other arrays.
0081   ///
0082   /// \param i the value index
0083   /// \return the view over the selected value
0084   std::string_view Value(int64_t i) const { return GetView(i); }
0085 
0086   /// \brief Get binary value as a std::string
0087   ///
0088   /// \param i the value index
0089   /// \return the value copied into a std::string
0090   std::string GetString(int64_t i) const { return std::string(GetView(i)); }
0091 
0092   /// Note that this buffer does not account for any slice offset
0093   std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
0094 
0095   /// Note that this buffer does not account for any slice offset
0096   std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
0097 
0098   const offset_type* raw_value_offsets() const { return raw_value_offsets_; }
0099 
0100   const uint8_t* raw_data() const { return raw_data_; }
0101 
0102   /// \brief Return the data buffer absolute offset of the data for the value
0103   /// at the passed index.
0104   ///
0105   /// Does not perform boundschecking
0106   offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; }
0107 
0108   /// \brief Return the length of the data for the value at the passed index.
0109   ///
0110   /// Does not perform boundschecking
0111   offset_type value_length(int64_t i) const {
0112     return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
0113   }
0114 
0115   /// \brief Return the total length of the memory in the data buffer
0116   /// referenced by this array. If the array has been sliced then this may be
0117   /// less than the size of the data buffer (data_->buffers[2]).
0118   offset_type total_values_length() const {
0119     if (data_->length > 0) {
0120       return raw_value_offsets_[data_->length] - raw_value_offsets_[0];
0121     } else {
0122       return 0;
0123     }
0124   }
0125 
0126   IteratorType begin() const { return IteratorType(*this); }
0127 
0128   IteratorType end() const { return IteratorType(*this, length()); }
0129 
0130  protected:
0131   // For subclasses
0132   BaseBinaryArray() = default;
0133 
0134   // Protected method for constructors
0135   void SetData(const std::shared_ptr<ArrayData>& data) {
0136     this->Array::SetData(data);
0137     raw_value_offsets_ = data->GetValuesSafe<offset_type>(1);
0138     raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
0139   }
0140 
0141   const offset_type* raw_value_offsets_ = NULLPTR;
0142   const uint8_t* raw_data_ = NULLPTR;
0143 };
0144 
0145 /// Concrete Array class for variable-size binary data
0146 class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
0147  public:
0148   explicit BinaryArray(const std::shared_ptr<ArrayData>& data);
0149 
0150   BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
0151               const std::shared_ptr<Buffer>& data,
0152               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
0153               int64_t null_count = kUnknownNullCount, int64_t offset = 0);
0154 
0155  protected:
0156   // For subclasses such as StringArray
0157   BinaryArray() : BaseBinaryArray() {}
0158 };
0159 
0160 /// Concrete Array class for variable-size string (utf-8) data
0161 class ARROW_EXPORT StringArray : public BinaryArray {
0162  public:
0163   using TypeClass = StringType;
0164 
0165   explicit StringArray(const std::shared_ptr<ArrayData>& data);
0166 
0167   StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
0168               const std::shared_ptr<Buffer>& data,
0169               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
0170               int64_t null_count = kUnknownNullCount, int64_t offset = 0);
0171 
0172   /// \brief Validate that this array contains only valid UTF8 entries
0173   ///
0174   /// This check is also implied by ValidateFull()
0175   Status ValidateUTF8() const;
0176 };
0177 
0178 /// Concrete Array class for large variable-size binary data
0179 class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
0180  public:
0181   explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);
0182 
0183   LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
0184                    const std::shared_ptr<Buffer>& data,
0185                    const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
0186                    int64_t null_count = kUnknownNullCount, int64_t offset = 0);
0187 
0188  protected:
0189   // For subclasses such as LargeStringArray
0190   LargeBinaryArray() : BaseBinaryArray() {}
0191 };
0192 
0193 /// Concrete Array class for large variable-size string (utf-8) data
0194 class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
0195  public:
0196   using TypeClass = LargeStringType;
0197 
0198   explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);
0199 
0200   LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
0201                    const std::shared_ptr<Buffer>& data,
0202                    const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
0203                    int64_t null_count = kUnknownNullCount, int64_t offset = 0);
0204 
0205   /// \brief Validate that this array contains only valid UTF8 entries
0206   ///
0207   /// This check is also implied by ValidateFull()
0208   Status ValidateUTF8() const;
0209 };
0210 
0211 // ----------------------------------------------------------------------
0212 // BinaryView and StringView
0213 
0214 /// Concrete Array class for variable-size binary view data using the
0215 /// BinaryViewType::c_type struct to reference in-line or out-of-line string values
0216 class ARROW_EXPORT BinaryViewArray : public FlatArray {
0217  public:
0218   using TypeClass = BinaryViewType;
0219   using IteratorType = stl::ArrayIterator<BinaryViewArray>;
0220   using c_type = BinaryViewType::c_type;
0221 
0222   explicit BinaryViewArray(std::shared_ptr<ArrayData> data);
0223 
0224   BinaryViewArray(std::shared_ptr<DataType> type, int64_t length,
0225                   std::shared_ptr<Buffer> views, BufferVector data_buffers,
0226                   std::shared_ptr<Buffer> null_bitmap = NULLPTR,
0227                   int64_t null_count = kUnknownNullCount, int64_t offset = 0);
0228 
0229   // For API compatibility with BinaryArray etc.
0230   std::string_view GetView(int64_t i) const;
0231   std::string GetString(int64_t i) const { return std::string{GetView(i)}; }
0232 
0233   const auto& values() const { return data_->buffers[1]; }
0234   const c_type* raw_values() const { return raw_values_; }
0235 
0236   std::optional<std::string_view> operator[](int64_t i) const {
0237     return *IteratorType(*this, i);
0238   }
0239 
0240   IteratorType begin() const { return IteratorType(*this); }
0241   IteratorType end() const { return IteratorType(*this, length()); }
0242 
0243  protected:
0244   using FlatArray::FlatArray;
0245 
0246   void SetData(std::shared_ptr<ArrayData> data) {
0247     FlatArray::SetData(std::move(data));
0248     raw_values_ = data_->GetValuesSafe<c_type>(1);
0249   }
0250 
0251   const c_type* raw_values_;
0252 };
0253 
0254 /// Concrete Array class for variable-size string view (utf-8) data using
0255 /// BinaryViewType::c_type to reference in-line or out-of-line string values
0256 class ARROW_EXPORT StringViewArray : public BinaryViewArray {
0257  public:
0258   using TypeClass = StringViewType;
0259 
0260   explicit StringViewArray(std::shared_ptr<ArrayData> data);
0261 
0262   using BinaryViewArray::BinaryViewArray;
0263 
0264   /// \brief Validate that this array contains only valid UTF8 entries
0265   ///
0266   /// This check is also implied by ValidateFull()
0267   Status ValidateUTF8() const;
0268 };
0269 
0270 // ----------------------------------------------------------------------
0271 // Fixed width binary
0272 
0273 /// Concrete Array class for fixed-size binary data
0274 class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
0275  public:
0276   using TypeClass = FixedSizeBinaryType;
0277   using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;
0278 
0279   explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);
0280 
0281   FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
0282                        const std::shared_ptr<Buffer>& data,
0283                        const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
0284                        int64_t null_count = kUnknownNullCount, int64_t offset = 0);
0285 
0286   const uint8_t* GetValue(int64_t i) const { return values_ + i * byte_width_; }
0287   const uint8_t* Value(int64_t i) const { return GetValue(i); }
0288 
0289   std::string_view GetView(int64_t i) const {
0290     return std::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width_);
0291   }
0292 
0293   std::optional<std::string_view> operator[](int64_t i) const {
0294     return *IteratorType(*this, i);
0295   }
0296 
0297   std::string GetString(int64_t i) const { return std::string(GetView(i)); }
0298 
0299   int32_t byte_width() const { return byte_width_; }
0300 
0301   const uint8_t* raw_values() const { return values_; }
0302 
0303   IteratorType begin() const { return IteratorType(*this); }
0304 
0305   IteratorType end() const { return IteratorType(*this, length()); }
0306 
0307  protected:
0308   void SetData(const std::shared_ptr<ArrayData>& data) {
0309     this->PrimitiveArray::SetData(data);
0310     byte_width_ =
0311         internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width();
0312     values_ = raw_values_ + data_->offset * byte_width_;
0313   }
0314 
0315   const uint8_t* values_;
0316   int32_t byte_width_;
0317 };
0318 
0319 /// @}
0320 
0321 }  // namespace arrow