include/arrow/tensor.h

0001 // Licensed to the Apache Software Foundation (ASF) under one
0002 // or more contributor license agreements.  See the NOTICE file
0003 // distributed with this work for additional information
0004 // regarding copyright ownership.  The ASF licenses this file
0005 // to you under the Apache License, Version 2.0 (the
0006 // "License"); you may not use this file except in compliance
0007 // with the License.  You may obtain a copy of the License at
0008 //
0009 //   http://www.apache.org/licenses/LICENSE-2.0
0010 //
0011 // Unless required by applicable law or agreed to in writing,
0012 // software distributed under the License is distributed on an
0013 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
0014 // KIND, either express or implied.  See the License for the
0015 // specific language governing permissions and limitations
0016 // under the License.
0017
0018 #pragma once
0019
0020 #include <cstdint>
0021 #include <memory>
0022 #include <string>
0023 #include <vector>
0024
0025 #include "arrow/buffer.h"
0026 #include "arrow/compare.h"
0027 #include "arrow/result.h"
0028 #include "arrow/status.h"
0029 #include "arrow/type.h"
0030 #include "arrow/type_traits.h"
0031 #include "arrow/util/macros.h"
0032 #include "arrow/util/visibility.h"
0033
0034 namespace arrow {
0035
0036 static inline bool is_tensor_supported(Type::type type_id) {
0037   switch (type_id) {
0038     case Type::UINT8:
0039     case Type::INT8:
0040     case Type::UINT16:
0041     case Type::INT16:
0042     case Type::UINT32:
0043     case Type::INT32:
0044     case Type::UINT64:
0045     case Type::INT64:
0046     case Type::HALF_FLOAT:
0047     case Type::FLOAT:
0048     case Type::DOUBLE:
0049       return true;
0050     default:
0051       break;
0052   }
0053   return false;
0054 }
0055
0056 namespace internal {
0057
0058 ARROW_EXPORT
0059 Status ComputeRowMajorStrides(const FixedWidthType& type,
0060                               const std::vector<int64_t>& shape,
0061                               std::vector<int64_t>* strides);
0062
0063 ARROW_EXPORT
0064 Status ComputeColumnMajorStrides(const FixedWidthType& type,
0065                                  const std::vector<int64_t>& shape,
0066                                  std::vector<int64_t>* strides);
0067
0068 ARROW_EXPORT
0069 bool IsTensorStridesContiguous(const std::shared_ptr<DataType>& type,
0070                                const std::vector<int64_t>& shape,
0071                                const std::vector<int64_t>& strides);
0072
0073 ARROW_EXPORT
0074 Status ValidateTensorParameters(const std::shared_ptr<DataType>& type,
0075                                 const std::shared_ptr<Buffer>& data,
0076                                 const std::vector<int64_t>& shape,
0077                                 const std::vector<int64_t>& strides,
0078                                 const std::vector<std::string>& dim_names);
0079
0080 ARROW_EXPORT
0081 Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_major,
0082                            MemoryPool* pool, std::shared_ptr<Tensor>* tensor);
0083
0084 }  // namespace internal
0085
0086 class ARROW_EXPORT Tensor {
0087  public:
0088   /// \brief Create a Tensor with full parameters
0089   ///
0090   /// This factory function will return Status::Invalid when the parameters are
0091   /// inconsistent
0092   ///
0093   /// \param[in] type The data type of the tensor values
0094   /// \param[in] data The buffer of the tensor content
0095   /// \param[in] shape The shape of the tensor
0096   /// \param[in] strides The strides of the tensor
0097   ///            (if this is empty, the data assumed to be row-major)
0098   /// \param[in] dim_names The names of the tensor dimensions
0099   static inline Result<std::shared_ptr<Tensor>> Make(
0100       const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
0101       const std::vector<int64_t>& shape, const std::vector<int64_t>& strides = {},
0102       const std::vector<std::string>& dim_names = {}) {
0103     ARROW_RETURN_NOT_OK(
0104         internal::ValidateTensorParameters(type, data, shape, strides, dim_names));
0105     return std::make_shared<Tensor>(type, data, shape, strides, dim_names);
0106   }
0107
0108   virtual ~Tensor() = default;
0109
0110   /// Constructor with no dimension names or strides, data assumed to be row-major
0111   Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
0112          const std::vector<int64_t>& shape);
0113
0114   /// Constructor with non-negative strides
0115   Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
0116          const std::vector<int64_t>& shape, const std::vector<int64_t>& strides);
0117
0118   /// Constructor with non-negative strides and dimension names
0119   Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
0120          const std::vector<int64_t>& shape, const std::vector<int64_t>& strides,
0121          const std::vector<std::string>& dim_names);
0122
0123   std::shared_ptr<DataType> type() const { return type_; }
0124   std::shared_ptr<Buffer> data() const { return data_; }
0125
0126   const uint8_t* raw_data() const { return data_->data(); }
0127   uint8_t* raw_mutable_data() { return data_->mutable_data(); }
0128
0129   const std::vector<int64_t>& shape() const { return shape_; }
0130   const std::vector<int64_t>& strides() const { return strides_; }
0131
0132   int ndim() const { return static_cast<int>(shape_.size()); }
0133
0134   const std::vector<std::string>& dim_names() const { return dim_names_; }
0135   const std::string& dim_name(int i) const;
0136
0137   /// Total number of value cells in the tensor
0138   int64_t size() const;
0139
0140   /// Return true if the underlying data buffer is mutable
0141   bool is_mutable() const { return data_->is_mutable(); }
0142
0143   /// Either row major or column major
0144   bool is_contiguous() const;
0145
0146   /// AKA "C order"
0147   bool is_row_major() const;
0148
0149   /// AKA "Fortran order"
0150   bool is_column_major() const;
0151
0152   Type::type type_id() const;
0153
0154   bool Equals(const Tensor& other, const EqualOptions& = EqualOptions::Defaults()) const;
0155
0156   /// Compute the number of non-zero values in the tensor
0157   Result<int64_t> CountNonZero() const;
0158
0159   /// Return the offset of the given index on the given strides
0160   static int64_t CalculateValueOffset(const std::vector<int64_t>& strides,
0161                                       const std::vector<int64_t>& index) {
0162     const int64_t n = static_cast<int64_t>(index.size());
0163     int64_t offset = 0;
0164     for (int64_t i = 0; i < n; ++i) {
0165       offset += index[i] * strides[i];
0166     }
0167     return offset;
0168   }
0169
0170   int64_t CalculateValueOffset(const std::vector<int64_t>& index) const {
0171     return Tensor::CalculateValueOffset(strides_, index);
0172   }
0173
0174   /// Returns the value at the given index without data-type and bounds checks
0175   template <typename ValueType>
0176   const typename ValueType::c_type& Value(const std::vector<int64_t>& index) const {
0177     using c_type = typename ValueType::c_type;
0178     const int64_t offset = CalculateValueOffset(index);
0179     const c_type* ptr = reinterpret_cast<const c_type*>(raw_data() + offset);
0180     return *ptr;
0181   }
0182
0183   Status Validate() const {
0184     return internal::ValidateTensorParameters(type_, data_, shape_, strides_, dim_names_);
0185   }
0186
0187  protected:
0188   Tensor() {}
0189
0190   std::shared_ptr<DataType> type_;
0191   std::shared_ptr<Buffer> data_;
0192   std::vector<int64_t> shape_;
0193   std::vector<int64_t> strides_;
0194
0195   /// These names are optional
0196   std::vector<std::string> dim_names_;
0197
0198   template <typename SparseIndexType>
0199   friend class SparseTensorImpl;
0200
0201  private:
0202   ARROW_DISALLOW_COPY_AND_ASSIGN(Tensor);
0203 };
0204
0205 template <typename TYPE>
0206 class NumericTensor : public Tensor {
0207  public:
0208   using TypeClass = TYPE;
0209   using value_type = typename TypeClass::c_type;
0210
0211   /// \brief Create a NumericTensor with full parameters
0212   ///
0213   /// This factory function will return Status::Invalid when the parameters are
0214   /// inconsistent
0215   ///
0216   /// \param[in] data The buffer of the tensor content
0217   /// \param[in] shape The shape of the tensor
0218   /// \param[in] strides The strides of the tensor
0219   ///            (if this is empty, the data assumed to be row-major)
0220   /// \param[in] dim_names The names of the tensor dimensions
0221   static Result<std::shared_ptr<NumericTensor<TYPE>>> Make(
0222       const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
0223       const std::vector<int64_t>& strides = {},
0224       const std::vector<std::string>& dim_names = {}) {
0225     ARROW_RETURN_NOT_OK(internal::ValidateTensorParameters(
0226         TypeTraits<TYPE>::type_singleton(), data, shape, strides, dim_names));
0227     return std::make_shared<NumericTensor<TYPE>>(data, shape, strides, dim_names);
0228   }
0229
0230   /// Constructor with non-negative strides and dimension names
0231   NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
0232                 const std::vector<int64_t>& strides,
0233                 const std::vector<std::string>& dim_names)
0234       : Tensor(TypeTraits<TYPE>::type_singleton(), data, shape, strides, dim_names) {}
0235
0236   /// Constructor with no dimension names or strides, data assumed to be row-major
0237   NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape)
0238       : NumericTensor(data, shape, {}, {}) {}
0239
0240   /// Constructor with non-negative strides
0241   NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
0242                 const std::vector<int64_t>& strides)
0243       : NumericTensor(data, shape, strides, {}) {}
0244
0245   const value_type& Value(const std::vector<int64_t>& index) const {
0246     return Tensor::Value<TypeClass>(index);
0247   }
0248 };
0249
0250 }  // namespace arrow