0001 // This file is part of Eigen, a lightweight C++ template library
0002 // for linear algebra.
0003 //
0004 // This Source Code Form is subject to the terms of the Mozilla
0005 // Public License v. 2.0. If a copy of the MPL was not distributed
0006 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
0007 
0008 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
0009 #define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
0010 
0011 namespace Eigen {
0012 namespace internal {
0013 
0014 // -------------------------------------------------------------------------- //
0015 // Forward declarations for templates defined below.
0016 template <typename Scalar, typename IndexType, int NumDims, int Layout>
0017 class TensorBlockIO;
0018 
0019 // -------------------------------------------------------------------------- //
0020 // Helper function to compute strides for densely stored buffer of given
0021 // dimensions.
0022 
0023 // TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
0024 // this function instead everywhere.
0025 template <int Layout, typename IndexType, int NumDims>
0026 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
0027     const DSizes<IndexType, NumDims>& dimensions) {
0028   DSizes<IndexType, NumDims> strides;
0029   if (NumDims == 0) return strides;
0030 
0031   // TODO(ezhulenev): Use templates to unroll this loop (similar to
0032   // h_array_reduce in CXX11meta.h)? Benchmark it.
0033   if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
0034     strides[0] = 1;
0035     for (int i = 1; i < NumDims; ++i) {
0036       strides[i] = strides[i - 1] * dimensions[i - 1];
0037     }
0038   } else {
0039     strides[NumDims - 1] = 1;
0040     for (int i = NumDims - 2; i >= 0; --i) {
0041       strides[i] = strides[i + 1] * dimensions[i + 1];
0042     }
0043   }
0044 
0045   return strides;
0046 }
0047 
0048 template <int Layout, typename IndexType, size_t NumDims>
0049 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
0050     const Eigen::array<IndexType, NumDims>& dimensions) {
0051   return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
0052 }
0053 
0054 template <int Layout, std::ptrdiff_t... Indices>
0055 EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
0056     const Sizes<Indices...>& sizes) {
0057   return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
0058 }
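
// A usage sketch of the strides() helpers above (the 2x3x4 dimensions are
// hypothetical, chosen only for illustration):
//
//   DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   DSizes<Eigen::Index, 3> col = strides<ColMajor>(dims);  // [1, 2, 6]
//   DSizes<Eigen::Index, 3> row = strides<RowMajor>(dims);  // [12, 4, 1]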
0059 
0060 // -------------------------------------------------------------------------- //
0061 
0062 // Tensor block shape type defines the shape preference for blocks extracted
0063 // from the larger tensor.
0064 //
0065 // Example: blocks of 100 elements from the large 100x100 tensor:
0066 // - tensor: 100x100
0067 // - target_block_size: 100
0068 //
0069 // TensorBlockShapeType:
0070 //  - kUniformAllDims: 100 blocks of size 10x10
0071 //  - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100, depending on
0072 //                      column-major or row-major layout)
0073 enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
0074 
0075 struct TensorBlockResourceRequirements {
0076   TensorBlockShapeType shape_type;  // target block shape
0077   size_t size;                      // target block size
0078   TensorOpCost cost_per_coeff;      // cost of computing a single block element
0079 
0080 #ifdef EIGEN_HIPCC
0081   // For HIPCC, we need to explicitly declare the constructor that is implicitly
0082   // invoked in the "merge" / "any" routines as a "device function"; otherwise
0083   // HIPCC errors out complaining about the lack of a matching constructor.
0084   EIGEN_DEVICE_FUNC
0085   TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
0086                   TensorOpCost cost_)
0087     : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
0088   {}
0089 #endif
0090 
0091   template <typename Scalar>
0092   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
0093       TensorBlockShapeType shape_type, size_t size_in_bytes,
0094       TensorOpCost cost) {
0095     const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
0096     return {shape_type, size, cost};
0097   }
0098 
0099   template <typename Scalar>
0100   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
0101       TensorBlockShapeType shape_type, size_t size_in_bytes) {
0102     // This default cost per coefficient is valid for most materialized tensor
0103     // block evaluation implementations, because they typically just read
0104     // coefficients from the underlying tensor storage, and write to the tensor
0105     // block buffer (scratch or destination memory, reads and writes have linear
0106     // access pattern). We ignore the fixed cost of block evaluation, because in
0107     // practice it should be negligible.
0108     //
0109     // Lazy block evaluation adds the cost of calling a functor for each
0110     // coefficient.
0111     //
0112     // All non-trivial block evaluation implementations must provide their own
0113     // cost approximation (e.g. shuffling inner dimension has a much higher cost
0114     // because it reads memory randomly, although the total number of moved
0115     // bytes is the same).
0116     return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
0117                                     {/*bytes_loaded=*/sizeof(Scalar),
0118                                      /*bytes_stored=*/sizeof(Scalar),
0119                                      /*compute_cycles=*/0});
0120   }
0121 
0122   template <typename Scalar>
0123   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
0124       size_t size_in_bytes) {
0125     return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
0126                                     size_in_bytes);
0127   }
0128 
0129   template <typename Scalar>
0130   EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
0131       size_t size_in_bytes) {
0132     return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
0133                                     size_in_bytes);
0134   }
0135 
0136   EIGEN_DEVICE_FUNC
0137   static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
0138   merge(const TensorBlockResourceRequirements& lhs,
0139         const TensorBlockResourceRequirements& rhs) {
0140     return {merge(lhs.shape_type, rhs.shape_type),           // shape_type
0141             merge(lhs.size, rhs.size),                       // size
0142             merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};  // cost_per_coeff
0143   }
0144 
0145   EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
0146       TensorOpCost cost) {
0147     cost_per_coeff += cost;
0148     return *this;
0149   }
0150 
0151   // This is a resource requirement that should be returned from expressions
0152   // that do not have any block evaluation preference (e.g. default tensor
0153   // expression with raw buffer access).
0154   EIGEN_DEVICE_FUNC
0155   static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
0156     return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
0157   }
0158 
0159  private:
0160   using Requirements = TensorBlockResourceRequirements;
0161 
0162   EIGEN_DEVICE_FUNC
0163   static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
0164     return numext::maxi(lhs_size, rhs_size);
0165   }
0166 
0167   EIGEN_DEVICE_FUNC
0168   static EIGEN_STRONG_INLINE TensorBlockShapeType
0169   merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
0170     return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
0171             rhs == TensorBlockShapeType::kSkewedInnerDims)
0172                ? TensorBlockShapeType::kSkewedInnerDims
0173                : TensorBlockShapeType::kUniformAllDims;
0174   }
0175 
0176   EIGEN_DEVICE_FUNC
0177   static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
0178                                                 TensorOpCost rhs_cost) {
0179     return lhs_cost + rhs_cost;
0180   }
0181 };
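
// A usage sketch of TensorBlockResourceRequirements (the scalar type and byte
// budgets are hypothetical):
//
//   // Prefer inner-dimension-skewed blocks holding ~48KB worth of floats.
//   TensorBlockResourceRequirements lhs =
//       TensorBlockResourceRequirements::skewed<float>(48 * 1024);
//
//   // Another expression prefers uniform blocks of ~16KB.
//   TensorBlockResourceRequirements rhs =
//       TensorBlockResourceRequirements::uniform<float>(16 * 1024);
//
//   // merge() keeps the skewed shape, takes the larger size, and adds the
//   // per-coefficient costs.
//   TensorBlockResourceRequirements merged =
//       TensorBlockResourceRequirements::merge(lhs, rhs);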
0182 
0183 // -------------------------------------------------------------------------- //
0184 // TensorBlockDescriptor specifies a block offset within a tensor and the block
0185 // sizes along each of the tensor dimensions.
0186 
0187 template <int NumDims, typename IndexType = Eigen::Index>
0188 class TensorBlockDescriptor {
0189  public:
0190   typedef DSizes<IndexType, NumDims> Dimensions;
0191 
0192   // If we evaluate a Tensor assignment, and the expression on the left already
0193   // has a memory buffer, then we might apply a performance optimization and
0194   // evaluate the root expression directly into the final output memory. Sometimes
0195   // it's possible to reuse that buffer for materializing subexpressions inside an
0196   // expression tree, to avoid dynamic memory allocation.
0197   //
0198   // The pointer type of the underlying storage is erased, because passing the
0199   // Scalar type through all the expression evaluation layers would require way
0200   // too many templates. In practice the destination buffer type should always
0201   // match the evaluated expression scalar type.
0202   class DestinationBuffer {
0203    public:
0204     enum DestinationBufferKind : int {
0205       // The above explicit specification of "int" as the enum basetype is
0206       // needed to get around a HIPCC link error ("the field type is not
0207       // amp-compatible")
0208       // which is issued for class members with the enum type.
0209       // TODO(rocm):
0210       // remove the "int" basetype once HIPCC has been fixed to not error out
0211       // in the above scenario.
0212 
0213       // Destination buffer is not defined (`m_data` == nullptr).
0214       kEmpty,
0215 
0216       // The tensor block defined by the owning tensor block descriptor fits
0217       // contiguously into the destination buffer. In this case it's safe to
0218       // materialize the tensor block in the destination buffer, wrap it in a
0219       // TensorMap, and use it to build an Eigen expression on top of it.
0220       kContiguous,
0221 
0222       // Destination buffer strides do not match the strides of the contiguously
0223       // stored block, and it's impossible to define a TensorMap over this
0224       // buffer. However, if we are evaluating the root of an expression tree, we
0225       // can still materialize an output into this destination, because we can
0226       // guarantee that no one will ever access it through the block API.
0227       //
0228       // In theory it is possible to build valid TensorStriding<TensorMap>
0229       // expression on top of this destination buffer, however it has
0230       // inefficient coeff/packet access, and defeats the purpose of fast block
0231       // evaluation API.
0232       kStrided
0233     };
0234 
0235     template <typename Scalar>
0236     Scalar* data() const {
0237       eigen_assert(m_data_type_size == sizeof(Scalar));
0238       return static_cast<Scalar*>(m_data);
0239     }
0240 
0241     const Dimensions& strides() const { return m_strides; }
0242     const DestinationBufferKind& kind() const { return m_kind; }
0243 
0244    private:
0245     friend class TensorBlockDescriptor;
0246 
0247     DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
0248 
0249     template <typename Scalar>
0250     DestinationBuffer(Scalar* data, const Dimensions& strides,
0251                       DestinationBufferKind kind)
0252         : m_data(static_cast<void*>(data)),
0253           m_data_type_size(sizeof(Scalar)),
0254           m_strides(strides),
0255           m_kind(kind) {}
0256 
0257     template <int Layout, typename Scalar>
0258     static DestinationBuffer make(const TensorBlockDescriptor& desc,
0259                                   Scalar* data, const Dimensions& strides) {
0260       return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
0261     }
0262 
0263     template <int Layout>
0264     static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
0265                                       const Dimensions& strides) {
0266       const Dimensions& desc_dims = desc.dimensions();
0267       const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
0268       for (int i = 0; i < NumDims; ++i) {
0269         if (desc_dims[i] == 1) continue;
0270         if (desc_strides[i] != strides[i]) return kStrided;
0271       }
0272       return kContiguous;
0273     }
0274 
0275     // Storage pointer is type erased, to reduce template bloat, but we still
0276     // keep the size of the underlying element type for error checking.
0277     void* m_data;
0278     size_t m_data_type_size;
0279 
0280     // Destination buffer dimensions always match the dimensions of the tensor
0281     // block descriptor it belongs to; however, the strides might differ.
0282     Dimensions m_strides;
0283 
0284     DestinationBufferKind m_kind;
0285   };
0286 
0287   TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
0288                         const DestinationBuffer& destination)
0289       : m_offset(offset),
0290         m_dimensions(dimensions),
0291         m_destination(destination) {}
0292 
0293   TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
0294       : m_offset(offset),
0295         m_dimensions(dimensions),
0296         m_destination(DestinationBuffer()) {}
0297 
0298   IndexType offset() const { return m_offset; }
0299   const Dimensions& dimensions() const { return m_dimensions; }
0300   IndexType dimension(int index) const { return m_dimensions[index]; }
0301   IndexType size() const { return array_prod<IndexType>(m_dimensions); }
0302 
0303   const DestinationBuffer& destination() const { return m_destination; }
0304 
0305   template <int Layout, typename Scalar>
0306   void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
0307     eigen_assert(dst_base != NULL);
0308     m_destination =
0309         DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
0310   }
0311 
0312   template <int Layout, typename Scalar, typename DstStridesIndexType>
0313   void AddDestinationBuffer(
0314       Scalar* dst_base,
0315       const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
0316     // DSizes constructor will do index type promotion if it's safe.
0317     AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
0318   }
0319 
0320   TensorBlockDescriptor& DropDestinationBuffer() {
0321     m_destination.m_data = NULL;
0322     m_destination.m_kind = DestinationBuffer::kEmpty;
0323     return *this;
0324   }
0325 
0326   bool HasDestinationBuffer() const {
0327     return m_destination.kind() != DestinationBuffer::kEmpty;
0328   }
0329 
0330   // Returns a copy of `*this` with updated offset.
0331   TensorBlockDescriptor WithOffset(IndexType offset) const {
0332     return TensorBlockDescriptor(offset, m_dimensions, m_destination);
0333   }
0334 
0335  private:
0336   // Offset and dimensions are immutable after construction. Block descriptor
0337   // can only be mutated by adding or dropping destination.
0338   const IndexType m_offset;
0339   const Dimensions m_dimensions;
0340   DestinationBuffer m_destination;
0341 };
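
// A usage sketch of TensorBlockDescriptor (the offset, dimensions, and `output`
// buffer are hypothetical):
//
//   typedef TensorBlockDescriptor<2> TensorBlockDesc;
//
//   // Describe a 10x20 block that starts at linear offset 100.
//   TensorBlockDesc desc(100, TensorBlockDesc::Dimensions(10, 20));
//
//   // Optionally allow the block to be materialized directly into `output`, a
//   // float buffer with column-major strides [1, 10].
//   desc.AddDestinationBuffer<ColMajor>(output,
//                                       TensorBlockDesc::Dimensions(1, 10));
//   eigen_assert(desc.HasDestinationBuffer());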
0342 
0343 // -------------------------------------------------------------------------- //
0344 // TensorBlockMapper is responsible for iterating over the blocks of a tensor.
0345 
0346 template <int NumDims, int Layout, typename IndexType = Eigen::Index>
0347 class TensorBlockMapper {
0348   typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
0349 
0350  public:
0351   typedef DSizes<IndexType, NumDims> Dimensions;
0352 
0353   TensorBlockMapper() = default;
0354   TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
0355                     const TensorBlockResourceRequirements& requirements)
0356       : m_tensor_dimensions(dimensions), m_requirements(requirements) {
0357     // Compute block dimensions and the total number of blocks.
0358     InitializeBlockDimensions();
0359   }
0360 
0361   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
0362     return m_total_block_count;
0363   }
0364 
0365   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
0366     return m_block_dimensions.TotalSize();
0367   }
0368 
0369   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
0370   blockDimensions() const {
0371     return m_block_dimensions;
0372   }
0373 
0374   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
0375   blockDescriptor(IndexType block_index) const {
0376     static const bool isColMajor = Layout == static_cast<int>(ColMajor);
0377 
0378     IndexType offset = 0;
0379     DSizes<IndexType, NumDims> dimensions;
0380 
0381     if (NumDims == 0) return BlockDescriptor(offset, dimensions);
0382 
0383     // Iterate outer -> inner dimensions.
0384     for (int i = NumDims - 1; i >= 0; --i) {
0385       const int dim = isColMajor ? i : NumDims - i - 1;
0386 
0387       const IndexType idx = block_index / m_block_strides[dim];
0388       block_index -= idx * m_block_strides[dim];
0389 
0390       const IndexType coord = idx * m_block_dimensions[dim];
0391       dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
0392                                      m_block_dimensions[dim]);
0393       offset += coord * m_tensor_strides[dim];
0394     }
0395 
0396     return {offset, dimensions};
0397   }
0398 
0399  private:
0400   void InitializeBlockDimensions() {
0401     // Requested block shape and size.
0402     const TensorBlockShapeType shape_type = m_requirements.shape_type;
0403     IndexType target_block_size =
0404         numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
0405 
0406     IndexType tensor_size = m_tensor_dimensions.TotalSize();
0407 
0408     // Corner case: one of the dimensions is zero. The logic below is too complex
0409     // to handle this case in general, so just use a unit block size.
0410     // Note: we must not yield blocks with zero dimensions (recipe for
0411     // overflows/underflows, divisions by zero and NaNs later).
0412     if (tensor_size == 0) {
0413       for (int i = 0; i < NumDims; ++i) {
0414         m_block_dimensions[i] = 1;
0415       }
0416       m_total_block_count = 0;
0417       return;
0418     }
0419 
0420     // If tensor fits into a target block size, evaluate it as a single block.
0421     if (tensor_size <= target_block_size) {
0422       m_block_dimensions = m_tensor_dimensions;
0423       m_total_block_count = 1;
0424       // The only valid block index is `0`, and in this case we do not need
0425       // to compute real strides for tensor or blocks (see blockDescriptor).
0426       for (int i = 0; i < NumDims; ++i) {
0427         m_tensor_strides[i] = 0;
0428         m_block_strides[i] = 1;
0429       }
0430       return;
0431     }
0432 
0433     static const bool isColMajor = Layout == static_cast<int>(ColMajor);
0434 
0435     // Block shape skewed towards inner dimension.
0436     if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
0437       IndexType coeff_to_allocate = target_block_size;
0438 
0439       for (int i = 0; i < NumDims; ++i) {
0440         const int dim = isColMajor ? i : NumDims - i - 1;
0441         m_block_dimensions[dim] =
0442             numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
0443         coeff_to_allocate = divup(
0444             coeff_to_allocate,
0445             numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
0446       }
0447       eigen_assert(coeff_to_allocate == 1);
0448 
0449     } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
0450       // Tensor will not fit within 'target_block_size' budget: calculate tensor
0451       // block dimension sizes based on "square" dimension size target.
0452       const IndexType dim_size_target = convert_index<IndexType>(
0453           std::pow(static_cast<float>(target_block_size),
0454                    1.0f / static_cast<float>(m_block_dimensions.rank())));
0455 
0456       for (int i = 0; i < NumDims; ++i) {
0457         // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
0458         // a multiple of the packet size. Note that reducing
0459         // 'block_dim_size' in this manner can increase the number of
0460         // blocks, and so will amplify any per-block overhead.
0461         m_block_dimensions[i] =
0462             numext::mini(dim_size_target, m_tensor_dimensions[i]);
0463       }
0464 
0465       // Add any un-allocated coefficients to inner dimension(s).
0466       IndexType total_size = m_block_dimensions.TotalSize();
0467       for (int i = 0; i < NumDims; ++i) {
0468         const int dim = isColMajor ? i : NumDims - i - 1;
0469 
0470         if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
0471           const IndexType total_size_other_dims =
0472               total_size / m_block_dimensions[dim];
0473           const IndexType alloc_avail =
0474               divup<IndexType>(target_block_size, total_size_other_dims);
0475           if (alloc_avail == m_block_dimensions[dim]) {
0476             // Insufficient excess coefficients to allocate.
0477             break;
0478           }
0479           m_block_dimensions[dim] =
0480               numext::mini(m_tensor_dimensions[dim], alloc_avail);
0481           total_size = total_size_other_dims * m_block_dimensions[dim];
0482         }
0483       }
0484 
0485     } else {
0486       eigen_assert(false);  // unknown block shape
0487     }
0488 
0489     eigen_assert(m_block_dimensions.TotalSize() >=
0490                  numext::mini<IndexType>(target_block_size,
0491                                          m_tensor_dimensions.TotalSize()));
0492 
0493     // Calculate block counts by dimension and total block count.
0494     DSizes<IndexType, NumDims> block_count;
0495     for (int i = 0; i < NumDims; ++i) {
0496       block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
0497     }
0498     m_total_block_count = array_prod(block_count);
0499 
0500     // Calculate block strides (used for enumerating blocks).
0501     m_tensor_strides = strides<Layout>(m_tensor_dimensions);
0502     m_block_strides = strides<Layout>(block_count);
0503   }
0504 
0505   DSizes<IndexType, NumDims> m_tensor_dimensions;
0506   TensorBlockResourceRequirements m_requirements;
0507 
0508   DSizes<IndexType, NumDims> m_block_dimensions;
0509   IndexType m_total_block_count;
0510 
0511   DSizes<IndexType, NumDims> m_tensor_strides;
0512   DSizes<IndexType, NumDims> m_block_strides;
0513 };
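
// A usage sketch of TensorBlockMapper (the tensor dimensions and block budget
// are hypothetical):
//
//   DSizes<Eigen::Index, 2> dims(256, 256);
//   TensorBlockMapper<2, ColMajor> mapper(
//       dims, TensorBlockResourceRequirements::uniform<float>(8 * 1024));
//
//   for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
//     TensorBlockDescriptor<2> desc = mapper.blockDescriptor(i);
//     // desc.offset() and desc.dimensions() identify the i-th block.
//   }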
0514 
0515 // -------------------------------------------------------------------------- //
0516 // TensorBlockScratchAllocator is responsible for allocating temporary buffers
0517 // for block evaluation (output or input block materialization). Given that
0518 // Eigen expression traversal order is deterministic, all temporary allocations
0519 // happen in the same order and usually have exactly the same size. The scratch
0520 // allocator keeps track of all dynamic allocations, and after the first block
0521 // evaluation is completed, we should be able to reuse all the temporary buffers
0522 // for the next block evaluation.
0523 
0524 template <typename Device>
0525 class TensorBlockScratchAllocator {
0526  public:
0527   explicit TensorBlockScratchAllocator(const Device& device)
0528       : m_device(device), m_allocation_index(0) {}
0529 
0530   ~TensorBlockScratchAllocator() {
0531     for (size_t i = 0; i < m_allocations.size(); ++i) {
0532       m_device.deallocate(m_allocations[i].ptr);
0533     }
0534   }
0535 
0536   void* allocate(size_t size) {
0537     // TODO(ezhulenev): Remove when replaced with inlined vector.
0538     if (m_allocations.capacity() == 0) m_allocations.reserve(8);
0539 
0540     // Check if we already have an existing allocation at the current index.
0541     const int num_allocations = static_cast<int>(m_allocations.size());
0542     const bool has_allocation = m_allocation_index < num_allocations;
0543 
0544     // Allocation index can't be larger than the number of allocations.
0545     eigen_assert(m_allocation_index <= num_allocations);
0546 
0547     // If we have an existing allocation, and its size is larger than or equal
0548     // to the requested size, we do nothing.
0549 
0550     // If the current allocation can't fit the requested size, we deallocate it
0551     // and replace it with a larger allocation.
0552     if (has_allocation && m_allocations[m_allocation_index].size < size) {
0553       m_device.deallocate(m_allocations[m_allocation_index].ptr);
0554       m_allocations[m_allocation_index].ptr = m_device.allocate(size);
0555       m_allocations[m_allocation_index].size = size;
0556     }
0557 
0558     // Make a new allocation if we don't have an existing one.
0559     if (!has_allocation) {
0560       Allocation allocation;
0561       allocation.ptr = m_device.allocate(size);
0562       allocation.size = size;
0563       m_allocations.push_back(allocation);
0564     }
0565 
0566     eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
0567     eigen_assert(m_allocations[m_allocation_index].size >= size);
0568 
0569     return m_allocations[m_allocation_index++].ptr;
0570   }
0571 
0572   void reset() { m_allocation_index = 0; }
0573 
0574  private:
0575   struct Allocation {
0576     void* ptr;
0577     size_t size;
0578   };
0579 
0580   const Device& m_device;
0581   int m_allocation_index;
0582   // TODO(ezhulenev): This should be an inlined vector.
0583   std::vector<Allocation> m_allocations;
0584 };
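
// A usage sketch of TensorBlockScratchAllocator (DefaultDevice, `block_count`,
// and the buffer size are chosen for illustration):
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//
//   for (Eigen::Index block = 0; block < block_count; ++block) {
//     void* mem = scratch.allocate(512 * sizeof(float));
//     // ... materialize the block into `mem` ...
//     scratch.reset();  // reuse the same allocation for the next block
//   }
//   // Remaining allocations are freed when `scratch` is destroyed.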
0585 
0586 // -------------------------------------------------------------------------- //
0587 // TensorBlockKind represents all possible block kinds that can be produced by
0588 // the TensorEvaluator::evalBlock function.
0589 enum TensorBlockKind {
0590   // Tensor block that is a lazy expression that must be assigned to a
0591   // destination using TensorBlockAssign.
0592   kExpr,
0593 
0594   // Tensor block that is a view into a memory buffer owned by an underlying
0595   // Tensor expression (e.g. it can be a view into a Tensor buffer).
0596   kView,
0597 
0598   // Tensor block that was materialized in a scratch memory buffer, allocated
0599   // with TensorBlockScratchAllocator. This block must be copied to a
0600   // destination, similar to a block of `kExpr` type.
0601   kMaterializedInScratch,
0602 
0603   // Tensor block that was materialized directly into the final output memory
0604   // buffer. For example if the left side of an assignment is a Tensor, we can
0605   // directly materialize the block in the destination memory.
0606   //
0607   // If strides in the output buffer do not match tensor block strides, the
0608   // Tensor expression will be invalid, and should not be used by
0609   // TensorBlockAssign or for constructing another block expression.
0610   kMaterializedInOutput
0611 };
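
// A hedged sketch of how a caller might inspect the kind of a produced block
// (`block` is a hypothetical block object exposing a kind() accessor):
//
//   if (block.kind() == internal::TensorBlockKind::kMaterializedInScratch) {
//     // Data lives in scratch memory and still has to be copied to the
//     // destination buffer.
//   }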
0612 
0613 // -------------------------------------------------------------------------- //
0614 // TensorBlockNotImplemented should be used to define the TensorBlock typedef in
0615 // TensorEvaluators that do not support block evaluation.
0616 
0617 class TensorBlockNotImplemented {
0618  public:
0619   typedef void XprType;
0620 };
0621 
0622 // -------------------------------------------------------------------------- //
0623 // XprScalar extracts the Scalar type from an Eigen expression (if the expression
0624 // type is not void). It's required to be able to define a lazy block expression
0625 // for argument types that do not support block evaluation.
0626 
0627 template <typename XprType>
0628 struct XprScalar {
0629   typedef typename XprType::Scalar type;
0630 };
0631 template <>
0632 struct XprScalar<void> {
0633   typedef void type;
0634 };
0635 
0636 // -------------------------------------------------------------------------- //
0637 // TensorMaterializedBlock is a fully evaluated block of the original tensor,
0638 // and XprType is just a TensorMap over the data. This block type is typically
0639 // used to materialize blocks of tensor expressions that can't be efficiently
0640 // represented as lazy Tensor expressions with fast coeff/packet operations,
0641 // e.g. we materialize all broadcasts into evaluated blocks.
0642 //
0643 // TensorMaterializedBlock does not own its memory buffer: it's either a memory
0644 // buffer that backs the original expression (e.g. the block is just a view into
0645 // a Tensor), or a memory buffer allocated with the scratch allocator; in the
0646 // latter case the scratch allocator will deallocate it at the end of block-based
0647 // expression execution.
0648 //
0649 // If the block was evaluated directly into the output buffer, and strides in
0650 // the output buffer do not match block strides, the TensorMap expression will
0651 // be invalid, and should never be used in block assignment or any other tensor
0652 // expression.
0653 
0654 template <typename Scalar, int NumDims, int Layout,
0655           typename IndexType = Eigen::Index>
0656 class TensorMaterializedBlock {
0657  public:
0658   typedef DSizes<IndexType, NumDims> Dimensions;
0659   typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
0660 
0661   TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
0662                           const Dimensions& dimensions, bool valid_expr = true)
0663       : m_kind(kind),
0664         m_data(data),
0665         m_dimensions(dimensions),
0666         m_expr(m_data, m_dimensions),
0667         m_valid_expr(valid_expr) {
0668     eigen_assert(m_kind == internal::TensorBlockKind::kView ||
0669                  m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
0670                  m_kind == internal::TensorBlockKind::kMaterializedInOutput);
0671   }
0672 
0673   TensorBlockKind kind() const { return m_kind; }
0674   // NOTE(ezhulenev): Returning XprType by value like in other block types
0675   // causes asan failures. The theory is that XprType::Nested doesn't work
0676   // properly for TensorMap.
0677   const XprType& expr() const {
0678     eigen_assert(m_valid_expr);
0679     return m_expr;
0680   }
0681   const Scalar* data() const { return m_data; }
0682   void cleanup() {}
0683 
0684   typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
0685 
0686   // TensorMaterializedBlock can be backed by different types of storage:
0687   //
0688   //   (1) Contiguous block of memory allocated with scratch allocator.
0689   //   (2) Contiguous block of memory reused from tensor block descriptor
0690   //       destination buffer.
0691   //   (3) Strided block of memory reused from tensor block descriptor
0692   //       destination buffer.
0693   //
0694   class Storage {
0695    public:
0696     Scalar* data() const { return m_data; }
0697     const Dimensions& dimensions() const { return m_dimensions; }
0698     const Dimensions& strides() const { return m_strides; }
0699 
0700     TensorMaterializedBlock AsTensorMaterializedBlock() const {
0701       return TensorMaterializedBlock(
0702           m_materialized_in_output
0703               ? internal::TensorBlockKind::kMaterializedInOutput
0704               : internal::TensorBlockKind::kMaterializedInScratch,
0705           m_data, m_dimensions, !m_strided_storage);
0706     }
0707 
0708    private:
0709     friend class TensorMaterializedBlock;
0710 
0711     Storage(Scalar* data, const Dimensions& dimensions,
0712             const Dimensions& strides, bool materialized_in_output,
0713             bool strided_storage)
0714         : m_data(data),
0715           m_dimensions(dimensions),
0716           m_strides(strides),
0717           m_materialized_in_output(materialized_in_output),
0718           m_strided_storage(strided_storage) {}
0719 
0720     Scalar* m_data;
0721     Dimensions m_dimensions;
0722     Dimensions m_strides;
0723     bool m_materialized_in_output;
0724     bool m_strided_storage;
0725   };
0726 
0727   // Creates storage for a materialized block, either from the block descriptor
0728   // destination buffer or from a new buffer allocated with the scratch allocator.
0729   template <typename TensorBlockScratch>
0730   EIGEN_STRONG_INLINE static Storage prepareStorage(
0731       TensorBlockDesc& desc, TensorBlockScratch& scratch,
0732       bool allow_strided_storage = false) {
0733     // Try to reuse destination as an output block buffer.
0734     typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
0735 
0736     if (desc.destination().kind() == DestinationBuffer::kContiguous) {
0737       Scalar* buffer = desc.destination().template data<Scalar>();
0738       desc.DropDestinationBuffer();
0739       return Storage(buffer, desc.dimensions(),
0740                      internal::strides<Layout>(desc.dimensions()),
0741                      /*materialized_in_output=*/true,
0742                      /*strided_storage=*/false);
0743 
0744     } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
0745                allow_strided_storage) {
0746       Scalar* buffer = desc.destination().template data<Scalar>();
0747       desc.DropDestinationBuffer();
0748       return Storage(buffer, desc.dimensions(), desc.destination().strides(),
0749                      /*materialized_in_output=*/true, /*strided_storage=*/true);
0750 
0751     } else {
0752       void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
0753       return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
0754                      internal::strides<Layout>(desc.dimensions()),
0755                      /*materialized_in_output=*/false,
0756                      /*strided_storage=*/false);
0757     }
0758   }
0759 
0760   // Creates a materialized block for the given descriptor from a memory buffer.
0761   template <typename DataDimensions, typename TensorBlockScratch>
0762   EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
0763       const Scalar* data, const DataDimensions& data_dims,
0764       TensorBlockDesc& desc, TensorBlockScratch& scratch) {
0765     eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
0766 
0767     // If the tensor block dimensions cover a contiguous block of the underlying
0768     // memory, we can skip block buffer memory allocation and construct a block
0769     // from the existing `data` memory buffer.
0770     //
0771     // Example: (RowMajor layout)
0772     //   data_dims:          [11, 12, 13, 14]
0773     //   desc.dimensions():  [1,   1,  3, 14]
0774     //
0775     // In this case we can construct a TensorBlock starting at
0776     // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
0777     static const bool is_col_major = Layout == ColMajor;
0778 
0779     // Find out how many inner dimensions have a matching size.
0780     int num_matching_inner_dims = 0;
0781     for (int i = 0; i < NumDims; ++i) {
0782       int dim = is_col_major ? i : NumDims - i - 1;
0783       if (data_dims[dim] != desc.dimensions()[dim]) break;
0784       ++num_matching_inner_dims;
0785     }
0786 
0787     // All the outer dimensions must be of size `1`, except a single dimension
0788     // before the matching inner dimension (`3` in the example above).
0789     bool can_use_direct_access = true;
0790     for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
0791       int dim = is_col_major ? i : NumDims - i - 1;
0792       if (desc.dimension(dim) != 1) {
0793         can_use_direct_access = false;
0794         break;
0795       }
0796     }
0797 
0798     if (can_use_direct_access) {
0799       const Scalar* block_start = data + desc.offset();
0800       return TensorMaterializedBlock(internal::TensorBlockKind::kView,
0801                                      block_start, desc.dimensions());
0802 
0803     } else {
0804       // Reuse destination buffer or allocate new buffer with scratch allocator.
0805       const Storage storage = prepareStorage(desc, scratch);
0806 
0807       typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
0808           TensorBlockIO;
0809       typedef typename TensorBlockIO::Dst TensorBlockIODst;
0810       typedef typename TensorBlockIO::Src TensorBlockIOSrc;
0811 
0812       TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
0813                            data, desc.offset());
0814       TensorBlockIODst dst(storage.dimensions(), storage.strides(),
0815                            storage.data());
0816 
0817       TensorBlockIO::Copy(dst, src);
0818       return storage.AsTensorMaterializedBlock();
0819     }
0820   }
0821 
0822  private:
0823   TensorBlockKind m_kind;
0824   const Scalar* m_data;
0825   Dimensions m_dimensions;
0826   XprType m_expr;
0827   bool m_valid_expr;
0828 };
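
// A usage sketch of TensorMaterializedBlock::materialize (the `data` buffer,
// `device`, and all dimensions are hypothetical):
//
//   typedef TensorMaterializedBlock<float, 2, ColMajor> TensorBlock;
//   typedef TensorBlock::TensorBlockDesc TensorBlockDesc;
//   typedef TensorBlock::Dimensions Dimensions;
//
//   Dimensions data_dims(100, 100);  // `data` is a dense col-major buffer
//   TensorBlockDesc desc(/*offset=*/0, Dimensions(100, 10));
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//
//   // The block covers the first 10 columns contiguously, so this yields a
//   // kView block over `data`; otherwise the data would be copied into a
//   // scratch buffer via TensorBlockIO.
//   TensorBlock block = TensorBlock::materialize(data, data_dims, desc, scratch);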
0829 
0830 // -------------------------------------------------------------------------- //
0831 // TensorCwiseUnaryBlock is a lazy tensor expression block that applies a
0832 // UnaryOp functor to the blocks produced by the underlying Tensor expression.
0833 
0834 template <typename UnaryOp, typename ArgTensorBlock>
0835 class TensorCwiseUnaryBlock {
0836   static const bool NoArgBlockAccess =
0837       internal::is_void<typename ArgTensorBlock::XprType>::value;
0838 
0839  public:
0840   typedef typename conditional<
0841       NoArgBlockAccess, void,
0842       TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
0843       type XprType;
0844 
0845   typedef typename XprScalar<XprType>::type Scalar;
0846 
0847   TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
0848       : m_arg_block(arg_block), m_functor(functor) {}
0849 
0850   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
0851 
0852   XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
0853   const Scalar* data() const { return NULL; }
0854   void cleanup() { m_arg_block.cleanup(); }
0855 
0856  private:
0857   ArgTensorBlock m_arg_block;
0858   UnaryOp m_functor;
0859 };
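
// A hedged sketch of wrapping an argument block into a lazy unary block
// (`ArgBlock` and `arg_block` stand for a hypothetical argument block type and
// instance produced by some evaluator):
//
//   typedef TensorCwiseUnaryBlock<scalar_abs_op<float>, ArgBlock> AbsBlock;
//
//   AbsBlock block(arg_block, scalar_abs_op<float>());
//   // block.expr() is a lazy TensorCwiseUnaryOp over arg_block.expr();
//   // block.cleanup() forwards cleanup to the argument block.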
0860 
0861 // -------------------------------------------------------------------------- //
0862 // TensorCwiseBinaryBlock is a lazy tensor expression block that applies a BinaryOp
0863 // functor to the blocks produced by the two underlying Tensor expressions.
0864 
0865 template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
0866 class TensorCwiseBinaryBlock {
0867   static const bool NoArgBlockAccess =
0868       internal::is_void<typename LhsTensorBlock::XprType>::value ||
0869       internal::is_void<typename RhsTensorBlock::XprType>::value;
0870 
0871  public:
0872   typedef typename conditional<
0873       NoArgBlockAccess, void,
0874       TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
0875                           const typename RhsTensorBlock::XprType> >::type
0876       XprType;
0877 
0878   typedef typename XprScalar<XprType>::type Scalar;
0879 
0880   TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
0881                          const RhsTensorBlock& right_block,
0882                          const BinaryOp& functor)
0883       : m_left_block(left_block),
0884         m_right_block(right_block),
0885         m_functor(functor) {}
0886 
0887   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
0888 
0889   XprType expr() const {
0890     return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
0891   }
0892 
0893   const Scalar* data() const { return NULL; }
0894 
0895   void cleanup() {
0896     m_left_block.cleanup();
0897     m_right_block.cleanup();
0898   }
0899 
0900  private:
0901   LhsTensorBlock m_left_block;
0902   RhsTensorBlock m_right_block;
0903   BinaryOp m_functor;
0904 };
0905 
0906 // -------------------------------------------------------------------------- //
0907 // TensorUnaryExprBlock is a lazy tensor expression block that can construct
0908 // an arbitrary tensor expression from a block of the underlying type (this is a
0909 // generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
0910 
0911 template <typename BlockFactory, typename ArgTensorBlock>
0912 class TensorUnaryExprBlock {
0913   typedef typename ArgTensorBlock::XprType ArgXprType;
0914   static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
0915 
0916  public:
0917   typedef typename conditional<
0918       NoArgBlockAccess, void,
0919       typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
0920 
0921   typedef typename XprScalar<XprType>::type Scalar;
0922 
0923   TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
0924                        const BlockFactory& factory)
0925       : m_arg_block(arg_block), m_factory(factory) {}
0926 
0927   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
0928   XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
0929   const Scalar* data() const { return NULL; }
0930   void cleanup() { m_arg_block.cleanup(); }
0931 
0932  private:
0933   ArgTensorBlock m_arg_block;
0934   BlockFactory m_factory;
0935 };
0936 
0937 // -------------------------------------------------------------------------- //
0938 // TensorTernaryExprBlock is a lazy tensor expression block that can construct
0939 // an arbitrary tensor expression from three blocks of the underlying type.
0940 
0941 template <typename BlockFactory, typename Arg1TensorBlock,
0942           typename Arg2TensorBlock, typename Arg3TensorBlock>
0943 class TensorTernaryExprBlock {
0944   typedef typename Arg1TensorBlock::XprType Arg1XprType;
0945   typedef typename Arg2TensorBlock::XprType Arg2XprType;
0946   typedef typename Arg3TensorBlock::XprType Arg3XprType;
0947 
0948   static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
0949                                        internal::is_void<Arg2XprType>::value ||
0950                                        internal::is_void<Arg3XprType>::value;
0951 
0952  public:
0953   typedef typename conditional<
0954       NoArgBlockAccess, void,
0955       typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
0956                                               Arg3XprType>::type>::type XprType;
0957 
0958   typedef typename XprScalar<XprType>::type Scalar;
0959 
0960   TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
0961                          const Arg2TensorBlock& arg2_block,
0962                          const Arg3TensorBlock& arg3_block,
0963                          const BlockFactory& factory)
0964       : m_arg1_block(arg1_block),
0965         m_arg2_block(arg2_block),
0966         m_arg3_block(arg3_block),
0967         m_factory(factory) {}
0968 
0969   TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
0970   XprType expr() const {
0971     return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
0972                           m_arg3_block.expr());
0973   }
0974   const Scalar* data() const { return NULL; }
0975   void cleanup() {
0976     m_arg1_block.cleanup();
0977     m_arg2_block.cleanup();
0978     m_arg3_block.cleanup();
0979   }
0980 
0981  private:
0982   Arg1TensorBlock m_arg1_block;
0983   Arg2TensorBlock m_arg2_block;
0984   Arg3TensorBlock m_arg3_block;
0985   BlockFactory m_factory;
0986 };
0987 
0988 // -------------------------------------------------------------------------- //
0989 // StridedLinearBufferCopy provides a method to copy data between two linear
0990 // buffers with different strides, with optimized paths for scatter/gather.
0991 
0992 template <typename Scalar, typename IndexType>
0993 class StridedLinearBufferCopy {
0994   typedef typename packet_traits<Scalar>::type Packet;
0995   enum {
0996     Vectorizable = packet_traits<Scalar>::Vectorizable,
0997     PacketSize = packet_traits<Scalar>::size
0998   };
0999 
1000  public:
1001   // Specifying linear copy kind statically gives ~30% speedup for small sizes.
1002   enum class Kind {
1003     Linear = 0,       // src_stride == 1 && dst_stride == 1
1004     Scatter = 1,      // src_stride == 1 && dst_stride != 1
1005     FillLinear = 2,   // src_stride == 0 && dst_stride == 1
1006     FillScatter = 3,  // src_stride == 0 && dst_stride != 1
1007     Gather = 4,       // dst_stride == 1
1008     Random = 5        // everything else
1009   };
1010 
1011   struct Dst {
1012     Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
1013 
1014     IndexType offset;
1015     IndexType stride;
1016     Scalar* data;
1017   };
1018 
1019   struct Src {
1020     Src(IndexType o, IndexType s, const Scalar* d)
1021         : offset(o), stride(s), data(d) {}
1022 
1023     IndexType offset;
1024     IndexType stride;
1025     const Scalar* data;
1026   };
1027 
1028   template <typename StridedLinearBufferCopy::Kind kind>
1029   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
1030                                                         const Src& src,
1031                                                         const size_t count) {
1032     Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
1033               src.data);
1034   }
1035 
1036  private:
1037   template <typename StridedLinearBufferCopy::Kind kind>
1038   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1039       const IndexType count, const IndexType dst_offset,
1040       const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
1041       const IndexType src_offset, const IndexType src_stride,
1042       const Scalar* EIGEN_RESTRICT src_data) {
1043     const Scalar* src = &src_data[src_offset];
1044     Scalar* dst = &dst_data[dst_offset];
1045 
1046     if (!Vectorizable) {
1047       for (Index i = 0; i < count; ++i) {
1048         dst[i * dst_stride] = src[i * src_stride];
1049       }
1050       return;
1051     }
1052 
1053     const IndexType vectorized_size = count - PacketSize;
1054     IndexType i = 0;
1055 
1056     if (kind == StridedLinearBufferCopy::Kind::Linear) {
1057       // ******************************************************************** //
1058       // Linear copy from `src` to `dst`.
1059       const IndexType unrolled_size = count - 4 * PacketSize;
1060       eigen_assert(src_stride == 1 && dst_stride == 1);
1061       for (; i <= unrolled_size; i += 4 * PacketSize) {
1062         for (int j = 0; j < 4; ++j) {
1063           Packet p = ploadu<Packet>(src + i + j * PacketSize);
1064           pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1065         }
1066       }
1067       for (; i <= vectorized_size; i += PacketSize) {
1068         Packet p = ploadu<Packet>(src + i);
1069         pstoreu<Scalar, Packet>(dst + i, p);
1070       }
1071       for (; i < count; ++i) {
1072         dst[i] = src[i];
1073       }
1074       // ******************************************************************** //
1075     } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
1076       // Scatter from `src` to `dst`.
1077       eigen_assert(src_stride == 1 && dst_stride != 1);
1078       for (; i <= vectorized_size; i += PacketSize) {
1079         Packet p = ploadu<Packet>(src + i);
1080         pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1081       }
1082       for (; i < count; ++i) {
1083         dst[i * dst_stride] = src[i];
1084       }
1085       // ******************************************************************** //
1086     } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
1087       // Fill `dst` with value at `*src`.
1088       eigen_assert(src_stride == 0 && dst_stride == 1);
1089       const IndexType unrolled_size = count - 4 * PacketSize;
1090       Packet p = pload1<Packet>(src);
1091       for (; i <= unrolled_size; i += 4 * PacketSize) {
1092         for (int j = 0; j < 4; ++j) {
1093           pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1094         }
1095       }
1096       for (; i <= vectorized_size; i += PacketSize) {
1097         pstoreu<Scalar, Packet>(dst + i, p);
1098       }
1099       for (; i < count; ++i) {
1100         dst[i] = *src;
1101       }
1102       // ******************************************************************** //
1103     } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
1104       // Scatter `*src` into `dst`.
1105       eigen_assert(src_stride == 0 && dst_stride != 1);
1106       Packet p = pload1<Packet>(src);
1107       for (; i <= vectorized_size; i += PacketSize) {
1108         pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1109       }
1110       for (; i < count; ++i) {
1111         dst[i * dst_stride] = *src;
1112       }
1113       // ******************************************************************** //
1114     } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
1115       // Gather from `src` into `dst`.
1116       eigen_assert(dst_stride == 1);
1117       for (; i <= vectorized_size; i += PacketSize) {
1118         Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
1119         pstoreu<Scalar, Packet>(dst + i, p);
1120       }
1121       for (; i < count; ++i) {
1122         dst[i] = src[i * src_stride];
1123       }
1124       // ******************************************************************** //
1125     } else if (kind == StridedLinearBufferCopy::Kind::Random) {
1126       // Random.
1127       for (; i < count; ++i) {
1128         dst[i * dst_stride] = src[i * src_stride];
1129       }
1130     } else {
1131       eigen_assert(false);
1132     }
1133   }
1134 };
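
// A usage sketch of StridedLinearBufferCopy (the `dst_buf`/`src_buf` pointers
// and the element count are hypothetical):
//
//   typedef StridedLinearBufferCopy<float, Eigen::Index> LinCopy;
//
//   // Scatter 64 contiguous floats from `src_buf` into every 3rd slot of
//   // `dst_buf`.
//   LinCopy::Run<LinCopy::Kind::Scatter>(
//       LinCopy::Dst(/*offset=*/0, /*stride=*/3, dst_buf),
//       LinCopy::Src(/*offset=*/0, /*stride=*/1, src_buf),
//       /*count=*/64);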
1135 
1136 // -------------------------------------------------------------------------- //
1137 // TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
1138 // block. It's possible to specify a src->dst dimension mapping for the copy
1139 // operation. Dimensions of `dst` specify how many elements have to be copied;
1140 // for `src` we only need to know the strides to navigate the source memory buffer.
1141 
1142 template <typename Scalar, typename IndexType, int NumDims, int Layout>
1143 class TensorBlockIO {
1144   static const bool IsColMajor = (Layout == ColMajor);
1145 
1146   typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
1147 
1148  public:
1149   typedef DSizes<IndexType, NumDims> Dimensions;
1150   typedef DSizes<int, NumDims> DimensionsMap;
1151 
1152   struct Dst {
1153     Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
1154         IndexType dst_offset = 0)
1155         : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
1156 
1157     Dimensions dims;
1158     Dimensions strides;
1159     Scalar* data;
1160     IndexType offset;
1161   };
1162 
1163   struct Src {
1164     Src(const Dimensions& src_strides, const Scalar* src,
1165         IndexType src_offset = 0)
1166         : strides(src_strides), data(src), offset(src_offset) {}
1167 
1168     Dimensions strides;
1169     const Scalar* data;
1170     IndexType offset;
1171   };
1172 
1173   // Copies data to `dst` from `src`, using provided dimensions mapping:
1174   //
1175   //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
1176   //
1177   // Returns the number of copied elements.
1178   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
1179       const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
1180     // Copy single scalar value from `src` to `dst`.
1181     if (NumDims == 0) {
1182       *(dst.data + dst.offset) = *(src.data + src.offset);
1183       return 1;
1184     }
1185 
1186     // Both `dst` and `src` must have a contiguous innermost dimension. We also
1187     // accept the special case with stride '0', because it's used as a trick to
1188     // implement broadcasting.
1189     {
1190       int inner_dim = IsColMajor ? 0 : NumDims - 1;
1191       EIGEN_UNUSED_VARIABLE(inner_dim);
1192       eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
1193       eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
1194     }
1195 
1196     // Give a shorter name to `dst_to_src_dim_map`.
1197     const DimensionsMap& dim_map = dst_to_src_dim_map;
1198 
1199     // Do not squeeze reordered inner dimensions.
1200     int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
1201 
1202     // NOTE: We find the innermost dimension (contiguous in memory) in the dst
1203     // block, and we write data linearly into that dimension, reading it from
1204     // the src. If dimensions are reordered, we might end up reading data from
1205     // the src with `stride != 1`.
1206     //
1207     // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
1208     // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
1209 
1210     // Find the innermost dimension in the dst whose size is not 1. This is the
1211     // effective inner dim.
1212     int num_size_one_inner_dims = 0;
1213     for (int i = 0; i < num_squeezable_dims; ++i) {
1214       const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1215       if (dst.dims[dst_dim] != 1) break;
1216       num_size_one_inner_dims++;
1217     }
1218 
1219     // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
1220     if (num_size_one_inner_dims == NumDims) {
1221       *(dst.data + dst.offset) = *(src.data + src.offset);
1222       return 1;
1223     }
1224 
1225     // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
1226     const int dst_stride1_dim = IsColMajor
1227                                     ? num_size_one_inner_dims
1228                                     : NumDims - num_size_one_inner_dims - 1;
1229 
1230     // Dimension in the src that corresponds to the dst innermost dimension.
1231     const int src_dim_for_dst_stride1_dim =
1232         NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
1233 
1234     // Size of the innermost dimension (length of contiguous blocks of memory).
1235     IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
1236 
1237     // Squeeze multiple inner dims into one if they are contiguous in `dst` and
1238     // `src` memory, so we can do fewer linear copy calls.
1239     for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
1240       const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1241       const IndexType dst_stride = dst.strides[dst_dim];
1242       const IndexType src_stride = src.strides[dim_map[dst_dim]];
1243       if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
1244         dst_inner_dim_size *= dst.dims[dst_dim];
1245         ++num_size_one_inner_dims;
1246       } else {
1247         break;
1248       }
1249     }
1250 
1251     // Setup strides to read data from `src` and write to `dst`.
1252     IndexType input_offset = src.offset;
1253     IndexType output_offset = dst.offset;
1254     IndexType input_stride =
1255         NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
1256     IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
1257 
1258     const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
1259     array<BlockIteratorState, at_least_1_dim> it;
1260 
1261     // Initialize block iterator state. Squeeze away any dimension of size 1.
1262     int idx = 0;  // currently initialized iterator state index
1263     for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
1264       const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
1265       if (dst.dims[dst_dim] == 1) continue;
1266 
1267       it[idx].size = dst.dims[dst_dim];
1268       it[idx].input_stride = src.strides[dim_map[dst_dim]];
1269       it[idx].output_stride = dst.strides[dst_dim];
1270 
1271       it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
1272       it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1273 
1274       idx++;
1275     }
1276 
1277     // Iterate copying data from src to dst.
1278     const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
1279 
1280 #define COPY_INNER_DIM(KIND)                                           \
1281   IndexType num_copied = 0;                                            \
1282   for (num_copied = 0; num_copied < block_total_size;                  \
1283        num_copied += dst_inner_dim_size) {                             \
1284     LinCopy::template Run<KIND>(                                       \
1285         typename LinCopy::Dst(output_offset, output_stride, dst.data), \
1286         typename LinCopy::Src(input_offset, input_stride, src.data),   \
1287         dst_inner_dim_size);                                           \
1288                                                                        \
1289     for (int j = 0; j < idx; ++j) {                                    \
1290       if (++it[j].count < it[j].size) {                                \
1291         input_offset += it[j].input_stride;                            \
1292         output_offset += it[j].output_stride;                          \
1293         break;                                                         \
1294       }                                                                \
1295       it[j].count = 0;                                                 \
1296       input_offset -= it[j].input_span;                                \
1297       output_offset -= it[j].output_span;                              \
1298     }                                                                  \
1299   }                                                                    \
1300   return num_copied;
1301 
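    // [Editorial note, not part of the original source] The macro above walks
    // the block like an odometer: after each inner-dim copy it advances the
    // first iterator whose `count` has not yet reached `size` and rewinds the
    // exhausted ones via `input_span`/`output_span`. The dispatch below then
    // picks the cheapest LinCopy kind for the innermost run: contiguous to
    // contiguous (Linear), contiguous source to strided destination (Scatter),
    // a source with `input_stride == 0` that repeatedly reads one coefficient
    // (FillLinear / FillScatter), strided source to contiguous destination
    // (Gather), and strided to strided (Random) as the fallback.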
1302     if (input_stride == 1 && output_stride == 1) {
1303       COPY_INNER_DIM(LinCopy::Kind::Linear);
1304     } else if (input_stride == 1 && output_stride != 1) {
1305       COPY_INNER_DIM(LinCopy::Kind::Scatter);
1306     } else if (input_stride == 0 && output_stride == 1) {
1307       COPY_INNER_DIM(LinCopy::Kind::FillLinear);
1308     } else if (input_stride == 0 && output_stride != 1) {
1309       COPY_INNER_DIM(LinCopy::Kind::FillScatter);
1310     } else if (output_stride == 1) {
1311       COPY_INNER_DIM(LinCopy::Kind::Gather);
1312     } else {
1313       COPY_INNER_DIM(LinCopy::Kind::Random);
1314     }
1315 
1316 #undef COPY_INNER_DIM
1317   }
1318 
1319   // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
1320   // the number of copied elements.
1321   static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
1322                                                               const Src& src) {
1323     DimensionsMap dst_to_src_map;
1324     for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
1325     return Copy(dst, src, dst_to_src_map);
1326   }
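  // [Editorial usage sketch, not part of the original source] Assuming the
  // `Dst` and `Src` helpers defined earlier in this file are constructed as
  // Dst(dims, strides, data[, offset]) and Src(strides, data[, offset]), and
  // given hypothetical buffers `dst_buffer` and `src_buffer` of 24 floats:
  //
  //   typedef TensorBlockIO<float, Eigen::Index, 3, ColMajor> IO;
  //   DSizes<Eigen::Index, 3> dims(4, 3, 2);
  //   IO::Dst dst(dims, internal::strides<ColMajor>(dims), dst_buffer);
  //   IO::Src src(internal::strides<ColMajor>(dims), src_buffer);
  //   IO::Copy(dst, src);  // identity dimension map; returns 24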
1327 
1328  private:
1329   struct BlockIteratorState {
1330     BlockIteratorState()
1331         : size(0),
1332           count(0),
1333           input_stride(0),
1334           output_stride(0),
1335           input_span(0),
1336           output_span(0) {}
1337 
1338     IndexType size;
1339     IndexType count;
1340     IndexType input_stride;
1341     IndexType output_stride;
1342     IndexType input_span;
1343     IndexType output_span;
1344   };
1345 
1346   // Compute how many inner dimensions can be squeezed when doing IO
1347   // between two tensor blocks. It is safe to squeeze inner dimensions only
1348   // if they are not reordered by `dim_map`.
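  // [Editorial example, not part of the original source] With an identity map
  // ({0, 1, 2} for NumDims == 3) every dimension passes the check and this
  // returns 3; with a reordering map such as {2, 1, 0} the innermost dimension
  // already differs, so it returns 0 and no squeezing is performed.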
1349   static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
1350     int num_squeezable_dims = 0;
1351     for (int i = 0; i < NumDims; ++i) {
1352       const int dim = IsColMajor ? i : NumDims - i - 1;
1353       if (dim_map[dim] != dim) break;
1354       num_squeezable_dims++;
1355     }
1356     return num_squeezable_dims;
1357   }
1358 };
1359 
1360 // -------------------------------------------------------------------------- //
1361 // TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
1362 // a Tensor block defined by `desc`, backed by a memory buffer at `target`.
1363 //
1364 // Currently there is no way to write from a Tensor expression to a block of
1365 // memory if the dimensions are reordered. If you need to do that, you should
1366 // materialize a Tensor block expression into a memory buffer, and then use
1367 // TensorBlockIO to copy data between two memory buffers with a custom
1368 // `target->src` dimension map (see definition above).
1369 //
1370 // Also, currently the innermost dimension of `target` must have stride `1`
1371 // (be contiguous in memory). This restriction could be lifted with a
1372 // `pscatter`, but in practice it's never needed, and a similar TensorBlockIO
1373 // workaround exists for that case.
1374 //
1375 // TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
1376 // where `src` is a tensor expression. Explore if it is possible to rewrite IO
1377 // to use expressions instead of pointers, and after that TensorBlockAssignment
1378 // will become an alias to IO.
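// [Editorial usage sketch, not part of the original source] Given a block
// expression `expr` (any tensor expression whose evaluated dimensions equal
// `dims`) and a destination buffer `data` laid out with column-major strides
// (all names hypothetical), the assignment boils down to:
//
//   typedef TensorBlockAssignment<float, 3, ExprType> Assign;  // ExprType = type of `expr`
//   Assign::Run(
//       Assign::target(dims, internal::strides<ColMajor>(dims), data), expr);
//
// Subject to the restrictions above: no dimension reordering, and the
// innermost `target` dimension must have stride 1.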
1379 template <typename Scalar, int NumDims, typename TensorBlockExpr,
1380           typename IndexType = Eigen::Index>
1381 class TensorBlockAssignment {
1382   // We will use the coeff/packet path to evaluate block expressions.
1383   typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
1384       TensorBlockEvaluator;
1385 
1386   typedef DSizes<IndexType, NumDims> Dimensions;
1387 
1388   enum {
1389     Vectorizable = packet_traits<Scalar>::Vectorizable,
1390     PacketSize = packet_traits<Scalar>::size
1391   };
1392 
1393   template <bool Vectorizable, typename Evaluator>
1394   struct InnerDimAssign {
1395     EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1396                                         const Evaluator& eval,
1397                                         IndexType eval_offset) {
1398       for (IndexType i = 0; i < count; ++i) {
1399         target[i] = eval.coeff(eval_offset + i);
1400       }
1401     }
1402   };
1403 
1404   template <typename Evaluator>
1405   struct InnerDimAssign<true, Evaluator> {
1406     EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1407                                         const Evaluator& eval,
1408                                         IndexType eval_offset) {
1409       typedef typename packet_traits<Scalar>::type Packet;
1410 
1411       const IndexType unrolled_size = count - 4 * PacketSize;
1412       const IndexType vectorized_size = count - PacketSize;
1413       IndexType i = 0;
1414 
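      // [Editorial comment, not part of the original source] Three-stage copy:
      // a loop unrolled over four packets at a time, then single packets, and
      // finally a scalar tail for the remaining `count % PacketSize`
      // coefficients.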
1415       for (; i <= unrolled_size; i += 4 * PacketSize) {
1416         for (int j = 0; j < 4; ++j) {
1417           const IndexType idx = eval_offset + i + j * PacketSize;
1418           Packet p = eval.template packet<Unaligned>(idx);
1419           pstoreu<Scalar>(target + i + j * PacketSize, p);
1420         }
1421       }
1422 
1423       for (; i <= vectorized_size; i += PacketSize) {
1424         Packet p = eval.template packet<Unaligned>(eval_offset + i);
1425         pstoreu<Scalar>(target + i, p);
1426       }
1427 
1428       for (; i < count; ++i) {
1429         target[i] = eval.coeff(eval_offset + i);
1430       }
1431     }
1432   };
1433 
1434  public:
1435   struct Target {
1436     Target(const Dimensions& target_dims, const Dimensions& target_strides,
1437            Scalar* target_data, IndexType target_offset = 0)
1438         : dims(target_dims),
1439           strides(target_strides),
1440           data(target_data),
1441           offset(target_offset) {}
1442 
1443     Dimensions dims;
1444     Dimensions strides;
1445     Scalar* data;
1446     IndexType offset;
1447   };
1448 
1449   static Target target(const Dimensions& target_dims,
1450                        const Dimensions& target_strides, Scalar* target_data,
1451                        IndexType target_offset = 0) {
1452     return Target(target_dims, target_strides, target_data, target_offset);
1453   }
1454 
1455   template <typename TargetDimsIndexType, typename TargetStridesIndexType>
1456   static Target target(
1457       const DSizes<TargetDimsIndexType, NumDims>& target_dims,
1458       const DSizes<TargetStridesIndexType, NumDims>& target_strides,
1459       Scalar* target_data, IndexType target_offset = 0) {
1460     // The DSizes constructor will do index type promotion if it's safe.
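    // [Editorial example, not part of the original source] This overload lets
    // a caller pass, e.g., `DSizes<int, NumDims>` dims and strides while
    // IndexType is Eigen::Index; the DSizes conversion promotes the narrower
    // index type, so no explicit casts are needed at the call site.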
1461     return Target(Dimensions(target_dims), Dimensions(target_strides),
1462                   target_data, target_offset);
1463   }
1464 
1465   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1466       const Target& target, const TensorBlockExpr& expr) {
1467     // Prepare evaluator for block expression.
1468     DefaultDevice default_device;
1469     TensorBlockEvaluator eval(expr, default_device);
1470 
1471     // Tensor block expression dimensions should match destination dimensions.
1472     eigen_assert(dimensions_match(target.dims, eval.dimensions()));
1473 
1474     static const int Layout = TensorBlockEvaluator::Layout;
1475     static const bool is_col_major = Layout == ColMajor;
1476 
1477     // Initialize the output inner dimension size based on the layout.
1478     const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
1479     const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
1480     IndexType output_inner_dim_size = target.dims[inner_dim_idx];
1481 
1482     // Target inner dimension stride must be '1'.
1483     eigen_assert(target.strides[inner_dim_idx] == 1);
1484 
1485     // Squeeze multiple inner dims into one if they are contiguous in `target`.
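    // [Editorial example, not part of the original source] For a dense
    // column-major target with dims {8, 2, 4} and strides {1, 8, 16}, the loop
    // below grows `output_inner_dim_size` 8 -> 16 -> 64, so the whole block is
    // written with a single InnerDimAssign call.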
1486     IndexType num_squeezed_dims = 0;
1487     for (Index i = 1; i < NumDims; ++i) {
1488       const Index dim = is_col_major ? i : NumDims - i - 1;
1489       const IndexType target_stride = target.strides[dim];
1490 
1491       if (output_inner_dim_size == target_stride) {
1492         output_inner_dim_size *= target.dims[dim];
1493         num_squeezed_dims++;
1494       } else {
1495         break;
1496       }
1497     }
1498 
1499     // Initialize output block iterator state. Dimensions in this array are
1500     // always in inner_most -> outer_most order (column-major layout).
1501     array<BlockIteratorState, NumDims> it;
1502 
1503     int idx = 0;  // currently initialized iterator state index
1504     for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
1505       const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
1506 
1507       it[idx].count = 0;
1508       it[idx].size = target.dims[dim];
1509       it[idx].output_stride = target.strides[dim];
1510       it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1511       idx++;
1512     }
1513 
1514     // We read the block expression from the beginning and start writing data
1515     // to `target` at the given offset.
1516     IndexType input_offset = 0;
1517     IndexType output_offset = target.offset;
1518 
1519     // Iterate copying data from `eval` to `target`.
1520     for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
1521       // Assign to `target` at current offset.
1522       InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
1523                      TensorBlockEvaluator>::Run(target.data + output_offset,
1524                                                 output_inner_dim_size, eval,
1525                                                 input_offset);
1526 
1527       // Move input offset forward by the number of assigned coefficients.
1528       input_offset += output_inner_dim_size;
1529 
1530       // Update index.
1531       for (int j = 0; j < idx; ++j) {
1532         if (++it[j].count < it[j].size) {
1533           output_offset += it[j].output_stride;
1534           break;
1535         }
1536         it[j].count = 0;
1537         output_offset -= it[j].output_span;
1538       }
1539     }
1540   }
1541 
1542  private:
1543   struct BlockIteratorState {
1544     BlockIteratorState()
1545         : count(0), size(0), output_stride(0), output_span(0) {}
1546 
1547     IndexType count;
1548     IndexType size;
1549     IndexType output_stride;
1550     IndexType output_span;
1551   };
1552 };
1553 
1554 // -------------------------------------------------------------------------- //
1555 
1556 }  // namespace internal
1557 }  // namespace Eigen
1558 
1559 #endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H