#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H

namespace Eigen {
namespace internal {

template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO;

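// -------------------------------------------------------------------------- //
// Helper to compute strides for a tensor of the given dimensions and static
// layout: the innermost dimension (first for ColMajor, last for RowMajor) gets
// stride 1, and each following stride is the previous stride times the
// previous dimension.
//
// A minimal usage sketch (illustrative values, not part of the original
// header):
//
//   DSizes<Eigen::Index, 3> dims(2, 3, 4);
//   auto col = internal::strides<ColMajor>(dims);  // col == [1, 2, 6]
//   auto row = internal::strides<RowMajor>(dims);  // row == [12, 4, 1]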
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const DSizes<IndexType, NumDims>& dimensions) {
  DSizes<IndexType, NumDims> strides;
  if (NumDims == 0) return strides;

  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    strides[0] = 1;
    for (int i = 1; i < NumDims; ++i) {
      strides[i] = strides[i - 1] * dimensions[i - 1];
    }
  } else {
    strides[NumDims - 1] = 1;
    for (int i = NumDims - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * dimensions[i + 1];
    }
  }

  return strides;
}

template <int Layout, typename IndexType, size_t NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const Eigen::array<IndexType, NumDims>& dimensions) {
  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
}

template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
    const Sizes<Indices...>& sizes) {
  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}

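// -------------------------------------------------------------------------- //
// TensorBlockShapeType defines the preferred shape of tensor blocks:
//  - kUniformAllDims: blocks with roughly the same extent in every dimension.
//  - kSkewedInnerDims: blocks that give as much extent as possible to the
//    inner (stride-1) dimensions, keeping the outer dimensions small.
//
// TensorBlockResourceRequirements lets an expression describe the block shape
// it prefers, the target block size (converted from bytes to a number of
// coefficients of the given Scalar type) and the cost of computing one block
// coefficient. Requirements coming from different parts of an expression tree
// can be combined with merge().
//
// A minimal usage sketch (hypothetical byte budgets, not part of the original
// header):
//
//   auto lhs = TensorBlockResourceRequirements::skewed<float>(48 * 1024);
//   auto rhs = TensorBlockResourceRequirements::uniform<float>(64 * 1024);
//   auto req = TensorBlockResourceRequirements::merge(lhs, rhs);
//   // req.shape_type == kSkewedInnerDims and req.size == 16384 coefficients:
//   // the skewed shape wins and the larger size budget is kept.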
enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };

struct TensorBlockResourceRequirements {
  TensorBlockShapeType shape_type;
  size_t size;
  TensorOpCost cost_per_coeff;

#ifdef EIGEN_HIPCC
  EIGEN_DEVICE_FUNC
  TensorBlockResourceRequirements(TensorBlockShapeType shape_type_,
                                  size_t size_, TensorOpCost cost_)
      : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {}
#endif

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
      TensorBlockShapeType shape_type, size_t size_in_bytes,
      TensorOpCost cost) {
    const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
    return {shape_type, size, cost};
  }

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
      TensorBlockShapeType shape_type, size_t size_in_bytes) {
    return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
                                    {sizeof(Scalar), sizeof(Scalar), 0});
  }

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
      size_t size_in_bytes) {
    return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
                                    size_in_bytes);
  }

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
      size_t size_in_bytes) {
    return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
                                    size_in_bytes);
  }

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
  merge(const TensorBlockResourceRequirements& lhs,
        const TensorBlockResourceRequirements& rhs) {
    return {merge(lhs.shape_type, rhs.shape_type),
            merge(lhs.size, rhs.size),
            merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};
  }

  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
      TensorOpCost cost) {
    cost_per_coeff += cost;
    return *this;
  }

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
    return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
  }

 private:
  using Requirements = TensorBlockResourceRequirements;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
    return numext::maxi(lhs_size, rhs_size);
  }

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorBlockShapeType
  merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
    return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
            rhs == TensorBlockShapeType::kSkewedInnerDims)
               ? TensorBlockShapeType::kSkewedInnerDims
               : TensorBlockShapeType::kUniformAllDims;
  }

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
                                                TensorOpCost rhs_cost) {
    return lhs_cost + rhs_cost;
  }
};

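// -------------------------------------------------------------------------- //
// TensorBlockDescriptor describes a block requested from a tensor expression:
// its offset (index of the first coefficient in the underlying tensor), its
// dimensions, and an optional DestinationBuffer into which the block may be
// materialized directly. The destination buffer is classified as kEmpty (no
// buffer attached), kContiguous (its strides match the dense strides of the
// block dimensions) or kStrided (an arbitrary strided memory region).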
template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  class DestinationBuffer {
   public:
    enum DestinationBufferKind : int {
      kEmpty,
      kContiguous,
      kStrided
    };

    template <typename Scalar>
    Scalar* data() const {
      eigen_assert(m_data_type_size == sizeof(Scalar));
      return static_cast<Scalar*>(m_data);
    }

    const Dimensions& strides() const { return m_strides; }
    const DestinationBufferKind& kind() const { return m_kind; }

   private:
    friend class TensorBlockDescriptor;

    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}

    template <typename Scalar>
    DestinationBuffer(Scalar* data, const Dimensions& strides,
                      DestinationBufferKind kind)
        : m_data(static_cast<void*>(data)),
          m_data_type_size(sizeof(Scalar)),
          m_strides(strides),
          m_kind(kind) {}

    template <int Layout, typename Scalar>
    static DestinationBuffer make(const TensorBlockDescriptor& desc,
                                  Scalar* data, const Dimensions& strides) {
      return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
    }

    template <int Layout>
    static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
                                      const Dimensions& strides) {
      const Dimensions& desc_dims = desc.dimensions();
      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
      for (int i = 0; i < NumDims; ++i) {
        if (desc_dims[i] == 1) continue;
        if (desc_strides[i] != strides[i]) return kStrided;
      }
      return kContiguous;
    }

    void* m_data;
    size_t m_data_type_size;

    Dimensions m_strides;

    DestinationBufferKind m_kind;
  };

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
                        const DestinationBuffer& destination)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(destination) {}

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(DestinationBuffer()) {}

  IndexType offset() const { return m_offset; }
  const Dimensions& dimensions() const { return m_dimensions; }
  IndexType dimension(int index) const { return m_dimensions[index]; }
  IndexType size() const { return array_prod<IndexType>(m_dimensions); }

  const DestinationBuffer& destination() const { return m_destination; }

  template <int Layout, typename Scalar>
  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
    eigen_assert(dst_base != NULL);
    m_destination =
        DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
  }

  template <int Layout, typename Scalar, typename DstStridesIndexType>
  void AddDestinationBuffer(
      Scalar* dst_base,
      const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
    AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
  }

  TensorBlockDescriptor& DropDestinationBuffer() {
    m_destination.m_data = NULL;
    m_destination.m_kind = DestinationBuffer::kEmpty;
    return *this;
  }

  bool HasDestinationBuffer() const {
    return m_destination.kind() != DestinationBuffer::kEmpty;
  }

  TensorBlockDescriptor WithOffset(IndexType offset) const {
    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
  }

 private:
  const IndexType m_offset;
  const Dimensions m_dimensions;
  DestinationBuffer m_destination;
};

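// -------------------------------------------------------------------------- //
// TensorBlockMapper splits a tensor of the given dimensions into blocks that
// satisfy the given resource requirements, and maps a linear block index to
// the TensorBlockDescriptor (offset + dimensions) of that block.
//
// A minimal usage sketch (hypothetical tensor shape and byte budget, not part
// of the original header):
//
//   DSizes<Eigen::Index, 2> dims(1024, 1024);
//   TensorBlockMapper<2, ColMajor> mapper(
//       dims, TensorBlockResourceRequirements::skewed<float>(64 * 1024));
//   for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
//     auto desc = mapper.blockDescriptor(i);
//     // desc.offset() and desc.dimensions() identify the i-th block.
//   }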
template <int NumDims, int Layout, typename IndexType = Eigen::Index>
class TensorBlockMapper {
  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  TensorBlockMapper() = default;
  TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
                    const TensorBlockResourceRequirements& requirements)
      : m_tensor_dimensions(dimensions), m_requirements(requirements) {
    InitializeBlockDimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
    return m_total_block_count;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
    return m_block_dimensions.TotalSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
  blockDimensions() const {
    return m_block_dimensions;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
  blockDescriptor(IndexType block_index) const {
    static const bool isColMajor = Layout == static_cast<int>(ColMajor);

    IndexType offset = 0;
    DSizes<IndexType, NumDims> dimensions;

    if (NumDims == 0) return BlockDescriptor(offset, dimensions);

    for (int i = NumDims - 1; i >= 0; --i) {
      const int dim = isColMajor ? i : NumDims - i - 1;

      const IndexType idx = block_index / m_block_strides[dim];
      block_index -= idx * m_block_strides[dim];

      const IndexType coord = idx * m_block_dimensions[dim];
      dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
                                     m_block_dimensions[dim]);
      offset += coord * m_tensor_strides[dim];
    }

    return {offset, dimensions};
  }

 private:
  void InitializeBlockDimensions() {
    const TensorBlockShapeType shape_type = m_requirements.shape_type;
    IndexType target_block_size =
        numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));

    IndexType tensor_size = m_tensor_dimensions.TotalSize();

    if (tensor_size == 0) {
      for (int i = 0; i < NumDims; ++i) {
        m_block_dimensions[i] = 1;
      }
      m_total_block_count = 0;
      return;
    }

    if (tensor_size <= target_block_size) {
      m_block_dimensions = m_tensor_dimensions;
      m_total_block_count = 1;

      for (int i = 0; i < NumDims; ++i) {
        m_tensor_strides[i] = 0;
        m_block_strides[i] = 1;
      }
      return;
    }

    static const bool isColMajor = Layout == static_cast<int>(ColMajor);

    if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
      IndexType coeff_to_allocate = target_block_size;

      for (int i = 0; i < NumDims; ++i) {
        const int dim = isColMajor ? i : NumDims - i - 1;
        m_block_dimensions[dim] =
            numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
        coeff_to_allocate = divup(
            coeff_to_allocate,
            numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
      }
      eigen_assert(coeff_to_allocate == 1);

    } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
      const IndexType dim_size_target = convert_index<IndexType>(
          std::pow(static_cast<float>(target_block_size),
                   1.0f / static_cast<float>(m_block_dimensions.rank())));

      for (int i = 0; i < NumDims; ++i) {
        m_block_dimensions[i] =
            numext::mini(dim_size_target, m_tensor_dimensions[i]);
      }

      IndexType total_size = m_block_dimensions.TotalSize();
      for (int i = 0; i < NumDims; ++i) {
        const int dim = isColMajor ? i : NumDims - i - 1;

        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
          const IndexType total_size_other_dims =
              total_size / m_block_dimensions[dim];
          const IndexType alloc_avail =
              divup<IndexType>(target_block_size, total_size_other_dims);
          if (alloc_avail == m_block_dimensions[dim]) {
            break;
          }
          m_block_dimensions[dim] =
              numext::mini(m_tensor_dimensions[dim], alloc_avail);
          total_size = total_size_other_dims * m_block_dimensions[dim];
        }
      }

    } else {
      eigen_assert(false);
    }

    eigen_assert(m_block_dimensions.TotalSize() >=
                 numext::mini<IndexType>(target_block_size,
                                         m_tensor_dimensions.TotalSize()));

    DSizes<IndexType, NumDims> block_count;
    for (int i = 0; i < NumDims; ++i) {
      block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
    }
    m_total_block_count = array_prod(block_count);

    m_tensor_strides = strides<Layout>(m_tensor_dimensions);
    m_block_strides = strides<Layout>(block_count);
  }

  DSizes<IndexType, NumDims> m_tensor_dimensions;
  TensorBlockResourceRequirements m_requirements;

  DSizes<IndexType, NumDims> m_block_dimensions;
  IndexType m_total_block_count;

  DSizes<IndexType, NumDims> m_tensor_strides;
  DSizes<IndexType, NumDims> m_block_strides;
};

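// -------------------------------------------------------------------------- //
// TensorBlockScratchAllocator is a simple scratch-memory pool used during
// block evaluation. Allocations are made through the device, kept in a small
// vector, and reused after reset(); if a reused slot is too small it is
// reallocated with the larger size. All memory is returned to the device in
// the destructor.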
template <typename Device>
class TensorBlockScratchAllocator {
 public:
  explicit TensorBlockScratchAllocator(const Device& device)
      : m_device(device), m_allocation_index(0) {}

  ~TensorBlockScratchAllocator() {
    for (size_t i = 0; i < m_allocations.size(); ++i) {
      m_device.deallocate(m_allocations[i].ptr);
    }
  }

  void* allocate(size_t size) {
    if (m_allocations.capacity() == 0) m_allocations.reserve(8);

    const int num_allocations = static_cast<int>(m_allocations.size());
    const bool has_allocation = m_allocation_index < num_allocations;

    eigen_assert(m_allocation_index <= num_allocations);

    if (has_allocation && m_allocations[m_allocation_index].size < size) {
      m_device.deallocate(m_allocations[m_allocation_index].ptr);
      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
      m_allocations[m_allocation_index].size = size;
    }

    if (!has_allocation) {
      Allocation allocation;
      allocation.ptr = m_device.allocate(size);
      allocation.size = size;
      m_allocations.push_back(allocation);
    }

    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
    eigen_assert(m_allocations[m_allocation_index].size >= size);

    return m_allocations[m_allocation_index++].ptr;
  }

  void reset() { m_allocation_index = 0; }

 private:
  struct Allocation {
    void* ptr;
    size_t size;
  };

  const Device& m_device;
  int m_allocation_index;

  std::vector<Allocation> m_allocations;
};

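// -------------------------------------------------------------------------- //
// TensorBlockKind describes how a tensor block is stored:
//  - kExpr: the block is a lazy expression; coefficients are computed on
//    access and there is no backing buffer (data() returns NULL).
//  - kView: the block is a view into the existing tensor storage.
//  - kMaterializedInScratch: coefficients were copied into scratch memory
//    owned by a TensorBlockScratchAllocator.
//  - kMaterializedInOutput: coefficients were written directly into the
//    destination buffer provided by the block descriptor, so no extra copy
//    is needed.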
enum TensorBlockKind {
  kExpr,
  kView,
  kMaterializedInScratch,
  kMaterializedInOutput
};

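// -------------------------------------------------------------------------- //
// TensorBlockNotImplemented is the block type used by evaluators that do not
// support block access; its XprType is void. The XprScalar helper extracts the
// scalar type of a block expression and maps void to void, so the wrappers
// below can be instantiated even when an argument block has no expression.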
class TensorBlockNotImplemented {
 public:
  typedef void XprType;
};

template <typename XprType>
struct XprScalar {
  typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
  typedef void type;
};

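// -------------------------------------------------------------------------- //
// TensorMaterializedBlock is a fully evaluated block of the original tensor:
// a pointer to materialized coefficients plus the block dimensions, exposed to
// expressions as a TensorMap. The nested Storage class records where those
// coefficients live (a destination buffer or scratch memory), prepareStorage()
// picks the cheapest valid storage for a block descriptor, and materialize()
// either returns a zero-copy kView into the source data or copies the block
// through TensorBlockIO.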
template <typename Scalar, int NumDims, int Layout,
          typename IndexType = Eigen::Index>
class TensorMaterializedBlock {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;

  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
                          const Dimensions& dimensions, bool valid_expr = true)
      : m_kind(kind),
        m_data(data),
        m_dimensions(dimensions),
        m_expr(m_data, m_dimensions),
        m_valid_expr(valid_expr) {
    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
  }

  TensorBlockKind kind() const { return m_kind; }

  const XprType& expr() const {
    eigen_assert(m_valid_expr);
    return m_expr;
  }
  const Scalar* data() const { return m_data; }
  void cleanup() {}

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;

  class Storage {
   public:
    Scalar* data() const { return m_data; }
    const Dimensions& dimensions() const { return m_dimensions; }
    const Dimensions& strides() const { return m_strides; }

    TensorMaterializedBlock AsTensorMaterializedBlock() const {
      return TensorMaterializedBlock(
          m_materialized_in_output
              ? internal::TensorBlockKind::kMaterializedInOutput
              : internal::TensorBlockKind::kMaterializedInScratch,
          m_data, m_dimensions, !m_strided_storage);
    }

   private:
    friend class TensorMaterializedBlock;

    Storage(Scalar* data, const Dimensions& dimensions,
            const Dimensions& strides, bool materialized_in_output,
            bool strided_storage)
        : m_data(data),
          m_dimensions(dimensions),
          m_strides(strides),
          m_materialized_in_output(materialized_in_output),
          m_strided_storage(strided_storage) {}

    Scalar* m_data;
    Dimensions m_dimensions;
    Dimensions m_strides;
    bool m_materialized_in_output;
    bool m_strided_storage;
  };

  template <typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static Storage prepareStorage(
      TensorBlockDesc& desc, TensorBlockScratch& scratch,
      bool allow_strided_storage = false) {
    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;

    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
      Scalar* buffer = desc.destination().template data<Scalar>();
      desc.DropDestinationBuffer();
      return Storage(buffer, desc.dimensions(),
                     internal::strides<Layout>(desc.dimensions()),
                     true, false);

    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
               allow_strided_storage) {
      Scalar* buffer = desc.destination().template data<Scalar>();
      desc.DropDestinationBuffer();
      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
                     true, true);

    } else {
      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
      return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
                     internal::strides<Layout>(desc.dimensions()),
                     false, false);
    }
  }

  template <typename DataDimensions, typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
      const Scalar* data, const DataDimensions& data_dims,
      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());

    static const bool is_col_major = Layout == ColMajor;

    int num_matching_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (data_dims[dim] != desc.dimensions()[dim]) break;
      ++num_matching_inner_dims;
    }

    bool can_use_direct_access = true;
    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (desc.dimension(dim) != 1) {
        can_use_direct_access = false;
        break;
      }
    }

    if (can_use_direct_access) {
      const Scalar* block_start = data + desc.offset();
      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
                                     block_start, desc.dimensions());

    } else {
      const Storage storage = prepareStorage(desc, scratch);

      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
          TensorBlockIO;
      typedef typename TensorBlockIO::Dst TensorBlockIODst;
      typedef typename TensorBlockIO::Src TensorBlockIOSrc;

      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
                           data, desc.offset());
      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
                           storage.data());

      TensorBlockIO::Copy(dst, src);
      return storage.AsTensorMaterializedBlock();
    }
  }

 private:
  TensorBlockKind m_kind;
  const Scalar* m_data;
  Dimensions m_dimensions;
  XprType m_expr;
  bool m_valid_expr;
};

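// -------------------------------------------------------------------------- //
// TensorCwiseUnaryBlock is a lazy tensor block that applies a unary functor to
// an argument block. It only builds an expression; data() returns NULL, and if
// the argument block has no expression type (void XprType) the wrapper's
// XprType is void as well.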
template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
  static const bool NoArgBlockAccess =
      internal::is_void<typename ArgTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
      type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
      : m_arg_block(arg_block), m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  UnaryOp m_functor;
};

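// -------------------------------------------------------------------------- //
// TensorCwiseBinaryBlock is a lazy tensor block that applies a binary functor
// to a left-hand side and a right-hand side argument block; its XprType falls
// back to void if either argument block has no expression.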
template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
  static const bool NoArgBlockAccess =
      internal::is_void<typename LhsTensorBlock::XprType>::value ||
      internal::is_void<typename RhsTensorBlock::XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
                          const typename RhsTensorBlock::XprType> >::type
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
                         const RhsTensorBlock& right_block,
                         const BinaryOp& functor)
      : m_left_block(left_block),
        m_right_block(right_block),
        m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const {
    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
  }

  const Scalar* data() const { return NULL; }

  void cleanup() {
    m_left_block.cleanup();
    m_right_block.cleanup();
  }

 private:
  LhsTensorBlock m_left_block;
  RhsTensorBlock m_right_block;
  BinaryOp m_functor;
};

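// -------------------------------------------------------------------------- //
// TensorUnaryExprBlock is a lazy tensor block that builds an arbitrary unary
// expression around an argument block, delegating construction of the
// expression type to a BlockFactory.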
template <typename BlockFactory, typename ArgTensorBlock>
class TensorUnaryExprBlock {
  typedef typename ArgTensorBlock::XprType ArgXprType;
  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
                       const BlockFactory& factory)
      : m_arg_block(arg_block), m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  BlockFactory m_factory;
};

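// -------------------------------------------------------------------------- //
// TensorTernaryExprBlock is the ternary counterpart: a lazy tensor block that
// builds an expression from three argument blocks through a BlockFactory, with
// a void XprType if any of the arguments has no expression.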
template <typename BlockFactory, typename Arg1TensorBlock,
          typename Arg2TensorBlock, typename Arg3TensorBlock>
class TensorTernaryExprBlock {
  typedef typename Arg1TensorBlock::XprType Arg1XprType;
  typedef typename Arg2TensorBlock::XprType Arg2XprType;
  typedef typename Arg3TensorBlock::XprType Arg3XprType;

  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
                                       internal::is_void<Arg2XprType>::value ||
                                       internal::is_void<Arg3XprType>::value;

 public:
  typedef typename conditional<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
                                              Arg3XprType>::type>::type XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
                         const Arg2TensorBlock& arg2_block,
                         const Arg3TensorBlock& arg3_block,
                         const BlockFactory& factory)
      : m_arg1_block(arg1_block),
        m_arg2_block(arg2_block),
        m_arg3_block(arg3_block),
        m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const {
    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
                          m_arg3_block.expr());
  }
  const Scalar* data() const { return NULL; }
  void cleanup() {
    m_arg1_block.cleanup();
    m_arg2_block.cleanup();
    m_arg3_block.cleanup();
  }

 private:
  Arg1TensorBlock m_arg1_block;
  Arg2TensorBlock m_arg2_block;
  Arg3TensorBlock m_arg3_block;
  BlockFactory m_factory;
};

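// -------------------------------------------------------------------------- //
// StridedLinearBufferCopy copies 'count' coefficients from a source buffer to
// a destination buffer, where each side has a fixed offset and stride. The
// Kind enum selects a specialization for the stride pattern:
//  - Linear:      src_stride == 1 && dst_stride == 1
//  - Scatter:     src_stride == 1 && dst_stride != 1
//  - FillLinear:  src_stride == 0 && dst_stride == 1 (broadcast one value)
//  - FillScatter: src_stride == 0 && dst_stride != 1
//  - Gather:      dst_stride == 1, arbitrary src_stride
//  - Random:      arbitrary strides on both sides (scalar loop)
// Copies are vectorized with packet loads/stores when the scalar type supports
// it.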
template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
  typedef typename packet_traits<Scalar>::type Packet;
  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

 public:
  enum class Kind {
    Linear = 0,
    Scatter = 1,
    FillLinear = 2,
    FillScatter = 3,
    Gather = 4,
    Random = 5
  };

  struct Dst {
    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    Scalar* data;
  };

  struct Src {
    Src(IndexType o, IndexType s, const Scalar* d)
        : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    const Scalar* data;
  };

  template <typename StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
                                                        const Src& src,
                                                        const size_t count) {
    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
              src.data);
  }

 private:
  template <typename StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const IndexType count, const IndexType dst_offset,
      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
      const IndexType src_offset, const IndexType src_stride,
      const Scalar* EIGEN_RESTRICT src_data) {
    const Scalar* src = &src_data[src_offset];
    Scalar* dst = &dst_data[dst_offset];

    if (!Vectorizable) {
      for (Index i = 0; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
      return;
    }

    const IndexType vectorized_size = count - PacketSize;
    IndexType i = 0;

    if (kind == StridedLinearBufferCopy::Kind::Linear) {
      const IndexType unrolled_size = count - 4 * PacketSize;
      eigen_assert(src_stride == 1 && dst_stride == 1);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          Packet p = ploadu<Packet>(src + i + j * PacketSize);
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
      eigen_assert(src_stride == 1 && dst_stride != 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
      eigen_assert(src_stride == 0 && dst_stride == 1);
      const IndexType unrolled_size = count - 4 * PacketSize;
      Packet p = pload1<Packet>(src);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = *src;
      }

    } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
      eigen_assert(src_stride == 0 && dst_stride != 1);
      Packet p = pload1<Packet>(src);
      for (; i <= vectorized_size; i += PacketSize) {
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = *src;
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
      eigen_assert(dst_stride == 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i * src_stride];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Random) {
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
    } else {
      eigen_assert(false);
    }
  }
};

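// -------------------------------------------------------------------------- //
// TensorBlockIO copies data between two multi-dimensional strided memory
// regions (a Dst and a Src), optionally remapping destination dimensions to
// source dimensions through a dst-to-src dimension map. It squeezes matching
// contiguous inner dimensions into a single larger inner dimension and then
// dispatches each inner-dimension copy to the appropriate
// StridedLinearBufferCopy kind, iterating over the remaining outer dimensions.
// Copy() returns the number of coefficients copied.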
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO {
  static const bool IsColMajor = (Layout == ColMajor);

  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef DSizes<int, NumDims> DimensionsMap;

  struct Dst {
    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
        IndexType dst_offset = 0)
        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  struct Src {
    Src(const Dimensions& src_strides, const Scalar* src,
        IndexType src_offset = 0)
        : strides(src_strides), data(src), offset(src_offset) {}

    Dimensions strides;
    const Scalar* data;
    IndexType offset;
  };

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
      const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
    if (NumDims == 0) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }

    {
      int inner_dim = IsColMajor ? 0 : NumDims - 1;
      EIGEN_UNUSED_VARIABLE(inner_dim);
      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
    }

    const DimensionsMap& dim_map = dst_to_src_dim_map;

    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);

    int num_size_one_inner_dims = 0;
    for (int i = 0; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      if (dst.dims[dst_dim] != 1) break;
      num_size_one_inner_dims++;
    }

    if (num_size_one_inner_dims == NumDims) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }

    const int dst_stride1_dim = IsColMajor
                                    ? num_size_one_inner_dims
                                    : NumDims - num_size_one_inner_dims - 1;

    const int src_dim_for_dst_stride1_dim =
        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];

    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];

    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      const IndexType dst_stride = dst.strides[dst_dim];
      const IndexType src_stride = src.strides[dim_map[dst_dim]];
      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
        dst_inner_dim_size *= dst.dims[dst_dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    IndexType input_offset = src.offset;
    IndexType output_offset = dst.offset;
    IndexType input_stride =
        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> it;

    int idx = 0;
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
      if (dst.dims[dst_dim] == 1) continue;

      it[idx].size = dst.dims[dst_dim];
      it[idx].input_stride = src.strides[dim_map[dst_dim]];
      it[idx].output_stride = dst.strides[dst_dim];

      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);

      idx++;
    }

    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();

#define COPY_INNER_DIM(KIND)                                           \
  IndexType num_copied = 0;                                            \
  for (num_copied = 0; num_copied < block_total_size;                  \
       num_copied += dst_inner_dim_size) {                             \
    LinCopy::template Run<KIND>(                                       \
        typename LinCopy::Dst(output_offset, output_stride, dst.data), \
        typename LinCopy::Src(input_offset, input_stride, src.data),   \
        dst_inner_dim_size);                                           \
                                                                       \
    for (int j = 0; j < idx; ++j) {                                    \
      if (++it[j].count < it[j].size) {                                \
        input_offset += it[j].input_stride;                            \
        output_offset += it[j].output_stride;                          \
        break;                                                         \
      }                                                                \
      it[j].count = 0;                                                 \
      input_offset -= it[j].input_span;                                \
      output_offset -= it[j].output_span;                              \
    }                                                                  \
  }                                                                    \
  return num_copied;

    if (input_stride == 1 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::Linear);
    } else if (input_stride == 1 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Kind::Scatter);
    } else if (input_stride == 0 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::FillLinear);
    } else if (input_stride == 0 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Kind::FillScatter);
    } else if (output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::Gather);
    } else {
      COPY_INNER_DIM(LinCopy::Kind::Random);
    }

#undef COPY_INNER_DIM
  }

  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
                                                              const Src& src) {
    DimensionsMap dst_to_src_map;
    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
    return Copy(dst, src, dst_to_src_map);
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : size(0),
          count(0),
          input_stride(0),
          output_stride(0),
          input_span(0),
          output_span(0) {}

    IndexType size;
    IndexType count;
    IndexType input_stride;
    IndexType output_stride;
    IndexType input_span;
    IndexType output_span;
  };

  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
    int num_squeezable_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      if (dim_map[dim] != dim) break;
      num_squeezable_dims++;
    }
    return num_squeezable_dims;
  }
};

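// -------------------------------------------------------------------------- //
// TensorBlockAssignment assigns a block expression of type TensorBlockExpr to
// a strided memory region described by Target. The expression is evaluated
// coefficient by coefficient (or packet by packet when both the scalar type
// and the block evaluator support packet access) and written to the target,
// squeezing contiguous inner dimensions to maximize the length of the inner
// assignment loop.
//
// A minimal usage sketch (the block expression and its dimensions/strides are
// hypothetical names, not from the original header):
//
//   typedef TensorBlockAssignment<float, NumDims, BlockExpr> Assign;
//   Assign::Run(
//       Assign::target(block_dims, block_strides, output_ptr, output_offset),
//       block_expr);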
template <typename Scalar, int NumDims, typename TensorBlockExpr,
          typename IndexType = Eigen::Index>
class TensorBlockAssignment {
  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
      TensorBlockEvaluator;

  typedef DSizes<IndexType, NumDims> Dimensions;

  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

  template <bool Vectorizable, typename Evaluator>
  struct InnerDimAssign {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      for (IndexType i = 0; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

  template <typename Evaluator>
  struct InnerDimAssign<true, Evaluator> {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      typedef typename packet_traits<Scalar>::type Packet;

      const IndexType unrolled_size = count - 4 * PacketSize;
      const IndexType vectorized_size = count - PacketSize;
      IndexType i = 0;

      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          const IndexType idx = eval_offset + i + j * PacketSize;
          Packet p = eval.template packet<Unaligned>(idx);
          pstoreu<Scalar>(target + i + j * PacketSize, p);
        }
      }

      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = eval.template packet<Unaligned>(eval_offset + i);
        pstoreu<Scalar>(target + i, p);
      }

      for (; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

 public:
  struct Target {
    Target(const Dimensions& target_dims, const Dimensions& target_strides,
           Scalar* target_data, IndexType target_offset = 0)
        : dims(target_dims),
          strides(target_strides),
          data(target_data),
          offset(target_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  static Target target(const Dimensions& target_dims,
                       const Dimensions& target_strides, Scalar* target_data,
                       IndexType target_offset = 0) {
    return Target(target_dims, target_strides, target_data, target_offset);
  }

  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
  static Target target(
      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
      Scalar* target_data, IndexType target_offset = 0) {
    return Target(Dimensions(target_dims), Dimensions(target_strides),
                  target_data, target_offset);
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const Target& target, const TensorBlockExpr& expr) {
    DefaultDevice default_device;
    TensorBlockEvaluator eval(expr, default_device);

    eigen_assert(dimensions_match(target.dims, eval.dimensions()));

    static const int Layout = TensorBlockEvaluator::Layout;
    static const bool is_col_major = Layout == ColMajor;

    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
    IndexType output_inner_dim_size = target.dims[inner_dim_idx];

    eigen_assert(target.strides[inner_dim_idx] == 1);

    IndexType num_squeezed_dims = 0;
    for (Index i = 1; i < NumDims; ++i) {
      const Index dim = is_col_major ? i : NumDims - i - 1;
      const IndexType target_stride = target.strides[dim];

      if (output_inner_dim_size == target_stride) {
        output_inner_dim_size *= target.dims[dim];
        num_squeezed_dims++;
      } else {
        break;
      }
    }

    array<BlockIteratorState, NumDims> it;

    int idx = 0;
    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;

      it[idx].count = 0;
      it[idx].size = target.dims[dim];
      it[idx].output_stride = target.strides[dim];
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
      idx++;
    }

    IndexType input_offset = 0;
    IndexType output_offset = target.offset;

    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
                     TensorBlockEvaluator>::Run(target.data + output_offset,
                                                output_inner_dim_size, eval,
                                                input_offset);

      input_offset += output_inner_dim_size;

      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        output_offset -= it[j].output_span;
      }
    }
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : count(0), size(0), output_stride(0), output_span(0) {}

    IndexType count;
    IndexType size;
    IndexType output_stride;
    IndexType output_span;
  };
};

}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H