Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-18 10:10:52

0001 // @(#)root/tmva/tmva/dnn:$Id$
0002 // Authors: Sitong An, Lorenzo Moneta 10/2019
0003 
0004 /*************************************************************************
0005  * Copyright (C) 2019, ROOT                                              *
0006  * All rights reserved.                                                  *
0007  *                                                                       *
0008  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0009  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0010  *************************************************************************/
0011 
0012 //////////////////////////////////////////////////////////
0013 // Definition of the CpuTensor class used to represent  //
0014 // tensor data  in deep neural nets (CNN, RNN, etc..)   //
0015 //////////////////////////////////////////////////////////
0016 
0017 #ifndef TMVA_DNN_ARCHITECTURES_CPU_CPUTENSOR
0018 #define TMVA_DNN_ARCHITECTURES_CPU_CPUTENSOR
0019 
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <string>

#include "TMatrix.h"
#include "TMVA/Config.h"
#include "CpuBuffer.h"
#include "CpuMatrix.h"
#include <TMVA/RTensor.hxx>
0028 
0029 namespace TMVA {
0030 namespace DNN {
0031 
0032 // CPU Tensor Class
0033 // It is a simple wrapper for TMVA RTensor based on
0034 // memory owned by CPU Buffer
0035 // We need to keep a pointer for CPUBuffer for fast conversion
0036 // without copying to TCpuMatrix
0037 // also provides compatibility with old interface
0038 
0039 template <typename AFloat>
0040 class TCpuTensor : public TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>> {
0041 
0042 private:
0043    //TCpuTensor will have no extra private members than RTensor
0044 public:
0045    friend class TCpuMatrix<AFloat>;
0046 
0047    using Shape_t = typename TMVA::Experimental::RTensor<AFloat>::Shape_t;
0048    using MemoryLayout = TMVA::Experimental::MemoryLayout;
0049    using Matrix_t = TCpuMatrix<AFloat>;
0050    using Scalar_t = AFloat;
0051 
0052    // default constructor
0053    TCpuTensor(): TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(0), {0})
0054    {}
0055 
0056    /** constructors from n m */
0057    TCpuTensor(size_t n, size_t m, MemoryLayout memlayout = MemoryLayout::ColumnMajor)
0058       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(n * m), {n, m}, memlayout)
0059    {}
0060 
0061    /** constructors from batch size, depth, height*width */
0062    TCpuTensor(size_t bsize, size_t depth, size_t hw, MemoryLayout memlayout = MemoryLayout::ColumnMajor)
0063       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(bsize * depth * hw), {depth, hw, bsize}, memlayout)
0064    {
0065       if (memlayout == MemoryLayout::RowMajor)
0066          this->ReshapeInplace({bsize, depth, hw});
0067    }
0068 
0069    /** constructors from batch size, depth, height, width */
0070    TCpuTensor(size_t bsize, size_t depth, size_t height, size_t width,
0071               MemoryLayout memlayout = MemoryLayout::ColumnMajor)
0072       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(bsize * depth * height * width),
0073       {depth, height, width, bsize}, memlayout)
0074    {
0075       if (memlayout == MemoryLayout::RowMajor)
0076          this->ReshapeInplace({bsize, depth, height, width});
0077    }
0078 
0079    /** constructors from a shape.*/
0080    TCpuTensor(Shape_t shape, MemoryLayout memlayout = MemoryLayout::ColumnMajor)
0081       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(TMVA::Experimental::Internal::GetSizeFromShape(shape)),
0082       shape, memlayout)
0083    {}
0084 
0085     /* constructors from a AFloat pointer  and a shape. This is a copy */
0086 
0087    TCpuTensor(AFloat *data, const Shape_t &shape,
0088               MemoryLayout memlayout = MemoryLayout::ColumnMajor)
0089       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(TMVA::Experimental::Internal::GetSizeFromShape(shape)), shape, memlayout)
0090    {
0091       auto& container = *(this->GetContainer());
0092       for (size_t i = 0; i <  this->GetSize(); ++i) container[i] = data[i];
0093    }
0094 
0095 
0096 
0097    /** constructors from a TCpuBuffer and a shape */
0098    //unsafe method for backwards compatibility, const not promised. A view.
0099    TCpuTensor(const TCpuBuffer<AFloat>& buffer, Shape_t shape, MemoryLayout memlayout = MemoryLayout::ColumnMajor)
0100       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(buffer), shape, memlayout) {
0101          R__ASSERT(this->GetSize() <= this->GetContainer()->GetSize());
0102       }
0103 
0104 
0105 
0106    /** constructors from a TCpuMatrix. Memory layout is forced to be same as matrix (i.e. columnlayout) */
0107    //unsafe method for backwards compatibility, const not promised. A view of underlying data.
0108    TCpuTensor(const TCpuMatrix<AFloat> &matrix, size_t dim = 3, MemoryLayout memlayout = MemoryLayout::ColumnMajor)
0109       : TMVA::Experimental::RTensor<AFloat, TCpuBuffer<AFloat>>(std::make_shared<TCpuBuffer<AFloat>>(matrix.GetBuffer()),{matrix.GetNrows(), matrix.GetNcols()}, memlayout)
0110    {
0111 
0112       if (dim >  2) {
0113          Shape_t shape = this->GetShape();
0114 
0115          if (this->GetLayout() == MemoryLayout::ColumnMajor) {
0116             shape.insert(shape.end(),dim-2, 1);
0117          } else {
0118             shape.insert(shape.begin(), dim - 2, 1);
0119          }
0120          this->ReshapeInplace(shape);
0121       }
0122    }
0123 
0124 
0125    /** Convert to a TMatrixT<AFloat_t> object. Performs a deep copy of the matrix
0126     *  elements. */
0127 
0128    operator TMatrixT<AFloat>() const {
0129       // this should work only for size 2 or 4 tensors
0130       if (this->GetShape().size() == 2 || (this->GetShape().size() == 3 && GetFirstSize() == 1)) {
0131          TCpuMatrix<AFloat> temp = GetMatrix();
0132          return temp;
0133       }
0134       // convert as a flat vector
0135       return TMatrixT<AFloat>(1, this->GetSize(), this->GetData());
0136    }
0137 
0138 
0139    /** Return raw pointer to the elements stored contiguously in column-major
0140     *  order. */
0141    AFloat *GetRawDataPointer() { return *(this->GetContainer()); }
0142    const AFloat *GetRawDataPointer() const { return *(this->GetContainer()); }
0143 
0144    // for same API as CudaTensor (device buffer is the CpuBuffer)
0145    const TCpuBuffer<AFloat> & GetDeviceBuffer()     const {return *(this->GetContainer());}
0146    TCpuBuffer<AFloat>       & GetDeviceBuffer()           {return *(this->GetContainer());}
0147 
0148 
0149    size_t GetNoElements() const { return this->GetSize(); }
0150 
0151    // return the size of the first dimension (if in row order) or last dimension if in column order
0152    // Tensor is  F x H x W x...for row order layout FHWC
0153    // or      H x W x ... x F  for column order layout CHWF
0154    // logic copied from TCudaTensor
0155    size_t GetFirstSize() const
0156    {
0157       auto& shape = this->GetShape();
0158       return (this->GetMemoryLayout() == MemoryLayout::ColumnMajor) ? shape.back() : shape.front();
0159    }
0160 
0161    size_t GetCSize() const
0162    {
0163       auto& shape = this->GetShape();
0164       if (shape.size() == 2)  return 1;
0165       return (this->GetMemoryLayout() == MemoryLayout::ColumnMajor) ? shape.front() : shape[1]; // assume NHWC
0166    }
0167    //
0168    size_t GetHSize() const
0169    {
0170       auto& shape = this->GetShape();
0171       if (shape.size() == 2)  return shape[0];
0172       if (shape.size() == 3)  return (this->GetMemoryLayout() == MemoryLayout::ColumnMajor) ? shape[0] : shape[1] ;
0173       if (shape.size() >= 4)  return shape[2] ;
0174       return 0;
0175 
0176    }
0177    size_t GetWSize() const
0178    {
0179       auto& shape = this->GetShape();
0180       if (shape.size() == 2)  return shape[1];
0181       if (shape.size() == 3)  return (this->GetMemoryLayout() == MemoryLayout::ColumnMajor) ? shape[1] : shape[2] ;
0182       if (shape.size() >= 4)  return shape[3] ;
0183       return 0;
0184 
0185    }
0186 
0187    // for backward compatibility (assume column-major
0188    // for backward compatibility : for CM tensor (n1,n2,n3,n4) -> ( n1*n2*n3, n4)
0189    //                              for RM tensor (n1,n2,n3,n4) -> ( n2*n3*n4, n1 ) ???
0190    size_t GetNrows() const { return (GetLayout() == MemoryLayout::ColumnMajor ) ? this->GetStrides().back() : this->GetShape().front();}
0191    size_t GetNcols() const { return (GetLayout() == MemoryLayout::ColumnMajor ) ? this->GetShape().back() : this->GetStrides().front(); }
0192 
0193 
0194    MemoryLayout GetLayout() const { return this->GetMemoryLayout(); }
0195 
0196    //this will be an unsafe view. Method exists for backwards compatibility only
0197    TCpuMatrix<AFloat> GetMatrix() const
0198    {
0199       [[maybe_unused]] size_t ndims = 0;
0200       auto& shape = this->GetShape();
0201       //check if squeezable but do not actually squeeze
0202       for (auto& shape_i : shape){
0203          if (shape_i != 1) {
0204             ndims++;
0205          }
0206       }
0207       assert(ndims <= 2 && shape.size() > 1);  // to support shape cases {n,1}
0208       return TCpuMatrix<AFloat>(*(this->GetContainer()), GetHSize(), GetWSize());
0209    }
0210 
0211    // Create copy, replace and return
0212    TCpuTensor<AFloat> Reshape(Shape_t shape) const
0213    {
0214       TCpuTensor<AFloat> x(*this);
0215       x.ReshapeInplace(shape);
0216       return x;
0217    }
0218 
0219       // return a view of slices in the first dimension (if row wise) or last dimension if column wise
0220       // so single event slices
0221       TCpuTensor<AFloat> At(size_t i)
0222       {
0223          auto &shape = this->GetShape();
0224          auto layout = this->GetMemoryLayout();
0225          Shape_t sliced_shape = (layout == MemoryLayout::RowMajor) ? Shape_t(shape.begin() + 1, shape.end())
0226                                                                    : Shape_t(shape.begin(), shape.end() - 1);
0227 
0228          size_t buffsize = (layout == MemoryLayout::RowMajor) ? this->GetStrides().front() : this->GetStrides().back();
0229          size_t offset = i * buffsize;
0230 
0231          return TCpuTensor<AFloat>(this->GetContainer()->GetSubBuffer(offset, buffsize), sliced_shape, layout);
0232       }
0233 
0234       TCpuTensor<AFloat> At(size_t i) const { return (const_cast<TCpuTensor<AFloat> &>(*this)).At(i); }
0235 
0236       // for compatibility with old tensor (std::vector<matrix>)
0237       TCpuMatrix<AFloat> operator[](size_t i) const {
0238          assert(this->GetMemoryLayout() == MemoryLayout::ColumnMajor );
0239          return At(i).GetMatrix();
0240       }
0241 
0242       // set all the tensor contents to zero
0243       void Zero()
0244       {
0245          AFloat *data = *(this->GetContainer());
0246          for (size_t i = 0; i < this->GetSize(); ++i)
0247             data[i] = 0;
0248       }
0249 
0250       // access single element - assume tensor dim is 2
0251       AFloat &operator()(size_t i, size_t j)
0252       {
0253          auto &shape = this->GetShape();
0254          assert(shape.size() == 2);
0255          return (this->GetMemoryLayout() == MemoryLayout::RowMajor) ? (*(this->GetContainer()))[i * shape[1] + j]
0256                                                                     : (*(this->GetContainer()))[j * shape[0] + i];
0257       }
0258 
0259       // access single element - assume tensor dim is 3. First index i is always the major  independent of row-major or
0260       // column major row- major  I - J - K    . Column- major  is  J - K - I
0261       AFloat &operator()(size_t i, size_t j, size_t k)
0262       {
0263          auto &shape = this->GetShape();
0264          assert(shape.size() == 3);
0265 
0266          return (this->GetMemoryLayout() == MemoryLayout::RowMajor)
0267                    ? (*(this->GetContainer()))[i * shape[1] * shape[2] + j * shape[2] + k]
0268                    : (*(this->GetContainer()))[i * shape[0] * shape[1] + k * shape[0] + j]; // note that is J-K-I
0269       }
0270 
0271       // access single element - assume tensor dim is 2
0272       AFloat operator()(size_t i, size_t j) const
0273       {
0274          auto &shape = this->GetShape();
0275          assert(shape.size() == 2);
0276          return (this->GetMemoryLayout() == MemoryLayout::RowMajor) ? (this->GetData())[i * shape[1] + j]
0277                                                                     : (this->GetData())[j * shape[0] + i];
0278       }
0279 
0280       AFloat operator()(size_t i, size_t j, size_t k) const
0281       {
0282          auto &shape = this->GetShape();
0283          assert(shape.size() == 3);
0284 
0285          return (this->GetMemoryLayout() == MemoryLayout::RowMajor)
0286                    ? (this->GetData())[i * shape[1] * shape[2] + j * shape[2] + k]
0287                    : (this->GetData())[i * shape[0] * shape[1] + k * shape[0] + j]; // note that is J-K-I
0288       }
0289 
0290       /** Map the given function over the matrix elements. Executed in parallel
0291        *  using TThreadExecutor. */
0292       template <typename Function_t>
0293       void Map(Function_t & f);
0294 
0295       /** Same as maps but takes the input values from the tensor \p A and writes
0296        *  the results in this tensor. */
0297       template <typename Function_t>
0298       void MapFrom(Function_t & f, const TCpuTensor<AFloat> &A);
0299 
0300       size_t GetBufferUseCount() const { return this->GetContainer()->GetUseCount(); }
0301 
0302       void Print(const char *name = "Tensor") const
0303       {
0304          PrintShape(name);
0305 
0306          for (size_t i = 0; i < this->GetSize(); i++)
0307             std::cout << (this->GetData())[i] << "  ";
0308          std::cout << std::endl;
0309       }
0310       void PrintShape(const char *name = "Tensor") const
0311       {
0312          std::string memlayout = (GetLayout() == MemoryLayout::RowMajor) ? "RowMajor" : "ColMajor";
0313          std::cout << name << " shape : { ";
0314          auto &shape = this->GetShape();
0315          for (size_t i = 0; i < shape.size() - 1; ++i)
0316             std::cout << shape[i] << " , ";
0317          std::cout << shape.back() << " } "
0318                    << " Layout : " << memlayout << std::endl;
0319       }
0320 };
0321 
0322 //______________________________________________________________________________
0323 template <typename AFloat>
0324 template <typename Function_t>
0325 inline void TCpuTensor<AFloat>::Map(Function_t &f)
0326 {
0327    AFloat *data = GetRawDataPointer();
0328    size_t nelements = GetNoElements();
0329    size_t nsteps = TCpuMatrix<AFloat>::GetNWorkItems(nelements);
0330 
0331    auto ff = [data, &nsteps, &nelements, &f](UInt_t workerID) {
0332       size_t jMax = std::min(workerID + nsteps, nelements);
0333       for (size_t j = workerID; j < jMax; ++j) {
0334          data[j] = f(data[j]);
0335       }
0336       return 0;
0337    };
0338 
0339    if (nsteps < nelements) {
0340       TMVA::Config::Instance().GetThreadExecutor().Foreach(ff, ROOT::TSeqI(0, nelements, nsteps));
0341 
0342       // for (size_t i = 0;  i < nelements; i+=nsteps)
0343       //    ff(i);
0344 
0345    } else {
0346       R__ASSERT(nelements == nsteps);
0347       ff(0);
0348    }
0349 }
0350 
0351 //______________________________________________________________________________
0352 template <typename AFloat>
0353 template <typename Function_t>
0354 inline void TCpuTensor<AFloat>::MapFrom(Function_t &f, const TCpuTensor<AFloat> &A)
0355 {
0356    AFloat *dataB = GetRawDataPointer();
0357    const AFloat *dataA = A.GetRawDataPointer();
0358 
0359    size_t nelements = GetNoElements();
0360    R__ASSERT(nelements == A.GetNoElements());
0361    size_t nsteps = TCpuMatrix<AFloat>::GetNWorkItems(nelements);
0362 
0363    auto ff = [&dataB, &dataA, &nsteps, &nelements, &f](UInt_t workerID) {
0364       size_t jMax = std::min(workerID + nsteps, nelements);
0365       for (size_t j = workerID; j < jMax; ++j) {
0366          dataB[j] = f(dataA[j]);
0367       }
0368       return 0;
0369    };
0370    if (nsteps < nelements) {
0371       TMVA::Config::Instance().GetThreadExecutor().Foreach(ff, ROOT::TSeqI(0, nelements, nsteps));
0372       // for (size_t i = 0;  i < nelements; i+=nsteps)
0373       //    ff(i);
0374 
0375    } else {
0376       R__ASSERT(nelements == nsteps);
0377       ff(0);
0378    }
0379 }
0380 
0381 
0382 } // namespace DNN
0383 } // namespace TMVA
0384 
0385 #endif