Architectures/Cpu/CpuMatrix.h

0001 // @(#)root/tmva/tmva/dnn:$Id$
0002 // Author: Simon Pfreundschuh 20/07/16
0003
0004 /*************************************************************************
0005  * Copyright (C) 2016, Simon Pfreundschuh                                *
0006  * All rights reserved.                                                  *
0007  *                                                                       *
0008  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0009  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0010  *************************************************************************/
0011
0012 //////////////////////////////////////////////////////////
0013 // Definition of the CpuMatrix class used to represent  //
0014 // weight and bias matrices in neural nets.             //
0015 //////////////////////////////////////////////////////////
0016
0017 #ifndef TMVA_DNN_ARCHITECTURES_CPU_CPUMATRIX
0018 #define TMVA_DNN_ARCHITECTURES_CPU_CPUMATRIX
0019
0020 #ifdef R__USE_IMT
0021 #define DL_USE_MTE // use MT with tbb
0022 #endif
0023
0024 #include <cstddef>
0025 #include <vector>
0026
0027 #include "TMatrix.h"
0028 #include "TMVA/Config.h"
0029 #include "CpuBuffer.h"
0030
// #define DEBUG_TMVA_TCPUMATRIX
#if defined(DEBUG_TMVA_TCPUMATRIX)
/*
 * Debug(!) macro for printing matrices.
 *
 * Writes a banner containing `text`, the spelling of the `mat` expression
 * (obtained with the preprocessor stringification `#mat`) and the matrix
 * dimensions to std::cout, followed by one comma-separated line per row.
 * If `mat.GetRawDataPointer()` is null, the macro instead prints
 * "<expression> is null pointer" and terminates the process with exit(1).
 * E.g. `TMVA_DNN_PrintTCpuMatrix(matA, "Test")` _could_ generate
 * "matA is null pointer".
 *
 * The `mat` expression is evaluated exactly once and bound to a local
 * reference, so arbitrary expressions (e.g. `*ptr`) can be passed safely.
 *
 * The macro expands to nothing unless DEBUG_TMVA_TCPUMATRIX is defined.
 *
 * Note: This is a preprocessor macro. It does _not_ respect namespaces.
 *
 * @param mat  Matrix to print; must provide GetRawDataPointer(), GetNrows(),
 *             GetNcols() and operator()(row, column)
 * @param text Label printed in front of the matrix expression
 */
#define TMVA_DNN_PrintTCpuMatrix(mat, text)                                                                   \
   {                                                                                                          \
      auto &&_matref = (mat);                                                                                 \
      auto _dpointer = _matref.GetRawDataPointer();                                                           \
      if (!_dpointer) {                                                                                       \
         std::cout << #mat << " is null pointer" << std::endl;                                                \
         exit(1);                                                                                             \
      }                                                                                                       \
      auto _nrows = _matref.GetNrows();                                                                       \
      auto _ncols = _matref.GetNcols();                                                                       \
      std::cout << "---------------------" << (text) << " " << #mat << "(" << _nrows << "," << _ncols << ")" \
                << "--------------------" << std::endl;                                                       \
      for (size_t _i = 0; _i < _nrows; _i++) {                                                                \
         for (size_t _j = 0; _j < _ncols; _j++) {                                                             \
            std::cout << _matref(_i, _j);                                                                     \
            if (_j < _ncols - 1)                                                                              \
               std::cout << ",";                                                                              \
         }                                                                                                    \
         std::cout << std::endl;                                                                              \
      }                                                                                                       \
   }
#else
#define TMVA_DNN_PrintTCpuMatrix(mat, text)
#endif
0068
0069 namespace TMVA {
0070 namespace DNN {
0071
0072 /** The TCpuMatrix class.
0073  *
0074  * Matrix class for multi-threaded CPU architectures. Uses the TCpuBuffer
0075  * class to store the matrices in column-major format for compatibility with
0076  * BLAS. Provides Map and MapFrom member functions to simplify the application of
0077  * activation functions and derivatives to matrices.
0078  *
0079  * Copying and assignment of TCpuMatrix objects only performs shallow copies, i.e.
0080  * copying is fast and the resulting objects share the element data.
0081  *
0082  * \tparam AFloat The floating point type used to represent the matrix elements.
0083  */
0084 //______________________________________________________________________________
0085 template <typename AFloat>
0086 class TCpuMatrix {
0087 private:
0088    static std::vector<AFloat> fOnes; ///< Vector filled with ones used for BLAS calls.
0089
0090 public:
0091    TCpuBuffer<AFloat> fBuffer; ///< The buffer holding the matrix elements
0092                                ///< in column-major format.
0093 private:
0094    size_t fNCols;
0095    size_t fNRows;
0096
0097 public:
0098    // friend class TCpuTensor<AFloat>;
0099
0100    /** Returns pointer to a vector holding only ones with a guaranteed length
0101     *  of the number of columns of every instantiated CpuMatrix object. */
0102
0103
0104    TCpuBuffer<AFloat>& GetBuffer() {return fBuffer;}
0105    const TCpuBuffer<AFloat>& GetBuffer() const {return fBuffer;}
0106    // for compatible API with Tensor and Matrix in Cuda
0107    TCpuBuffer<AFloat> &GetDeviceBuffer() { return fBuffer; }
0108    const TCpuBuffer<AFloat> &GetDeviceBuffer() const { return fBuffer; }
0109
0110    static const AFloat *GetOnePointer() { return fOnes.data(); }
0111
0112    static size_t GetOnePointerSize() { return fOnes.size(); }
0113
0114    static void InitializeOneVector(size_t n);
0115
0116    TCpuMatrix() : fNCols(0), fNRows(0) {}
0117
0118    /** Construct matrix and allocate space for its elements. */
0119    TCpuMatrix(size_t nRows, size_t nCols);
0120    /** Construct a TCpuMatrix object by (deeply) copying from a
0121     *  TMatrixT<Double_t> matrix. */
0122    TCpuMatrix(const TMatrixT<AFloat> &);
0123    /** Construct a m-times-n matrix from the given buffer. The size must of
0124     *  course match. */
0125    TCpuMatrix(const TCpuBuffer<AFloat> &buffer, size_t m, size_t n);
0126
0127    /** copy from a TMAtrixT . Deep copy without re-creating a new buffer */
0128    TCpuMatrix<AFloat> &operator=(const TMatrixT<AFloat> &);
0129
0130    // N.B the default copy constructor does a shallow copy (NOT a deep one) !
0131    TCpuMatrix(const TCpuMatrix &) = default;
0132    TCpuMatrix(TCpuMatrix &&) = default;
0133    TCpuMatrix &operator=(const TCpuMatrix &) = default;
0134    TCpuMatrix &operator=(TCpuMatrix &&) = default;
0135    ~TCpuMatrix() = default;
0136
0137    /** Clear content of the matrix and initialize to zero elements
0138     */
0139    void Zero();
0140
0141    /** Convert to a TMatrixT<AFloat_t> object. Performs a deep copy of the matrix
0142     *  elements. */
0143    operator TMatrixT<AFloat>() const;
0144
0145    /** Map the given function over the matrix elements. Executed in parallel
0146     *  using TThreadExecutor. */
0147    template <typename Function_t>
0148    void Map(Function_t &f);
0149
0150    /** Same as maps but takes the input values from the matrix \p A and writes
0151     *  the results in this matrix. */
0152    template <typename Function_t>
0153    void MapFrom(Function_t &f, const TCpuMatrix &A);
0154
0155    size_t GetNrows() const { return fNRows; }
0156    size_t GetNcols() const { return fNCols; }
0157    size_t GetNoElements() const { return fNRows * fNCols; }
0158    size_t GetSize() const { return fNRows * fNCols; }
0159
0160    /** Return matrix element in row \p i and column \p j. */
0161    AFloat operator()(size_t i, size_t j) const { return fBuffer[j * fNRows + i]; }
0162    AFloat &operator()(size_t i, size_t j) { return fBuffer[j * fNRows + i]; }
0163
0164    /** Return raw pointer to the elements stored contiguously in column-major
0165     *  order. */
0166    AFloat *GetRawDataPointer() { return fBuffer.data(); }
0167    const AFloat *GetRawDataPointer() const { return fBuffer.data(); }
0168
0169    static Executor &GetThreadExecutor() { return TMVA::Config::Instance().GetThreadExecutor(); }
0170
0171    // static function to get the number of elements for task
0172    static size_t GetNWorkItems(size_t nelements);
0173
0174    // print matrix
0175    void Print() const
0176    {
0177       TCpuMatrix cpuMatrix = *this;
0178       TMVA_DNN_PrintTCpuMatrix(cpuMatrix, "CpuMatrix");
0179    }
0180
0181 private:
0182    void Initialize();
0183 };
0184
0185 template <typename AFloat>
0186 std::vector<AFloat> TCpuMatrix<AFloat>::fOnes{};
0187
0188 // Inline Functions.
0189 //______________________________________________________________________________
0190 template <typename AFloat>
0191 size_t TCpuMatrix<AFloat>::GetNWorkItems(size_t nElements)
0192 {
0193    // nElements should have at least 100
0194    // const size_t nWorkers = TMVA::Config::Instance().GetNCpu();
0195    // return  (nElements > nWorkers) ?  (int) nElements/nWorkers : 1;
0196    const size_t minElements = 1000;
0197    const size_t nCpu = TMVA::Config::Instance().GetNCpu();
0198    if (nElements <= minElements)
0199       return nElements;
0200    if (nElements < nCpu * minElements) {
0201       size_t nt = nElements / minElements;
0202       return nElements / nt;
0203    }
0204    return nElements / nCpu;
0205    // if (nElements < nCpu*20) return nElements/nCpu;
0206    // return nElements/(nCpu*10);
0207 }
0208
0209 //______________________________________________________________________________
0210 template <typename AFloat>
0211 template <typename Function_t>
0212 inline void TCpuMatrix<AFloat>::Map(Function_t &f)
0213 {
0214    AFloat *data = GetRawDataPointer();
0215    size_t nelements = GetNoElements();
0216    size_t nsteps = TCpuMatrix<AFloat>::GetNWorkItems(nelements);
0217
0218    auto ff = [data, &nsteps, &nelements, &f](UInt_t workerID) {
0219       size_t jMax = std::min(workerID + nsteps, nelements);
0220       for (size_t j = workerID; j < jMax; ++j) {
0221          data[j] = f(data[j]);
0222       }
0223       return 0;
0224    };
0225
0226    if (nsteps < nelements) {
0227       TMVA::Config::Instance().GetThreadExecutor().Foreach(ff, ROOT::TSeqI(0, nelements, nsteps));
0228
0229       // for (size_t i = 0;  i < nelements; i+=nsteps)
0230       //    ff(i);
0231
0232    } else {
0233       R__ASSERT(nelements == nsteps);
0234       ff(0);
0235    }
0236 }
0237
0238 //______________________________________________________________________________
0239 template <typename AFloat>
0240 template <typename Function_t>
0241 inline void TCpuMatrix<AFloat>::MapFrom(Function_t &f, const TCpuMatrix &A)
0242 {
0243    AFloat *dataB = GetRawDataPointer();
0244    const AFloat *dataA = A.GetRawDataPointer();
0245
0246    size_t nelements = GetNoElements();
0247    R__ASSERT(nelements == A.GetNoElements());
0248    size_t nsteps = TCpuMatrix<AFloat>::GetNWorkItems(nelements);
0249
0250    auto ff = [&dataB, &dataA, &nsteps, &nelements, &f](UInt_t workerID) {
0251       size_t jMax = std::min(workerID + nsteps, nelements);
0252       for (size_t j = workerID; j < jMax; ++j) {
0253          dataB[j] = f(dataA[j]);
0254       }
0255       return 0;
0256    };
0257    if (nsteps < nelements) {
0258       TMVA::Config::Instance().GetThreadExecutor().Foreach(ff, ROOT::TSeqI(0, nelements, nsteps));
0259       // for (size_t i = 0;  i < nelements; i+=nsteps)
0260       //    ff(i);
0261
0262    } else {
0263       R__ASSERT(nelements == nsteps);
0264       ff(0);
0265    }
0266 }
0267 //______________________________________________________________________________
0268 template <typename AFloat>
0269 void TCpuMatrix<AFloat>::Zero()
0270 {
0271    for (size_t j = 0; j < fNCols; j++) {
0272       for (size_t i = 0; i < fNRows; i++) {
0273          (*this)(i, j) = 0;
0274       }
0275    }
0276 }
0277
0278 } // namespace DNN
0279 } // namespace TMVA
0280
0281 #endif