DNN/Architectures/Cpu.h

0001 // @(#)root/tmva/tmva/dnn:$Id$
0002 // Author: Simon Pfreundschuh 05/07/16
0003
0004 /*************************************************************************
0005  * Copyright (C) 2016, Simon Pfreundschuh                                *
0006  * All rights reserved.                                                  *
0007  *                                                                       *
0008  * For the licensing terms see $ROOTSYS/LICENSE.                         *
0009  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
0010  *************************************************************************/
0011
0012  //////////////////////////////////////////////////////////////////
0013  // Definition of the TCpu architecture, which provides a         //
0014  // multi-threaded CPU implementation of the low-level interface //
0015  // networks for Cpus using BLAS and Roots TThreadExecutor            //
0016  //////////////////////////////////////////////////////////////////
0017
0018 #ifndef TMVA_DNN_ARCHITECTURES_CPU
0019 #define TMVA_DNN_ARCHITECTURES_CPU
0020
0021 #include "TMVA/DNN/Functions.h"
0022 #include "TMVA/DNN/CNN/ContextHandles.h"
0023 //#include "TMVA/DNN/CNN/Descriptors.h"
0024 #include "TMVA/DNN/GeneralLayer.h"
0025 #include "TMVA/DNN/BatchNormLayer.h"
0026 #include "TMVA/DNN/CNN/ConvLayer.h"
0027 #include "TMVA/DNN/CNN/MaxPoolLayer.h"
0028 #include "TMVA/DNN/RNN/RNNLayer.h"
0029
0030 #include "TMVA/DNN/Architectures/Cpu/CpuBuffer.h"
0031 #include "TMVA/DNN/Architectures/Cpu/CpuMatrix.h"
0032 #include "TMVA/DNN/Architectures/Cpu/CpuTensor.h"
0033
0034 #include <vector>
0035 #include <string>
0036
0037 class TRandom;
0038
0039 namespace TMVA
0040 {
0041 namespace DNN
0042 {
0043    //class EActivationFunction;
0044  struct DummyDescriptor {};
0045  struct DummyFilterDescriptor {};
0046  struct DummyConvolutionDescriptor {};
0047  struct DummyDropoutDescriptor {};
0048  struct DummyPoolingDescriptor {};
0049  struct DummyConvolutionFwdAlgo {};
0050  struct DummyConvolutionBwdDataAlgo {};
0051  struct DummyConvolutionBwdFilterAlgo {};
0052  struct DummyDataType {};
0053
0054  struct DummyEmptyDescriptor {};
0055
0056 /** The TCpu architecture class.
0057  *
0058  * Low-level interface class for multi-threaded CPU architectures. Contains as
0059  * public types the declaration of the scalar, matrix and data loader types
0060  * for this architecture as well as the remaining functions in the low-level
0061  * interface in the form of static members.
0062  */
0063 template<typename AReal = Float_t>
0064 class TCpu
0065 {
0066 private:
0067    static TRandom * fgRandomGen;
0068 public:
0069    using Scalar_t       = AReal;
0070    using Tensor_t       = TCpuTensor<AReal>;
0071    using Matrix_t       = TCpuMatrix<AReal>;
0072    using HostBuffer_t   = TCpuBuffer<AReal>;
0073    using DeviceBuffer_t = TCpuBuffer<AReal>;
0074
0075    using ActivationDescriptor_t  = DummyDescriptor;
0076    using ConvolutionDescriptor_t = DummyDescriptor;
0077    using FilterDescriptor_t      = DummyDescriptor;
0078    using DropoutDescriptor_t     = DummyDescriptor;
0079    using PoolingDescriptor_t     = DummyDescriptor;
0080    using TensorDescriptor_t      = DummyDescriptor;
0081
0082    using AlgorithmForward_t      = DummyConvolutionFwdAlgo;
0083    using AlgorithmBackward_t     = DummyConvolutionBwdDataAlgo;
0084    using AlgorithmHelper_t       = DummyConvolutionBwdFilterAlgo;
0085    using AlgorithmDataType_t     = DummyDataType;
0086    using ReduceTensorDescriptor_t = DummyDataType;
0087    using RecurrentDescriptor_t    = DummyDataType;
0088
0089    using EmptyDescriptor_t       = DummyDescriptor; // Used if a descriptor is not needed in a class
0090
0091    using GenLayer_t              = VGeneralLayer<TCpu<AReal>>;
0092    using BNormLayer_t            = TBatchNormLayer<TCpu<AReal>>;
0093    using BNormDescriptors_t      = TDNNGenDescriptors<BNormLayer_t>;
0094
0095    using ConvLayer_t             = CNN::TConvLayer<TCpu<AReal>>;
0096    using ConvDescriptors_t       = CNN::TCNNDescriptors<ConvLayer_t>;
0097    using ConvWorkspace_t         = CNN::TCNNWorkspace<ConvLayer_t>;
0098    using PoolingLayer_t          = CNN::TMaxPoolLayer<TCpu<AReal>>;
0099    using PoolingDescriptors_t    = CNN::TCNNDescriptors<PoolingLayer_t>;
0100    using PoolingWorkspace_t      = CNN::TCNNWorkspace<PoolingLayer_t>;
0101
0102    using RNNDescriptors_t = RNN::TRNNDescriptors<TCpu<AReal>>;
0103    using RNNWorkspace_t = RNN::TRNNWorkspace<TCpu<AReal>>;
0104
0105
0106    static TMVA::Experimental::MemoryLayout GetTensorLayout() { return TMVA::Experimental::MemoryLayout::ColumnMajor; }
0107
0108    static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w) {
0109       return Tensor_t( {c,h*w,n}, GetTensorLayout());
0110    }
0111    static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w) {
0112       return Tensor_t( buffer, {c,h*w,n}, GetTensorLayout());
0113    }
0114    static Tensor_t CreateTensor(size_t b, size_t t, size_t w)
0115    {
0116       return Tensor_t({t, w, b}, GetTensorLayout());
0117    }
0118    static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t b, size_t t, size_t w)
0119    {
0120       return Tensor_t(buffer, {t, w, b}, GetTensorLayout());
0121    }
0122    // create a weight tensor/matrix vector   from another tensor/weight  vector using the given tensor shapes
0123    // this function is used by the optimizers to store intermediate weights representations
0124    static void  CreateWeightTensors( std::vector<Matrix_t> & newWeights, const std::vector<Matrix_t> & weights) {
0125       if (!newWeights.empty()) newWeights.clear();
0126       size_t n =  weights.size();
0127       for (size_t i = 0; i < n; ++i)
0128          newWeights.emplace_back( weights[i].GetNrows(), weights[i].GetNcols());
0129    }
0130
0131    static bool IsCudnn() { return false; }
0132    //____________________________________________________________________________
0133    //
0134    // Architecture Initialization
0135    //____________________________________________________________________________
0136
0137    /** Initialize CNN data/operator descriptors. Not used at the moment.*/
0138
0139    static void InitializeBNormDescriptors(TDescriptors * & /*descriptors*/,
0140                                           BNormLayer_t * /*L = nullptr*/) {}
0141
0142    static void InitializeConvDescriptors(TDescriptors * & /*descriptors*/,
0143                                          ConvLayer_t * /*L = nullptr*/) {}
0144    static void InitializePoolDescriptors(TDescriptors * & /*descriptors*/,
0145                                          PoolingLayer_t * /*L = nullptr*/) {}
0146    static void InitializeRNNDescriptors(TDescriptors *& /*descriptors*/, GenLayer_t * /*L*/) {}
0147    static void InitializeLSTMDescriptors(TDescriptors *& /*descriptors*/, GenLayer_t * /*L*/) {}
0148    static void InitializeGRUDescriptors(TDescriptors *& /*descriptors*/, GenLayer_t * /*L*/) {}
0149
0150    static void InitializeActivationDescriptor(ActivationDescriptor_t &/*descriptors*/, EActivationFunction /*activFunc */ , double /*coef*/ = 0.0) {}
0151
0152    /** Release CNN data/operator descriptors. Not used at the moment.*/
0153    static void ReleaseConvDescriptors(TDescriptors * & /*descriptors*/) {}
0154    static void ReleasePoolDescriptors(TDescriptors * & /*descriptors*/) {}
0155    static void ReleaseBNormDescriptors(TDescriptors * & /*descriptors*/) {}
0156    static void ReleaseRNNDescriptors(TDescriptors *& /*descriptors*/) {}
0157
0158    static void InitializeConvWorkspace(TWorkspace * & /*workspace*/,
0159                                        TDescriptors * & /*descriptors*/,
0160                                        const DNN::CNN::TConvParams & /*params*/,
0161                                        ConvLayer_t * /*L = nullptr*/) {}
0162    static void InitializePoolDropoutWorkspace(TWorkspace * & /*workspace*/,
0163                                        TDescriptors * & /*descriptors*/,
0164                                        const DNN::CNN::TConvParams & /*params*/,
0165                                        PoolingLayer_t * /*L = nullptr*/) {}
0166    static void InitializeRNNWorkspace(TWorkspace *& /*workspace*/, TDescriptors *& /*descriptors*/, GenLayer_t * /*L*/) {}
0167    static void InitializeLSTMWorkspace(TWorkspace *& /*workspace*/, TDescriptors *& /*descriptors*/, GenLayer_t * /*L*/){}
0168    static void InitializeGRUWorkspace(TWorkspace *& /*workspace*/, TDescriptors *& /*descriptors*/, GenLayer_t * /*L*/){}
0169
0170    static void FreeConvWorkspace(TWorkspace * & /*workspace*/) {}   ///< Only used for certain cudnn on-device memory
0171    static void FreePoolDropoutWorkspace(TWorkspace * & /*workspace*/) {}
0172    static void FreeRNNWorkspace(TWorkspace *& /*workspace*/) {}
0173
0174    static void ReleaseDescriptor(ActivationDescriptor_t &  /* activationDescr */) {}
0175
0176    static void InitializeRNNTensors(GenLayer_t * /*layer*/)   {}
0177    static void InitializeLSTMTensors(GenLayer_t * /*layer*/) {}
0178    static void InitializeGRUTensors(GenLayer_t * /*layer*/) {}
0179
0180    //____________________________________________________________________________
0181    //
0182    // Propagation
0183    //____________________________________________________________________________
0184
0185    /** @name Forward Propagation
0186     * Low-level functions required for the forward propagation of activations
0187     * through the network.
0188     */
0189    ///@{
0190    /** Matrix-multiply \p input with the transpose of \p weights and
0191     *  write the results into \p output. */
0192    static void MultiplyTranspose(Matrix_t &output, const Matrix_t &input, const Matrix_t &weights);
0193
0194    static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights) {
0195       Matrix_t output_matrix = output.GetMatrix();
0196       MultiplyTranspose( output_matrix, input.GetMatrix(), weights);
0197       //ensor_t::MatrixToTensor(output_matrix, output); // this maybe is not needed
0198    }
0199
0200    /** Add the vectors biases row-wise to the matrix output */
0201    static void AddRowWise(Matrix_t &output,const Matrix_t &biases);
0202
0203    static void AddRowWise(Tensor_t &output, const Matrix_t &biases) {
0204       Matrix_t output_matrix = output.GetMatrix();
0205       AddRowWise(output_matrix, biases);
0206       //Tensor_t::MatrixToTensor(output_matrix, output); // this maybe is not needed
0207    }
0208
0209    /** @name Backward Propagation (Dense Layers)
0210     * Low-level functions required for the forward propagation of activations
0211     * through the network.
0212     */
0213    ///@{
0214    /** Perform the complete backward propagation step. If the provided
0215     *  \p activationGradientsBackward matrix is not empty, compute the
0216     *  gradients of the objective function with respect to the activations
0217     *  of the previous layer (backward direction).
0218     *  Also compute the weight and the bias gradients. Modifies the values
0219     *  in \p df and thus produces only a valid result, if it is applied the
0220     *  first time after the corresponding forward propagation has been per-
0221     *  formed. */
0222    static void Backward(Tensor_t & activationGradientsBackward,
0223                         Matrix_t & weightGradients,
0224                         Matrix_t & biasGradients,
0225                         const Tensor_t & df,
0226                         const Tensor_t & activationGradients,
0227                         const Matrix_t & weights,
0228                         const Tensor_t & activationBackward);
0229
0230
0231    /** Adds a the elements in matrix B scaled by c to the elements in
0232     *  the matrix A. This is required for the weight update in the gradient
0233     *  descent step.*/
0234    static void ScaleAdd(Matrix_t & A,
0235                         const Matrix_t & B,
0236                         Scalar_t beta = 1.0);
0237
0238    static void Copy(Matrix_t & B,
0239                     const Matrix_t & A);
0240
0241    // copy from another type of matrix
0242    template<typename AMatrix_t>
0243    static void CopyDiffArch(Matrix_t & B, const AMatrix_t & A);
0244
0245
0246    /** Above functions extended to vectors */
0247    static void ScaleAdd(Tensor_t & A,
0248                         const Tensor_t & B,
0249                         Scalar_t beta = 1.0);
0250
0251    static void Copy(Tensor_t & A,
0252                     const Tensor_t & B);
0253
0254    // copy from another tensor
0255    template<typename ATensor_t>
0256    static void CopyDiffArch(Tensor_t & A,
0257                      const ATensor_t & B);
0258
0259    // copy from vector of matrices of different types
0260    template<typename AMatrix_t>
0261    static void CopyDiffArch(std::vector<Matrix_t>  & A,
0262                       const std::vector<AMatrix_t> & B);
0263
0264    ///@}
0265
0266    //____________________________________________________________________________
0267    //
0268    // Activation Functions
0269    //____________________________________________________________________________
0270
0271    /** @name Activation Functions
0272     * For each activation function, the low-level interface contains two routines.
0273     * One that applies the activation function to a matrix and one that evaluate
0274     * the derivatives of the activation function at the elements of a given matrix
0275     * and writes the results into the result matrix.
0276     */
0277    ///@{
0278    /*  impl using Matrix */
0279    /*inline void evaluate(Matrix_t &A, EActivationFunction f)
0280    {
0281     Tensor_t tA(A);
0282     evaluate<TCpu<AReal>>(tA,f);
0283    }*/
0284
0285    static void ActivationFunctionForward(Tensor_t & X, EActivationFunction activFunct,
0286                           const ActivationDescriptor_t activationDescr,
0287                           const double coef = 0.0, const Scalar_t alpha = 1,
0288                           const Scalar_t beta = 0);
0289
0290    /** Computes the gradient of the activation function */
0291    static void ActivationFunctionBackward(Tensor_t & dX, const Tensor_t & Y,
0292                                           const Tensor_t & dY,  const Tensor_t & X,
0293                                           EActivationFunction activFunct,
0294                                           const ActivationDescriptor_t activationDescr,
0295                                           const Scalar_t alpha = 1,
0296                                           const Scalar_t beta = 0);
0297
0298    static void IdentityDerivative(Tensor_t & B,
0299                                   const Tensor_t &A);
0300
0301    static void Relu(Tensor_t & B);
0302    static void ReluDerivative(Tensor_t & B,
0303                               const Tensor_t & A);
0304
0305    static void Sigmoid(Tensor_t & B);
0306    static void SigmoidDerivative(Tensor_t & B,
0307                                  const Tensor_t & A);
0308
0309    static void Tanh(Tensor_t & B);
0310    static void TanhDerivative(Tensor_t & B,
0311                               const Tensor_t & A);
0312
0313    // fast tanh (only when VDT is available)
0314    static void FastTanh(Tensor_t &B);
0315    static void FastTanhDerivative(Tensor_t &B, const Tensor_t &A);
0316
0317    static void SymmetricRelu(Tensor_t & B);
0318    static void SymmetricReluDerivative(Tensor_t & B,
0319                                        const Tensor_t & A);
0320
0321    static void SoftSign(Tensor_t & B);
0322    static void SoftSignDerivative(Tensor_t & B,
0323                                   const Tensor_t & A);
0324
0325    static void Gauss(Tensor_t & B);
0326    static void GaussDerivative(Tensor_t & B,
0327                                const Tensor_t & A);
0328    ///@}
0329
0330    //____________________________________________________________________________
0331    //
0332    // Loss Functions
0333    //____________________________________________________________________________
0334
0335    /** @name Loss Functions
0336     * Loss functions compute a scalar value given the \p output of the network
0337     * for a given training input and the expected network prediction \p Y that
0338     * quantifies the quality of the prediction. For each function also a routing
0339     * that computes the gradients (suffixed by Gradients) must be provided for
0340     * the starting of the backpropagation algorithm.
0341     */
0342    ///@{
0343
0344    static Scalar_t MeanSquaredError(const Matrix_t &Y, const Matrix_t &output,
0345                                     const Matrix_t &weights);
0346    static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y,
0347                                          const Matrix_t &output, const Matrix_t &weights);
0348
0349    /** Sigmoid transformation is implicitly applied, thus \p output should
0350     *  hold the linear activations of the last layer in the net. */
0351    static Scalar_t CrossEntropy(const Matrix_t &Y, const Matrix_t &output,
0352                                 const Matrix_t &weights);
0353
0354    static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
0355                                      const Matrix_t &output, const Matrix_t &weights);
0356
0357    /** Softmax transformation is implicitly applied, thus \p output should
0358     *  hold the linear activations of the last layer in the net. */
0359    static Scalar_t SoftmaxCrossEntropy(const Matrix_t &Y, const Matrix_t &output,
0360                                        const Matrix_t &weights);
0361    static void SoftmaxCrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
0362                                             const Matrix_t &output, const Matrix_t &weights);
0363    ///@}
0364
0365    //____________________________________________________________________________
0366    //
0367    // Output Functions
0368    //____________________________________________________________________________
0369
0370    /** @name Output Functions
0371     * Output functions transform the activations \p output of the
0372     * output layer in the network to a valid prediction \p YHat for
0373     * the desired usage of the network, e.g.  the identity function
0374     * for regression or the sigmoid transformation for two-class
0375     * classification.
0376     */
0377    ///@{
0378    static void Sigmoid(Matrix_t &YHat,
0379                         const Matrix_t & );
0380    static void Softmax(Matrix_t &YHat,
0381                        const Matrix_t & );
0382    ///@}
0383
0384    //____________________________________________________________________________
0385    //
0386    // Regularization
0387    //____________________________________________________________________________
0388
0389    /** @name Regularization
0390     * For each regularization type two functions are required, one named
0391     * <tt>`<Type>`Regularization</tt> that evaluates the corresponding
0392     * regularization functional for a given weight matrix and the
0393     * <tt>Add<Type>RegularizationGradients</tt>, that adds the regularization
0394     * component in the gradients to the provided matrix.
0395     */
0396    ///@{
0397
0398    static Scalar_t L1Regularization(const Matrix_t & W);
0399    static void AddL1RegularizationGradients(Matrix_t & A,
0400                                             const Matrix_t & W,
0401                                             Scalar_t weightDecay);
0402
0403    static Scalar_t L2Regularization(const Matrix_t & W);
0404    static void AddL2RegularizationGradients(Matrix_t & A,
0405                                             const Matrix_t & W,
0406                                             Scalar_t weightDecay);
0407    ///@}
0408
0409    //____________________________________________________________________________
0410    //
0411    // Initialization
0412    //____________________________________________________________________________
0413
0414    /** @name Initialization
0415     * For each initialization method, one function in the low-level interface
0416     * is provided. The naming scheme is <p>Initialize<Type></p> for a given
0417     * initialization method Type.
0418     */
0419    ///@{
0420
0421    static void InitializeGauss(Matrix_t & A);
0422    static void InitializeUniform(Matrix_t & A);
0423    static void InitializeIdentity(Matrix_t & A);
0424    static void InitializeZero(Matrix_t & A);
0425    static void InitializeZero(Tensor_t &A);
0426    static void InitializeGlorotNormal(Matrix_t & A);
0427    static void InitializeGlorotUniform(Matrix_t & A);
0428
0429    // return static instance of random generator used for initialization
0430    // if generator does not exist it is created the first time with a random seed (e.g. seed = 0)
0431    static TRandom & GetRandomGenerator();
0432    // set random seed for the static generator
0433    // if the static generator does not exists it is created
0434    static void SetRandomSeed(size_t seed);
0435    ///@}
0436
0437    //____________________________________________________________________________
0438    //
0439    // Dropout
0440    //____________________________________________________________________________
0441
0442    /** @name Dropout
0443     */
0444    ///@{
0445
0446    /** Apply dropout with activation probability \p p to the given
0447     *  tensor \p A and scale the result by reciprocal of \p p. */
0448    static void DropoutForward(Tensor_t & A,
0449                               TDescriptors * descriptors,
0450                               TWorkspace   * workspace,
0451                               Scalar_t p);
0452
0453    static void DropoutForward(Matrix_t & A, Scalar_t p) {
0454       Tensor_t tA(A);
0455       DropoutForward( tA, static_cast<TDescriptors *> (nullptr), static_cast<TWorkspace *> (nullptr), p );
0456    }
0457
0458    // Only needed for cuDNN
0459    static void DropoutBackward(Tensor_t & /*A */,
0460                                TDescriptors * /*descriptors */,
0461                                TWorkspace   * /*workspace*/) {}
0462    ///@}
0463
0464    //____________________________________________________________________________
0465    //
0466    // Batch Normalization
0467    //____________________________________________________________________________
0468
0469    /** @name Batch Normalization Layer Propagation
0470     */
0471    ///@{
0472
0473    /** The input from each batch are normalized during training to have zero mean and unit variance
0474      * and they are then scaled by two parameter, different for each input variable:
0475      *  - a scale factor `\gamma` gamma
0476      *  - an offset `\beta` beta */
0477    static void BatchNormLayerForwardTraining(int axis, const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta,
0478                                              Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
0479                                              Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
0480                                              Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor);
0481
0482
0483    /** During inference the inputs are not normalized using the batch mean but the previously computed
0484      * at  running mean and variance */
0485    static void BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta,
0486                                               Tensor_t &y, const Matrix_t &runningMeans,
0487                                               const Matrix_t &runningVars, Scalar_t epsilon,
0488                                               const TensorDescriptor_t &);
0489
0490    /**
0491     * */
0492    static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx,
0493                                       Matrix_t &gamma, //  Matrix_t &beta, (not needed)
0494                                       Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean, const Matrix_t &variance,
0495                                       const Matrix_t &iVariance, Scalar_t epsilon, const TensorDescriptor_t &);
0496
0497    // helper function for BNorm layer
0498    static Tensor_t BatchNormLayerReshapeTensor(int axis, const Tensor_t &x);
0499
0500    ///@}
0501
0502    //____________________________________________________________________________
0503    //
0504    //  Convolutional Layer Propagation
0505    //____________________________________________________________________________
0506
0507    /** @name Forward Propagation in Convolutional Layer
0508     */
0509    ///@{
0510
0511    /** Calculate how many neurons "fit" in the output layer, given the input as well as the layer's hyperparameters.
0512     */
0513    static size_t calculateDimension(size_t imgDim, size_t fltDim, size_t padding, size_t stride);
0514
0515    /** Transform the matrix B in local view format, suitable for
0516     *  convolution, and store it in matrix A */
0517    static void Im2col(Matrix_t &A, const Matrix_t &B, size_t imgHeight, size_t imgWidth, size_t fltHeight,
0518                       size_t fltWidth, size_t strideRows, size_t strideCols, size_t zeroPaddingHeight,
0519                       size_t zeroPaddingWidth);
0520
0521    static void Im2colIndices(std::vector<int> &V, const Matrix_t &B, size_t nLocalViews, size_t imgHeight,
0522                              size_t imgWidth, size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols,
0523                              size_t zeroPaddingHeight, size_t zeroPaddingWidth);
0524    static void Im2colFast(Matrix_t &A, const Matrix_t &B, const std::vector<int> &V);
0525
0526    /** Rotates the matrix \p B, which is representing a weights,
0527     *  and stores them in the matrix \p A. */
0528    static void RotateWeights(Matrix_t &A, const Matrix_t &B, size_t filterDepth, size_t filterHeight,
0529                              size_t filterWidth, size_t numFilters);
0530
0531    /** Add the biases in the Convolutional Layer.  */
0532    static void AddConvBiases(Matrix_t &output, const Matrix_t &biases);
0533    ///@}
0534
0535    /** Dummy placeholder - preparation is currently only required for the CUDA architecture. */
0536    static void PrepareInternals(Tensor_t &) {}
0537
0538    /** Forward propagation in the Convolutional layer */
0539    static void ConvLayerForward(Tensor_t &output, Tensor_t &inputActivationFunc, const Tensor_t &input,
0540                                 const Matrix_t &weights, const Matrix_t &biases, const DNN::CNN::TConvParams &params,
0541                                 EActivationFunction activFunc, Tensor_t & /* inputPrime */,
0542                                 const ConvDescriptors_t & /*descriptors*/, // Empty struct for cuda architecture
0543                                 ConvWorkspace_t & /*workspace*/);          // Empty struct for cuda architecture
0544    // void * cudnnWorkspace = nullptr);          // Remains nullptr for cuda architecture
0545
0546    /** @name Backward Propagation in Convolutional Layer
0547     */
0548    ///@{
0549
0550    /** Perform the complete backward propagation step in a Convolutional Layer.
0551     *  If the provided \p activationGradientsBackward matrix is not empty, compute the
0552     *  gradients of the objective function with respect to the activations
0553     *  of the previous layer (backward direction).
0554     *  Also compute the weight and the bias gradients. Modifies the values
0555     *  in \p df and thus produces only a valid result, if it is applied the
0556     *  first time after the corresponding forward propagation has been per-
0557     *  formed. */
0558    static void
0559    ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients, Matrix_t &biasGradients,
0560                      Tensor_t &df, Tensor_t &activationGradients, const Matrix_t &weights,
0561                      const Tensor_t &activationBackward, const Tensor_t &outputTensor, EActivationFunction activFunc,
0562                      const ConvDescriptors_t & /*descriptors*/, ConvWorkspace_t & /*workspace*/, size_t batchSize,
0563                      size_t inputHeight, size_t inputWidth, size_t depth, size_t height, size_t width,
0564                      size_t filterDepth, size_t filterHeight, size_t filterWidth, size_t nLocalViews);
0565
0566    /** Utility function for calculating the activation gradients of the layer
0567     *  before the convolutional layer. */
0568    static void CalculateConvActivationGradients(Tensor_t &activationGradientsBackward, const Tensor_t &df,
0569                                                 const Matrix_t &weights, size_t batchSize, size_t inputHeight,
0570                                                 size_t inputWidth, size_t depth, size_t height, size_t width,
0571                                                 size_t filterDepth, size_t filterHeight, size_t filterWidth);
0572
0573    /** Utility function for calculating the weight gradients of the convolutional
0574     * layer. */
0575    static void CalculateConvWeightGradients(Matrix_t &weightGradients, const Tensor_t &df,
0576                                             const Tensor_t &activations_backward, size_t batchSize, size_t inputHeight,
0577                                             size_t inputWidth, size_t depth, size_t height, size_t width,
0578                                             size_t filterDepth, size_t filterHeight, size_t filterWidth,
0579                                             size_t nLocalViews);
0580
0581    /** Utility function for calculating the bias gradients of the convolutional
0582     *  layer */
0583    static void CalculateConvBiasGradients(Matrix_t &biasGradients, const Tensor_t &df, size_t batchSize, size_t depth,
0584                                           size_t nLocalViews);
0585    ///@}
0586
0587    //____________________________________________________________________________
0588    //
0589    //  Max Pooling Layer Propagation
0590    //____________________________________________________________________________
0591    /** @name Forward Propagation in Max Pooling Layer
0592     */
0593    ///@{
0594
0595    /** Downsample the matrix \p C to the matrix \p A, using max
0596     * operation, such that the winning indices are stored in matrix
0597     * \p B. */
0598    static void Downsample(Tensor_t &A, Tensor_t &B, const Tensor_t &C, const PoolingDescriptors_t & /*descriptors*/,
0599                           PoolingWorkspace_t & /*workspace*/, size_t imgHeight, size_t imgWidth, size_t fltHeight,
0600                           size_t fltWidth, size_t strideRows, size_t strideCols);
0601
0602    ///@}
0603
0604    /** @name Backward Propagation in Max Pooling Layer
0605     */
0606    ///@{
0607    /** Perform the complete backward propagation step in a Pooling Layer. Based on the
0608     *  winning indices stored in the index matrix, it just forwards the activation
0609     *  gradients to the previous layer. */
0610    static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward, const Tensor_t &activationGradients,
0611                                     const Tensor_t &indexMatrix, const Tensor_t & /*inputActivation*/,
0612                                     const Tensor_t & /*outputTensor*/, const PoolingDescriptors_t & /*descriptors*/,
0613                                     PoolingWorkspace_t & /*workspace*/, size_t imgHeight, size_t imgWidth,
0614                                     size_t fltHeight, size_t fltWidth, size_t strideRows, size_t strideCols,
0615                                     size_t nLocalViews);
0616
0617                                      //// Recurrent Network Functions
0618
0619    /** Backward pass for Recurrent Networks */
0620    static Matrix_t &RecurrentLayerBackward(Matrix_t &state_gradients_backward, // BxH
0621                                            Matrix_t &input_weight_gradients, Matrix_t &state_weight_gradients,
0622                                            Matrix_t &bias_gradients,
0623                                            Matrix_t &df,                  // DxH
0624                                            const Matrix_t &state,         // BxH
0625                                            const Matrix_t &weights_input, // HxD
0626                                            const Matrix_t &weights_state, // HxH
0627                                            const Matrix_t &input,         // BxD
0628                                            Matrix_t &input_gradient);
0629
0630    // dummy RNN functions
0631    static void RNNForward(const Tensor_t & /* x */, const Matrix_t & /* hx */, const Matrix_t & /* cx */,
0632                           const Tensor_t & /* weights */, Tensor_t & /* y */, Matrix_t & /* hy */, Matrix_t & /* cy */,
0633                           const RNNDescriptors_t & /* descr */, RNNWorkspace_t & /* workspace */, bool /* isTraining */)
0634    {
0635    }
0636
0637    static void RNNBackward(const Tensor_t & /* x */, const Matrix_t & /* hx */, const Matrix_t & /* cx */,
0638                            const Tensor_t & /* y */, const Tensor_t & /* dy */, const Matrix_t & /* dhy */,
0639                            const Matrix_t & /* dcy */, const Tensor_t & /* weights */, Tensor_t & /* dx */,
0640                            Matrix_t & /* dhx */, Matrix_t & /* dcx */, Tensor_t & /* dw */,
0641                            const RNNDescriptors_t & /* desc */, RNNWorkspace_t & /* workspace */)
0642    {
0643    }
0644
0645    /** Backward pass for LSTM Network */
0646    static Matrix_t & LSTMLayerBackward(TCpuMatrix<Scalar_t> & state_gradients_backward,
0647                                           TCpuMatrix<Scalar_t> & cell_gradients_backward,
0648                                           TCpuMatrix<Scalar_t> & input_weight_gradients,
0649                                        TCpuMatrix<Scalar_t> & forget_weight_gradients,
0650                                        TCpuMatrix<Scalar_t> & candidate_weight_gradients,
0651                                        TCpuMatrix<Scalar_t> & output_weight_gradients,
0652                                        TCpuMatrix<Scalar_t> & input_state_weight_gradients,
0653                                        TCpuMatrix<Scalar_t> & forget_state_weight_gradients,
0654                                        TCpuMatrix<Scalar_t> & candidate_state_weight_gradients,
0655                                        TCpuMatrix<Scalar_t> & output_state_weight_gradients,
0656                                        TCpuMatrix<Scalar_t> & input_bias_gradients,
0657                                        TCpuMatrix<Scalar_t> & forget_bias_gradients,
0658                                        TCpuMatrix<Scalar_t> & candidate_bias_gradients,
0659                                        TCpuMatrix<Scalar_t> & output_bias_gradients,
0660                                        TCpuMatrix<Scalar_t> & di,
0661                                        TCpuMatrix<Scalar_t> & df,
0662                                        TCpuMatrix<Scalar_t> & dc,
0663                                        TCpuMatrix<Scalar_t> & dout,
0664                                        const TCpuMatrix<Scalar_t> & precStateActivations,
0665                                        const TCpuMatrix<Scalar_t> & precCellActivations,
0666                                        const TCpuMatrix<Scalar_t> & fInput,
0667                                        const TCpuMatrix<Scalar_t> & fForget,
0668                                        const TCpuMatrix<Scalar_t> & fCandidate,
0669                                        const TCpuMatrix<Scalar_t> & fOutput,
0670                                        const TCpuMatrix<Scalar_t> & weights_input,
0671                                        const TCpuMatrix<Scalar_t> & weights_forget,
0672                                        const TCpuMatrix<Scalar_t> & weights_candidate,
0673                                        const TCpuMatrix<Scalar_t> & weights_output,
0674                                        const TCpuMatrix<Scalar_t> & weights_input_state,
0675                                        const TCpuMatrix<Scalar_t> & weights_forget_state,
0676                                        const TCpuMatrix<Scalar_t> & weights_candidate_state,
0677                                        const TCpuMatrix<Scalar_t> & weights_output_state,
0678                                        const TCpuMatrix<Scalar_t> & input,
0679                                        TCpuMatrix<Scalar_t> & input_gradient,
0680                                        TCpuMatrix<Scalar_t> & cell_gradient,
0681                                        TCpuMatrix<Scalar_t> & cell_tanh);
0682
0683
0684    /** Backward pass for GRU Network */
0685    static Matrix_t & GRULayerBackward(TCpuMatrix<Scalar_t> & state_gradients_backward,
0686                                       TCpuMatrix<Scalar_t> & reset_weight_gradients,
0687                                       TCpuMatrix<Scalar_t> & update_weight_gradients,
0688                                       TCpuMatrix<Scalar_t> & candidate_weight_gradients,
0689                                       TCpuMatrix<Scalar_t> & reset_state_weight_gradients,
0690                                       TCpuMatrix<Scalar_t> & update_state_weight_gradients,
0691                                       TCpuMatrix<Scalar_t> & candidate_state_weight_gradients,
0692                                       TCpuMatrix<Scalar_t> & reset_bias_gradients,
0693                                       TCpuMatrix<Scalar_t> & update_bias_gradients,
0694                                       TCpuMatrix<Scalar_t> & candidate_bias_gradients,
0695                                       TCpuMatrix<Scalar_t> & dr,
0696                                       TCpuMatrix<Scalar_t> & du,
0697                                       TCpuMatrix<Scalar_t> & dc,
0698                                       const TCpuMatrix<Scalar_t> & precStateActivations,
0699                                       const TCpuMatrix<Scalar_t> & fReset,
0700                                       const TCpuMatrix<Scalar_t> & fUpdate,
0701                                       const TCpuMatrix<Scalar_t> & fCandidate,
0702                                       const TCpuMatrix<Scalar_t> & weights_reset,
0703                                       const TCpuMatrix<Scalar_t> & weights_update,
0704                                       const TCpuMatrix<Scalar_t> & weights_candidate,
0705                                       const TCpuMatrix<Scalar_t> & weights_reset_state,
0706                                       const TCpuMatrix<Scalar_t> & weights_update_state,
0707                                       const TCpuMatrix<Scalar_t> & weights_candidate_state,
0708                                       const TCpuMatrix<Scalar_t> & input,
0709                                       TCpuMatrix<Scalar_t> & input_gradient,
0710                                       bool resetGateAfter);
0711
0712
0713    ///@}
0714
0715    //____________________________________________________________________________
0716    //
0717    //  Reshape Layer Propagation
0718    //____________________________________________________________________________
0719    /** @name Forward and Backward Propagation in Reshape Layer
0720     */
0721    ///@{
0722
0723    /** Transform the matrix \p B to a matrix with different dimensions \p A */
0724    static void Reshape(Matrix_t &A, const Matrix_t &B);
0725
0726    /** Flattens the tensor \p B, such that each matrix, is stretched in
0727     *  one row, resulting with a matrix \p A. */
0728    static void Flatten(Tensor_t &A, const Tensor_t &B); // size_t size, size_t nRows, size_t nCols);
0729
0730    /** Transforms each row of \p B to a matrix and stores it in the
0731     *  tensor \p B. */
0732    static void Deflatten(Tensor_t &A, const Tensor_t &B); // size_t index, size_t nRows,size_t nCols);
0733
0734    /** Rearrage data according to time fill B x T x D out with T x B x D matrix in*/
0735    static void Rearrange(Tensor_t &out, const Tensor_t &in);
0736
0737
0738    ///@}
0739
0740    //____________________________________________________________________________
0741    //
0742    // Additional Arithmetic Functions
0743    //____________________________________________________________________________
0744
0745    /** @name Additional Arithmetic Functions
0746     *
0747     * Additional arithmetic on CUDA matrices  used to implement the low-level
0748     * interface.
0749     */
0750    ///@{
0751
0752    /** Standard multiplication of two matrices \p A and \p B with the result being
0753     *  written into C.
0754     */
0755    static void Multiply(Matrix_t &C, const Matrix_t &A, const Matrix_t &B);
0756    /** Matrix multiplication of two matrices \p A and \p B^T (transposed) with the
0757     *  result being written into C.
0758     */
0759    static void TransposeMultiply(Matrix_t &output, const Matrix_t &input, const Matrix_t &Weights, Scalar_t alpha = 1.0,
0760                                  Scalar_t beta = 0.);
0761    /** In-place Hadamard (element-wise) product of matrices \p A and \p B
0762     *  with the result being written into \p A.
0763     */
0764    static void Hadamard(Tensor_t &A, const Tensor_t &B);
0765    static void Hadamard(Matrix_t &A, const Matrix_t &B);
0766    // {
0767    //    Tensor_t tA(A);
0768    //    Hadamard( tA, Tensor_t(B));
0769    // }
0770
0771    /** Sum columns of (m x n) matrix \p A and write the results into the first
0772     * m elements in \p A.
0773     */
0774    static void SumColumns(Matrix_t &B, const Matrix_t &A, Scalar_t alpha = 1.0, Scalar_t beta = 0.);
0775
0776    /** Compute the sum of all elements in \p A */
0777    static Scalar_t Sum(const Matrix_t &A);
0778
0779    /** Check two matrices for equality, taking floating point arithmetic errors into account. */
0780    static bool AlmostEquals(const Matrix_t &A, const Matrix_t &B, double epsilon = 0.1);
0781
0782    /** Add the constant \p beta to all the elements of matrix \p A and write the
0783     * result into \p A.
0784     */
0785    static void ConstAdd(Matrix_t &A, Scalar_t beta);
0786
0787    /** Multiply the constant \p beta to all the elements of matrix \p A and write the
0788     * result into \p A.
0789     */
0790    static void ConstMult(Matrix_t &A, Scalar_t beta);
0791
0792    /** Reciprocal each element of the matrix \p A and write the result into
0793     * \p A
0794     */
0795    static void ReciprocalElementWise(Matrix_t &A);
0796
0797    /** Square each element of the matrix \p A and write the result into
0798     * \p A
0799     */
0800    static void SquareElementWise(Matrix_t &A);
0801
0802    /** Square root each element of the matrix \p A and write the result into
0803     * \p A
0804     */
0805    static void SqrtElementWise(Matrix_t &A);
0806
0807    // optimizer functions
0808    static void AdamUpdate(Matrix_t &A, const Matrix_t &M, const Matrix_t &V, Scalar_t alpha, Scalar_t eps);
0809    static void AdamUpdateFirstMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta);
0810    static void AdamUpdateSecondMom(Matrix_t &A, const Matrix_t &B, Scalar_t beta);
0811
0812    // printing of tensor
0813    static void PrintTensor(const Tensor_t &A, const std::string name = "Cpu-tensor", bool truncate = false);
0814
0815 };
0816
0817 //____________________________________________________________________________
0818 template <typename AReal>
0819 template <typename AMatrix_t>
0820 void TCpu<AReal>::CopyDiffArch(TCpuMatrix<AReal> &B,
0821                         const AMatrix_t &A)
0822 {
0823    // copy from another architecture using the reference one
0824    // this is not very efficient since creates temporary objects
0825    TMatrixT<AReal> tmp = A;  // this works also if A is a tensor
0826    Copy(B, TCpuMatrix<AReal>(tmp) );
0827 }
0828
0829 //____________________________________________________________________________
0830 template <typename AReal>
0831 template <typename ATensor_t>
0832 void TCpu<AReal>::CopyDiffArch(TCpuTensor<AReal> &B,
0833                             const ATensor_t &A)
0834 {
0835
0836    R__ASSERT(A.GetSize() == B.GetSize());
0837    // suppose A is of (B,D,H.W) and we want to convert to B,HW,D  or (D,HW,B) in ColumnMajor format
0838    for (size_t i = 0; i < A.GetFirstSize(); ++i) {
0839       TMatrixT<AReal> tmpIn = A.At(i);  // this convert tensor (B,D,H,W) in  (D,H,W)i -> (D,HW)i
0840
0841       TCpuMatrix<AReal> tmpOut = B.At(i).GetMatrix();    // matrix (D,HW)
0842       Copy(tmpOut, TCpuMatrix<AReal>(tmpIn));
0843    }
0844
0845    // ATensor_t tmpIn = A.Reshape({A.GetNrows(), A.GetNcols()});
0846    // auto tmpOut = B.Reshape({A.GetNrows(), A.GetNcols()});
0847    // Matrix_t mOut = tmpOut.GetMatrix();
0848    // CopyDiffArch(mOut, tmpIn.GetMatrix());
0849 }
0850
0851 // Implementation using vector of matrices for the weights
0852 template <typename AReal>
0853 template <typename AMatrix_t>
0854 void TCpu<AReal>::CopyDiffArch(std::vector<TCpuMatrix<AReal>> &A, const std::vector<AMatrix_t> &B)
0855 {
0856    for (size_t i = 0; i < A.size(); ++i) {
0857       CopyDiffArch(A[i], B[i]);
0858    }
0859 }
0860
0861 template <typename AReal>
0862 void TCpu<AReal>::PrintTensor(const typename TCpu<AReal>::Tensor_t & A, const std::string name, bool truncate )
0863 {
0864    std::cout << name << " size = " << A.GetSize() << " shape = { ";
0865    auto shape = A.GetShape();
0866    for (size_t k = 0; k < shape.size()-1; ++k)
0867       std::cout << shape[k] << " , ";
0868    std::cout << shape.back() << " } ";
0869
0870    // print elements
0871    // need to find way to nice printing all elements
0872    std::cout << " tensor count " << A.GetBufferUseCount() << std::endl;
0873    if (A.GetShape().size() == 2 ) {
0874       for (size_t i = 0; i < A.GetShape()[0]; ++i) {
0875          std::cout << "{ ";
0876          size_t n =  A.GetShape()[1];
0877          if (truncate) n = std::min(n,size_t(10));
0878          for (size_t j = 0; j < n; ++j) {
0879             std::cout << A(i,j) << " ";
0880          }
0881           if (truncate && n < A.GetShape()[1]) std::cout << " ...... ";
0882          std::cout << " } " << std::endl;
0883       }
0884    } else if  (A.GetShape().size() == 3 ) {
0885       for (size_t i = 0; i < A.GetFirstSize(); ++i) {
0886          std::cout << "{ ";
0887          for (size_t j = 0; j < A.GetHSize(); ++j) {
0888             std::cout << "{ ";
0889             size_t n =  A.GetWSize();
0890             if (truncate)  n = std::min(n,size_t(10));
0891             for (size_t k = 0; k < n; ++k) {
0892                std::cout << A(i,j,k) << " ";
0893             }
0894             if (truncate && n < A.GetWSize()) std::cout << " ...... ";
0895             std::cout << " } " << std::endl;
0896          }
0897          std::cout << " } " << std::endl;
0898       }
0899    }
0900    else {
0901       for (size_t l = 0; l < A.GetSize(); ++l) {
0902          std::cout << A.GetData()[l] << " ";
0903       }
0904       std::cout << "\n";
0905    }
0906 }
0907
0908
0909
0910
0911 } // namespace DNN
0912 } // namespace TMVA
0913
0914 #endif