#ifndef TMVA_DNN_DEEPNET
#define TMVA_DNN_DEEPNET

#include "TMVA/DNN/Functions.h"
#include "TMVA/DNN/TensorDataLoader.h"

#include "TMVA/DNN/GeneralLayer.h"
#include "TMVA/DNN/DenseLayer.h"
#include "TMVA/DNN/ReshapeLayer.h"
#include "TMVA/DNN/BatchNormLayer.h"

#include "TMVA/DNN/CNN/ConvLayer.h"
#include "TMVA/DNN/CNN/MaxPoolLayer.h"

#include "TMVA/DNN/RNN/RNNLayer.h"
#include "TMVA/DNN/RNN/LSTMLayer.h"
#include "TMVA/DNN/RNN/GRULayer.h"

#ifdef HAVE_DAE
#include "TMVA/DNN/DAE/CompressionLayer.h"
#include "TMVA/DNN/DAE/CorruptionLayer.h"
#include "TMVA/DNN/DAE/ReconstructionLayer.h"
#include "TMVA/DNN/DAE/LogisticRegressionLayer.h"
#endif

#include <vector>
#include <cmath>

namespace TMVA {
namespace DNN {

using namespace CNN;
using namespace RNN;

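/*! \class TDeepNet
    Generic deep neural network class: a container of layers (owned by the net) with
    methods to build the network, run the forward and backward passes, evaluate the
    loss and produce predictions.

    A minimal usage sketch (assuming the CPU backend TMVA::DNN::TCpu<Double_t> from
    TMVA/DNN/Architectures/Cpu.h; the batch layout chosen below for a dense network
    follows the usual TMVA convention and is an assumption, not part of this header):

       using namespace TMVA::DNN;
       using Architecture_t = TCpu<Double_t>;
       // batch size 32, input of 20 variables, batch tensor of shape 1 x 32 x 20
       TDeepNet<Architecture_t> net(32, 1, 1, 20, 1, 32, 20,
                                    ELossFunction::kCrossEntropy, EInitialization::kGauss);
       net.AddDenseLayer(64, EActivationFunction::kRelu);
       net.AddDenseLayer(1, EActivationFunction::kIdentity);
       net.Initialize();
*/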
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>>
class TDeepNet {
public:
   using Tensor_t = typename Architecture_t::Tensor_t;
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

private:
   bool inline isInteger(Scalar_t x) const { return x == floor(x); }
   size_t calculateDimension(int imgDim, int fltDim, int padding, int stride);

private:
   std::vector<Layer_t *> fLayers;

   size_t fBatchSize;
   size_t fInputDepth;
   size_t fInputHeight;
   size_t fInputWidth;

   size_t fBatchDepth;
   size_t fBatchHeight;
   size_t fBatchWidth;

   bool fIsTraining;

   ELossFunction fJ;
   EInitialization fI;
   ERegularization fR;
   Scalar_t fWeightDecay;

public:
   TDeepNet();

   TDeepNet(size_t BatchSize, size_t InputDepth, size_t InputHeight, size_t InputWidth, size_t BatchDepth,
            size_t BatchHeight, size_t BatchWidth, ELossFunction fJ, EInitialization fI = EInitialization::kZero,
            ERegularization fR = ERegularization::kNone, Scalar_t fWeightDecay = 0.0, bool isTraining = false);

   TDeepNet(const TDeepNet &);

   ~TDeepNet();

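   /// Add a convolutional layer with `depth` filters of size filterHeight x filterWidth,
   /// the given strides and zero padding, followed by the activation function f.
   /// The returned layer is owned by the net.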
   TConvLayer<Architecture_t> *AddConvLayer(size_t depth, size_t filterHeight, size_t filterWidth, size_t strideRows,
                                            size_t strideCols, size_t paddingHeight, size_t paddingWidth,
                                            EActivationFunction f, Scalar_t dropoutProbability = 1.0);

   void AddConvLayer(TConvLayer<Architecture_t> *convLayer);

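   /// Add a max-pooling layer with the given pooling-frame size and strides.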
   TMaxPoolLayer<Architecture_t> *AddMaxPoolLayer(size_t frameHeight, size_t frameWidth, size_t strideRows,
                                                  size_t strideCols, Scalar_t dropoutProbability = 1.0);

   void AddMaxPoolLayer(CNN::TMaxPoolLayer<Architecture_t> *maxPoolLayer);

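   /// Add a vanilla recurrent layer with stateSize hidden units, unrolled over timeSteps
   /// time steps; inputSize has to match the width of the previous layer (or of the net input).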
   TBasicRNNLayer<Architecture_t> *AddBasicRNNLayer(size_t stateSize, size_t inputSize, size_t timeSteps,
                                                    bool rememberState = false, bool returnSequence = false,
                                                    EActivationFunction f = EActivationFunction::kTanh);

   void AddBasicRNNLayer(TBasicRNNLayer<Architecture_t> *basicRNNLayer);

   TBasicLSTMLayer<Architecture_t> *AddBasicLSTMLayer(size_t stateSize, size_t inputSize, size_t timeSteps,
                                                      bool rememberState = false, bool returnSequence = false);

   void AddBasicLSTMLayer(TBasicLSTMLayer<Architecture_t> *basicLSTMLayer);

   TBasicGRULayer<Architecture_t> *AddBasicGRULayer(size_t stateSize, size_t inputSize, size_t timeSteps,
                                                    bool rememberState = false, bool returnSequence = false,
                                                    bool resetGateAfter = false);

   void AddBasicGRULayer(TBasicGRULayer<Architecture_t> *basicGRULayer);

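   /// Add a fully connected layer of the given width and activation function; the input
   /// width is taken from the previous layer (or from the net input for the first layer).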
   TDenseLayer<Architecture_t> *AddDenseLayer(size_t width, EActivationFunction f, Scalar_t dropoutProbability = 1.0);

   void AddDenseLayer(TDenseLayer<Architecture_t> *denseLayer);

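   /// Add a reshape layer; with flattening = true the input is flattened to a single row per event.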
   TReshapeLayer<Architecture_t> *AddReshapeLayer(size_t depth, size_t height, size_t width, bool flattening);

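   /// Add a batch-normalization layer; the normalization axis is chosen automatically
   /// (axis 1 after convolutional or pooling layers, the last axis otherwise).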
   TBatchNormLayer<Architecture_t> *AddBatchNormLayer(Scalar_t momentum = -1, Scalar_t epsilon = 0.0001);

   void AddReshapeLayer(TReshapeLayer<Architecture_t> *reshapeLayer);

#ifdef HAVE_DAE
   TCorruptionLayer<Architecture_t> *AddCorruptionLayer(size_t visibleUnits, size_t hiddenUnits,
                                                        Scalar_t dropoutProbability, Scalar_t corruptionLevel);

   void AddCorruptionLayer(TCorruptionLayer<Architecture_t> *corruptionLayer);

   TCompressionLayer<Architecture_t> *AddCompressionLayer(size_t visibleUnits, size_t hiddenUnits,
                                                          Scalar_t dropoutProbability, EActivationFunction f,
                                                          std::vector<Matrix_t> weights, std::vector<Matrix_t> biases);

   void AddCompressionLayer(TCompressionLayer<Architecture_t> *compressionLayer);

   TReconstructionLayer<Architecture_t> *AddReconstructionLayer(size_t visibleUnits, size_t hiddenUnits,
                                                                Scalar_t learningRate, EActivationFunction f,
                                                                std::vector<Matrix_t> weights,
                                                                std::vector<Matrix_t> biases, Scalar_t corruptionLevel,
                                                                Scalar_t dropoutProbability);

   void AddReconstructionLayer(TReconstructionLayer<Architecture_t> *reconstructionLayer);

   TLogisticRegressionLayer<Architecture_t> *AddLogisticRegressionLayer(size_t inputUnits, size_t outputUnits,
                                                                        size_t testDataBatchSize,
                                                                        Scalar_t learningRate);

   void AddLogisticRegressionLayer(TLogisticRegressionLayer<Architecture_t> *logisticRegressionLayer);

   void PreTrain(std::vector<Matrix_t> &input, std::vector<size_t> numHiddenUnitsPerLayer, Scalar_t learningRate,
                 Scalar_t corruptionLevel, Scalar_t dropoutProbability, size_t epochs, EActivationFunction f,
                 bool applyDropout = false);

   void FineTune(std::vector<Matrix_t> &input, std::vector<Matrix_t> &testInput, std::vector<Matrix_t> &outputLabel,
                 size_t outputUnits, size_t testDataBatchSize, Scalar_t learningRate, size_t epochs);
#endif

   void Initialize();

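   /// Propagate the input tensor through all layers in order; each layer consumes the
   /// output of the previous one.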
   void Forward(Tensor_t &input, bool applyDropout = false);

   void ResetTraining();

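   /// Evaluate the loss gradients at the output and back-propagate them through all
   /// layers, accumulating the weight and bias gradients of each layer.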
   void Backward(const Tensor_t &input, const Matrix_t &groundTruth, const Matrix_t &weights);

#ifdef USE_PARALLEL_DEEPNET
   void ParallelForward(std::vector<TDeepNet<Architecture_t, Layer_t>> &nets,
                        std::vector<TTensorBatch<Architecture_t>> &batches, bool applyDropout = false);

   void ParallelBackward(std::vector<TDeepNet<Architecture_t, Layer_t>> &nets,
                         std::vector<TTensorBatch<Architecture_t>> &batches, Scalar_t learningRate);

   void ParallelBackwardMomentum(std::vector<TDeepNet<Architecture_t, Layer_t>> &nets,
                                 std::vector<TTensorBatch<Architecture_t>> &batches, Scalar_t learningRate,
                                 Scalar_t momentum);

   void ParallelBackwardNestorov(std::vector<TDeepNet<Architecture_t, Layer_t>> &nets,
                                 std::vector<TTensorBatch<Architecture_t>> &batches, Scalar_t learningRate,
                                 Scalar_t momentum);
#endif

   void Update(Scalar_t learningRate);

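   /// Compute the loss of the current network output with respect to groundTruth,
   /// optionally adding the regularization term.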
   Scalar_t Loss(const Matrix_t &groundTruth, const Matrix_t &weights, bool includeRegularization = true) const;

   Scalar_t Loss(Tensor_t &input, const Matrix_t &groundTruth, const Matrix_t &weights,
                 bool inTraining = false, bool includeRegularization = true);

   Scalar_t RegularizationTerm() const;

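   /// Apply the output function f to the network output to obtain the predictions.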
   void Prediction(Matrix_t &predictions, EOutputFunction f) const;

   void Prediction(Matrix_t &predictions, Tensor_t &input, EOutputFunction f);

   void Print() const;

   inline Layer_t *GetLayerAt(size_t i) { return fLayers[i]; }
   inline const Layer_t *GetLayerAt(size_t i) const { return fLayers[i]; }

   inline size_t GetDepth() const { return fLayers.size(); }
   inline size_t GetOutputWidth() const { return fLayers.back()->GetWidth(); }

   inline std::vector<Layer_t *> &GetLayers() { return fLayers; }
   inline const std::vector<Layer_t *> &GetLayers() const { return fLayers; }

   inline void Clear() { fLayers.clear(); }

   inline size_t GetBatchSize() const { return fBatchSize; }
   inline size_t GetInputDepth() const { return fInputDepth; }
   inline size_t GetInputHeight() const { return fInputHeight; }
   inline size_t GetInputWidth() const { return fInputWidth; }

   inline size_t GetBatchDepth() const { return fBatchDepth; }
   inline size_t GetBatchHeight() const { return fBatchHeight; }
   inline size_t GetBatchWidth() const { return fBatchWidth; }

   inline bool IsTraining() const { return fIsTraining; }

   inline ELossFunction GetLossFunction() const { return fJ; }
   inline EInitialization GetInitialization() const { return fI; }
   inline ERegularization GetRegularization() const { return fR; }
   inline Scalar_t GetWeightDecay() const { return fWeightDecay; }

   inline void SetBatchSize(size_t batchSize) { fBatchSize = batchSize; }
   inline void SetInputDepth(size_t inputDepth) { fInputDepth = inputDepth; }
   inline void SetInputHeight(size_t inputHeight) { fInputHeight = inputHeight; }
   inline void SetInputWidth(size_t inputWidth) { fInputWidth = inputWidth; }
   inline void SetBatchDepth(size_t batchDepth) { fBatchDepth = batchDepth; }
   inline void SetBatchHeight(size_t batchHeight) { fBatchHeight = batchHeight; }
   inline void SetBatchWidth(size_t batchWidth) { fBatchWidth = batchWidth; }
   inline void SetLossFunction(ELossFunction J) { fJ = J; }
   inline void SetInitialization(EInitialization I) { fI = I; }
   inline void SetRegularization(ERegularization R) { fR = R; }
   inline void SetWeightDecay(Scalar_t weightDecay) { fWeightDecay = weightDecay; }

   void SetDropoutProbabilities(const std::vector<Double_t> &probabilities);
};

template <typename Architecture_t, typename Layer_t>
TDeepNet<Architecture_t, Layer_t>::TDeepNet()
   : fLayers(), fBatchSize(0), fInputDepth(0), fInputHeight(0), fInputWidth(0), fBatchDepth(0), fBatchHeight(0),
     fBatchWidth(0), fIsTraining(true), fJ(ELossFunction::kMeanSquaredError), fI(EInitialization::kZero),
     fR(ERegularization::kNone), fWeightDecay(0.0)
{
}

template <typename Architecture_t, typename Layer_t>
TDeepNet<Architecture_t, Layer_t>::TDeepNet(size_t batchSize, size_t inputDepth, size_t inputHeight, size_t inputWidth,
                                            size_t batchDepth, size_t batchHeight, size_t batchWidth, ELossFunction J,
                                            EInitialization I, ERegularization R, Scalar_t weightDecay, bool isTraining)
   : fLayers(), fBatchSize(batchSize), fInputDepth(inputDepth), fInputHeight(inputHeight), fInputWidth(inputWidth),
     fBatchDepth(batchDepth), fBatchHeight(batchHeight), fBatchWidth(batchWidth), fIsTraining(isTraining), fJ(J), fI(I),
     fR(R), fWeightDecay(weightDecay)
{
}

template <typename Architecture_t, typename Layer_t>
TDeepNet<Architecture_t, Layer_t>::TDeepNet(const TDeepNet &deepNet)
   : fLayers(), fBatchSize(deepNet.fBatchSize), fInputDepth(deepNet.fInputDepth), fInputHeight(deepNet.fInputHeight),
     fInputWidth(deepNet.fInputWidth), fBatchDepth(deepNet.fBatchDepth), fBatchHeight(deepNet.fBatchHeight),
     fBatchWidth(deepNet.fBatchWidth), fIsTraining(deepNet.fIsTraining), fJ(deepNet.fJ), fI(deepNet.fI), fR(deepNet.fR),
     fWeightDecay(deepNet.fWeightDecay)
{
}

template <typename Architecture_t, typename Layer_t>
TDeepNet<Architecture_t, Layer_t>::~TDeepNet()
{
   // The net owns its layers.
   for (auto layer : fLayers)
      delete layer;
   fLayers.clear();
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::calculateDimension(int imgDim, int fltDim, int padding, int stride) -> size_t
{
   // Use floating-point division so that non-integer results are caught by the check below.
   Scalar_t dimension = ((Scalar_t)(imgDim - fltDim + 2 * padding)) / stride + 1;
   if (!isInteger(dimension) || dimension <= 0) {
      this->Print();
      int iLayer = fLayers.size();
      Fatal("calculateDimension", "Incompatible hyperparameters for layer %d - (imageDim, filterDim, padding, stride) %d , %d , %d , %d",
            iLayer, imgDim, fltDim, padding, stride);
   }

   return (size_t)dimension;
}

template <typename Architecture_t, typename Layer_t>
TConvLayer<Architecture_t> *TDeepNet<Architecture_t, Layer_t>::AddConvLayer(size_t depth, size_t filterHeight,
                                                                            size_t filterWidth, size_t strideRows,
                                                                            size_t strideCols, size_t paddingHeight,
                                                                            size_t paddingWidth, EActivationFunction f,
                                                                            Scalar_t dropoutProbability)
{
   size_t batchSize = this->GetBatchSize();
   size_t inputDepth;
   size_t inputHeight;
   size_t inputWidth;
   EInitialization init = this->GetInitialization();
   ERegularization reg = this->GetRegularization();
   Scalar_t decay = this->GetWeightDecay();

   if (fLayers.size() == 0) {
      inputDepth = this->GetInputDepth();
      inputHeight = this->GetInputHeight();
      inputWidth = this->GetInputWidth();
   } else {
      Layer_t *lastLayer = fLayers.back();
      inputDepth = lastLayer->GetDepth();
      inputHeight = lastLayer->GetHeight();
      inputWidth = lastLayer->GetWidth();
   }

   TConvLayer<Architecture_t> *convLayer = new TConvLayer<Architecture_t>(
      batchSize, inputDepth, inputHeight, inputWidth, depth, init, filterHeight, filterWidth, strideRows,
      strideCols, paddingHeight, paddingWidth, dropoutProbability, f, reg, decay);

   fLayers.push_back(convLayer);
   return convLayer;
}

template <typename Architecture_t, typename Layer_t>
void TDeepNet<Architecture_t, Layer_t>::AddConvLayer(TConvLayer<Architecture_t> *convLayer)
{
   fLayers.push_back(convLayer);
}

template <typename Architecture_t, typename Layer_t>
TMaxPoolLayer<Architecture_t> *TDeepNet<Architecture_t, Layer_t>::AddMaxPoolLayer(size_t frameHeight, size_t frameWidth,
                                                                                  size_t strideRows, size_t strideCols,
                                                                                  Scalar_t dropoutProbability)
{
   size_t batchSize = this->GetBatchSize();
   size_t inputDepth;
   size_t inputHeight;
   size_t inputWidth;

   if (fLayers.size() == 0) {
      inputDepth = this->GetInputDepth();
      inputHeight = this->GetInputHeight();
      inputWidth = this->GetInputWidth();
   } else {
      Layer_t *lastLayer = fLayers.back();
      inputDepth = lastLayer->GetDepth();
      inputHeight = lastLayer->GetHeight();
      inputWidth = lastLayer->GetWidth();
   }

   TMaxPoolLayer<Architecture_t> *maxPoolLayer = new TMaxPoolLayer<Architecture_t>(
      batchSize, inputDepth, inputHeight, inputWidth, frameHeight, frameWidth,
      strideRows, strideCols, dropoutProbability);

   fLayers.push_back(maxPoolLayer);

   return maxPoolLayer;
}

template <typename Architecture_t, typename Layer_t>
void TDeepNet<Architecture_t, Layer_t>::AddMaxPoolLayer(TMaxPoolLayer<Architecture_t> *maxPoolLayer)
{
   fLayers.push_back(maxPoolLayer);
}

template <typename Architecture_t, typename Layer_t>
TBasicRNNLayer<Architecture_t> *TDeepNet<Architecture_t, Layer_t>::AddBasicRNNLayer(size_t stateSize, size_t inputSize,
                                                                                    size_t timeSteps,
                                                                                    bool rememberState, bool returnSequence,
                                                                                    EActivationFunction f)
{
   size_t inputHeight, inputWidth, inputDepth;
   if (fLayers.size() == 0) {
      inputHeight = this->GetInputHeight();
      inputWidth = this->GetInputWidth();
      inputDepth = this->GetInputDepth();
   } else {
      Layer_t *lastLayer = fLayers.back();
      inputHeight = lastLayer->GetHeight();
      inputWidth = lastLayer->GetWidth();
      inputDepth = lastLayer->GetDepth();
   }
   if (inputSize != inputWidth) {
      Error("AddBasicRNNLayer", "Inconsistent input size with input layout - it should be %zu instead of %zu", inputSize, inputWidth);
   }
   if (timeSteps != inputHeight && timeSteps != inputDepth) {
      Error("AddBasicRNNLayer", "Inconsistent time steps with input layout - it should be %zu instead of %zu or %zu", timeSteps, inputHeight, inputDepth);
   }

   TBasicRNNLayer<Architecture_t> *basicRNNLayer =
      new TBasicRNNLayer<Architecture_t>(this->GetBatchSize(), stateSize, inputSize, timeSteps, rememberState, returnSequence,
                                         f, fIsTraining, this->GetInitialization());
   fLayers.push_back(basicRNNLayer);
   return basicRNNLayer;
}

template <typename Architecture_t, typename Layer_t>
void TDeepNet<Architecture_t, Layer_t>::AddBasicRNNLayer(TBasicRNNLayer<Architecture_t> *basicRNNLayer)
{
   fLayers.push_back(basicRNNLayer);
}

template <typename Architecture_t, typename Layer_t>
TBasicLSTMLayer<Architecture_t> *TDeepNet<Architecture_t, Layer_t>::AddBasicLSTMLayer(size_t stateSize, size_t inputSize,
                                                                                      size_t timeSteps, bool rememberState, bool returnSequence)
{
   size_t inputHeight, inputWidth, inputDepth;
   if (fLayers.size() == 0) {
      inputHeight = this->GetInputHeight();
      inputWidth = this->GetInputWidth();
      inputDepth = this->GetInputDepth();
   } else {
      Layer_t *lastLayer = fLayers.back();
      inputHeight = lastLayer->GetHeight();
      inputWidth = lastLayer->GetWidth();
      inputDepth = lastLayer->GetDepth();
   }
   if (inputSize != inputWidth) {
      Error("AddBasicLSTMLayer", "Inconsistent input size with input layout - it should be %zu instead of %zu", inputSize, inputWidth);
   }
   if (timeSteps != inputHeight && timeSteps != inputDepth) {
      Error("AddBasicLSTMLayer", "Inconsistent time steps with input layout - it should be %zu instead of %zu", timeSteps, inputHeight);
   }

   TBasicLSTMLayer<Architecture_t> *basicLSTMLayer =
      new TBasicLSTMLayer<Architecture_t>(this->GetBatchSize(), stateSize, inputSize, timeSteps, rememberState, returnSequence,
                                          DNN::EActivationFunction::kSigmoid,
                                          DNN::EActivationFunction::kTanh,
                                          fIsTraining, this->GetInitialization());
   fLayers.push_back(basicLSTMLayer);
   return basicLSTMLayer;
}

template <typename Architecture_t, typename Layer_t>
void TDeepNet<Architecture_t, Layer_t>::AddBasicLSTMLayer(TBasicLSTMLayer<Architecture_t> *basicLSTMLayer)
{
   fLayers.push_back(basicLSTMLayer);
}

template <typename Architecture_t, typename Layer_t>
TBasicGRULayer<Architecture_t> *TDeepNet<Architecture_t, Layer_t>::AddBasicGRULayer(size_t stateSize, size_t inputSize,
                                                                                    size_t timeSteps, bool rememberState, bool returnSequence, bool resetGateAfter)
{
   size_t inputHeight, inputWidth, inputDepth;
   if (fLayers.size() == 0) {
      inputHeight = this->GetInputHeight();
      inputWidth = this->GetInputWidth();
      inputDepth = this->GetInputDepth();
   } else {
      Layer_t *lastLayer = fLayers.back();
      inputHeight = lastLayer->GetHeight();
      inputWidth = lastLayer->GetWidth();
      inputDepth = lastLayer->GetDepth();
   }
   if (inputSize != inputWidth) {
      Error("AddBasicGRULayer", "Inconsistent input size with input layout - it should be %zu instead of %zu", inputSize, inputWidth);
   }
   if (timeSteps != inputHeight && timeSteps != inputDepth) {
      Error("AddBasicGRULayer", "Inconsistent time steps with input layout - it should be %zu instead of %zu", timeSteps, inputHeight);
   }

   TBasicGRULayer<Architecture_t> *basicGRULayer =
      new TBasicGRULayer<Architecture_t>(this->GetBatchSize(), stateSize, inputSize, timeSteps, rememberState, returnSequence, resetGateAfter,
                                         DNN::EActivationFunction::kSigmoid,
                                         DNN::EActivationFunction::kTanh,
                                         fIsTraining, this->GetInitialization());
   fLayers.push_back(basicGRULayer);
   return basicGRULayer;
}

template <typename Architecture_t, typename Layer_t>
void TDeepNet<Architecture_t, Layer_t>::AddBasicGRULayer(TBasicGRULayer<Architecture_t> *basicGRULayer)
{
   fLayers.push_back(basicGRULayer);
}

#ifdef HAVE_DAE

template <typename Architecture_t, typename Layer_t>
TCorruptionLayer<Architecture_t> *TDeepNet<Architecture_t, Layer_t>::AddCorruptionLayer(size_t visibleUnits,
                                                                                        size_t hiddenUnits,
                                                                                        Scalar_t dropoutProbability,
                                                                                        Scalar_t corruptionLevel)
{
   size_t batchSize = this->GetBatchSize();

   TCorruptionLayer<Architecture_t> *corruptionLayer =
      new TCorruptionLayer<Architecture_t>(batchSize, visibleUnits, hiddenUnits, dropoutProbability, corruptionLevel);
   fLayers.push_back(corruptionLayer);
   return corruptionLayer;
}

template <typename Architecture_t, typename Layer_t>
void TDeepNet<Architecture_t, Layer_t>::AddCorruptionLayer(TCorruptionLayer<Architecture_t> *corruptionLayer)
{
   fLayers.push_back(corruptionLayer);
}

template <typename Architecture_t, typename Layer_t>
TCompressionLayer<Architecture_t> *TDeepNet<Architecture_t, Layer_t>::AddCompressionLayer(
   size_t visibleUnits, size_t hiddenUnits, Scalar_t dropoutProbability, EActivationFunction f,
   std::vector<Matrix_t> weights, std::vector<Matrix_t> biases)
{
   size_t batchSize = this->GetBatchSize();

   TCompressionLayer<Architecture_t> *compressionLayer = new TCompressionLayer<Architecture_t>(
      batchSize, visibleUnits, hiddenUnits, dropoutProbability, f, weights, biases);
   fLayers.push_back(compressionLayer);
   return compressionLayer;
}

template <typename Architecture_t, typename Layer_t>
void TDeepNet<Architecture_t, Layer_t>::AddCompressionLayer(TCompressionLayer<Architecture_t> *compressionLayer)
{
   fLayers.push_back(compressionLayer);
}

template <typename Architecture_t, typename Layer_t>
TReconstructionLayer<Architecture_t> *TDeepNet<Architecture_t, Layer_t>::AddReconstructionLayer(
   size_t visibleUnits, size_t hiddenUnits, Scalar_t learningRate, EActivationFunction f, std::vector<Matrix_t> weights,
   std::vector<Matrix_t> biases, Scalar_t corruptionLevel, Scalar_t dropoutProbability)
{
   size_t batchSize = this->GetBatchSize();

   TReconstructionLayer<Architecture_t> *reconstructionLayer = new TReconstructionLayer<Architecture_t>(
      batchSize, visibleUnits, hiddenUnits, learningRate, f, weights, biases, corruptionLevel, dropoutProbability);
   fLayers.push_back(reconstructionLayer);
   return reconstructionLayer;
}

template <typename Architecture_t, typename Layer_t>
void TDeepNet<Architecture_t, Layer_t>::AddReconstructionLayer(
   TReconstructionLayer<Architecture_t> *reconstructionLayer)
{
   fLayers.push_back(reconstructionLayer);
}

template <typename Architecture_t, typename Layer_t>
TLogisticRegressionLayer<Architecture_t> *TDeepNet<Architecture_t, Layer_t>::AddLogisticRegressionLayer(
   size_t inputUnits, size_t outputUnits, size_t testDataBatchSize, Scalar_t learningRate)
{
   size_t batchSize = this->GetBatchSize();

   TLogisticRegressionLayer<Architecture_t> *logisticRegressionLayer =
      new TLogisticRegressionLayer<Architecture_t>(batchSize, inputUnits, outputUnits, testDataBatchSize, learningRate);
   fLayers.push_back(logisticRegressionLayer);
   return logisticRegressionLayer;
}

template <typename Architecture_t, typename Layer_t>
void TDeepNet<Architecture_t, Layer_t>::AddLogisticRegressionLayer(
   TLogisticRegressionLayer<Architecture_t> *logisticRegressionLayer)
{
   fLayers.push_back(logisticRegressionLayer);
}
#endif

template <typename Architecture_t, typename Layer_t>
TDenseLayer<Architecture_t> *TDeepNet<Architecture_t, Layer_t>::AddDenseLayer(size_t width, EActivationFunction f,
                                                                              Scalar_t dropoutProbability)
{
   size_t batchSize = this->GetBatchSize();
   size_t inputWidth;
   EInitialization init = this->GetInitialization();
   ERegularization reg = this->GetRegularization();
   Scalar_t decay = this->GetWeightDecay();

   if (fLayers.size() == 0) {
      inputWidth = this->GetInputWidth();
   } else {
      Layer_t *lastLayer = fLayers.back();
      inputWidth = lastLayer->GetWidth();
   }

   TDenseLayer<Architecture_t> *denseLayer =
      new TDenseLayer<Architecture_t>(batchSize, inputWidth, width, init, dropoutProbability, f, reg, decay);

   fLayers.push_back(denseLayer);

   return denseLayer;
}

template <typename Architecture_t, typename Layer_t>
void TDeepNet<Architecture_t, Layer_t>::AddDenseLayer(TDenseLayer<Architecture_t> *denseLayer)
{
   fLayers.push_back(denseLayer);
}

template <typename Architecture_t, typename Layer_t>
TReshapeLayer<Architecture_t> *TDeepNet<Architecture_t, Layer_t>::AddReshapeLayer(size_t depth, size_t height,
                                                                                  size_t width, bool flattening)
{
   size_t batchSize = this->GetBatchSize();
   size_t inputDepth;
   size_t inputHeight;
   size_t inputWidth;
   size_t outputNSlices;
   size_t outputNRows;
   size_t outputNCols;

   if (fLayers.size() == 0) {
      inputDepth = this->GetInputDepth();
      inputHeight = this->GetInputHeight();
      inputWidth = this->GetInputWidth();
   } else {
      Layer_t *lastLayer = fLayers.back();
      inputDepth = lastLayer->GetDepth();
      inputHeight = lastLayer->GetHeight();
      inputWidth = lastLayer->GetWidth();
   }

   if (flattening) {
      outputNSlices = 1;
      outputNRows = this->GetBatchSize();
      outputNCols = depth * height * width;
      size_t inputNCols = inputDepth * inputHeight * inputWidth;
      if (outputNCols != 0 && outputNCols != inputNCols) {
         Info("AddReshapeLayer", "Incompatible dimensions - the product of the input dimensions %zu x %zu x %zu should equal the requested output %zu x %zu x %zu - forcing the flattened output size to %zu",
              inputDepth, inputHeight, inputWidth, depth, height, width, inputNCols);
      }
      outputNCols = inputNCols;
      depth = 1;
      height = 1;
      width = outputNCols;
   } else {
      outputNSlices = this->GetBatchSize();
      outputNRows = depth;
      outputNCols = height * width;
   }

   TReshapeLayer<Architecture_t> *reshapeLayer =
      new TReshapeLayer<Architecture_t>(batchSize, inputDepth, inputHeight, inputWidth, depth, height, width,
                                        outputNSlices, outputNRows, outputNCols, flattening);

   fLayers.push_back(reshapeLayer);

   return reshapeLayer;
}

template <typename Architecture_t, typename Layer_t>
TBatchNormLayer<Architecture_t> *TDeepNet<Architecture_t, Layer_t>::AddBatchNormLayer(Scalar_t momentum, Scalar_t epsilon)
{
   int axis = -1;
   size_t batchSize = this->GetBatchSize();
   size_t inputDepth = 0;
   size_t inputHeight = 0;
   size_t inputWidth = 0;

   std::vector<size_t> shape = {1, 1, 1};
   if (fLayers.size() == 0) {
      inputDepth = this->GetInputDepth();
      inputHeight = this->GetInputHeight();
      inputWidth = this->GetInputWidth();

      shape[0] = batchSize;
      shape[1] = inputWidth;
      shape[2] = 1;
   } else {
      Layer_t *lastLayer = fLayers.back();
      inputDepth = lastLayer->GetDepth();
      inputHeight = lastLayer->GetHeight();
      inputWidth = lastLayer->GetWidth();
      shape = lastLayer->GetOutput().GetShape();
      if (dynamic_cast<TConvLayer<Architecture_t> *>(lastLayer) != nullptr ||
          dynamic_cast<TMaxPoolLayer<Architecture_t> *>(lastLayer) != nullptr)
         axis = 1;
      if (shape.size() > 3) {
         for (size_t i = 3; i < shape.size(); ++i)
            shape[2] *= shape[i];
      }
   }

   auto bnormLayer =
      new TBatchNormLayer<Architecture_t>(batchSize, inputDepth, inputHeight, inputWidth, shape, axis, momentum, epsilon);

   fLayers.push_back(bnormLayer);

   return bnormLayer;
}

template <typename Architecture_t, typename Layer_t>
void TDeepNet<Architecture_t, Layer_t>::AddReshapeLayer(TReshapeLayer<Architecture_t> *reshapeLayer)
{
   fLayers.push_back(reshapeLayer);
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::Initialize() -> void
{
   for (size_t i = 0; i < fLayers.size(); i++) {
      fLayers[i]->Initialize();
   }
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::ResetTraining() -> void
{
   for (size_t i = 0; i < fLayers.size(); i++) {
      fLayers[i]->ResetTraining();
   }
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::Forward(Tensor_t &input, bool applyDropout) -> void
{
   fLayers.front()->Forward(input, applyDropout);

   for (size_t i = 1; i < fLayers.size(); i++) {
      fLayers[i]->Forward(fLayers[i - 1]->GetOutput(), applyDropout);
   }
}

#ifdef HAVE_DAE

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::PreTrain(std::vector<Matrix_t> &input,
                                                 std::vector<size_t> numHiddenUnitsPerLayer, Scalar_t learningRate,
                                                 Scalar_t corruptionLevel, Scalar_t dropoutProbability, size_t epochs,
                                                 EActivationFunction f, bool applyDropout) -> void
{
   std::vector<Matrix_t> inp1;
   std::vector<Matrix_t> inp2;
   // numHiddenUnitsPerLayer is a std::vector, so its size() gives the number of hidden layers.
   size_t numOfHiddenLayers = numHiddenUnitsPerLayer.size();

   size_t visibleUnits = (size_t)input[0].GetNrows();

   AddCorruptionLayer(visibleUnits, numHiddenUnitsPerLayer[0], dropoutProbability, corruptionLevel);
   fLayers.back()->Initialize();
   fLayers.back()->Forward(input, applyDropout);

   AddCompressionLayer(visibleUnits, numHiddenUnitsPerLayer[0], dropoutProbability, f, fLayers.back()->GetWeights(),
                       fLayers.back()->GetBiases());
   fLayers.back()->Initialize();
   fLayers.back()->Forward(fLayers[fLayers.size() - 2]->GetOutput(), applyDropout);

   AddReconstructionLayer(visibleUnits, numHiddenUnitsPerLayer[0], learningRate, f, fLayers.back()->GetWeights(),
                          fLayers.back()->GetBiases(), corruptionLevel, dropoutProbability);
   fLayers.back()->Initialize();
   fLayers.back()->Forward(fLayers[fLayers.size() - 2]->GetOutput(), applyDropout);
   fLayers.back()->Backward(fLayers[fLayers.size() - 2]->GetOutput(), inp1, fLayers[fLayers.size() - 3]->GetOutput(),
                            input);

   size_t weightsSize = fLayers.back()->GetWeights().size();
   size_t biasesSize = fLayers.back()->GetBiases().size();
   for (size_t epoch = 0; epoch < epochs - 1; epoch++) {
      for (size_t j = 0; j < weightsSize; j++) {
         Architecture_t::Copy(fLayers[fLayers.size() - 2]->GetWeightsAt(j), fLayers.back()->GetWeightsAt(j));
      }
      for (size_t j = 0; j < biasesSize; j++) {
         Architecture_t::Copy(fLayers[fLayers.size() - 2]->GetBiasesAt(j), fLayers.back()->GetBiasesAt(j));
      }
      fLayers[fLayers.size() - 2]->Forward(fLayers[fLayers.size() - 3]->GetOutput(), applyDropout);
      fLayers[fLayers.size() - 1]->Forward(fLayers[fLayers.size() - 2]->GetOutput(), applyDropout);
      fLayers[fLayers.size() - 1]->Backward(fLayers[fLayers.size() - 2]->GetOutput(), inp1,
                                            fLayers[fLayers.size() - 3]->GetOutput(), input);
   }
   fLayers.back()->Print();

   for (size_t i = 1; i < numOfHiddenLayers; i++) {
      AddCorruptionLayer(numHiddenUnitsPerLayer[i - 1], numHiddenUnitsPerLayer[i], dropoutProbability, corruptionLevel);
      fLayers.back()->Initialize();
      fLayers.back()->Forward(fLayers[fLayers.size() - 3]->GetOutput(), applyDropout);

      AddCompressionLayer(numHiddenUnitsPerLayer[i - 1], numHiddenUnitsPerLayer[i], dropoutProbability, f,
                          fLayers.back()->GetWeights(), fLayers.back()->GetBiases());
      fLayers.back()->Initialize();
      fLayers.back()->Forward(fLayers[fLayers.size() - 2]->GetOutput(), applyDropout);

      AddReconstructionLayer(numHiddenUnitsPerLayer[i - 1], numHiddenUnitsPerLayer[i], learningRate, f,
                             fLayers.back()->GetWeights(), fLayers.back()->GetBiases(), corruptionLevel,
                             dropoutProbability);
      fLayers.back()->Initialize();
      fLayers.back()->Forward(fLayers[fLayers.size() - 2]->GetOutput(), applyDropout);
      fLayers.back()->Backward(fLayers[fLayers.size() - 2]->GetOutput(), inp1, fLayers[fLayers.size() - 3]->GetOutput(),
                               fLayers[fLayers.size() - 5]->GetOutput());

      size_t _weightsSize = fLayers.back()->GetWeights().size();
      size_t _biasesSize = fLayers.back()->GetBiases().size();
      for (size_t epoch = 0; epoch < epochs - 1; epoch++) {
         for (size_t j = 0; j < _weightsSize; j++) {
            Architecture_t::Copy(fLayers[fLayers.size() - 2]->GetWeightsAt(j), fLayers.back()->GetWeightsAt(j));
         }
         for (size_t j = 0; j < _biasesSize; j++) {
            Architecture_t::Copy(fLayers[fLayers.size() - 2]->GetBiasesAt(j), fLayers.back()->GetBiasesAt(j));
         }
         fLayers[fLayers.size() - 2]->Forward(fLayers[fLayers.size() - 3]->GetOutput(), applyDropout);
         fLayers[fLayers.size() - 1]->Forward(fLayers[fLayers.size() - 2]->GetOutput(), applyDropout);
         fLayers[fLayers.size() - 1]->Backward(fLayers[fLayers.size() - 2]->GetOutput(), inp1,
                                               fLayers[fLayers.size() - 3]->GetOutput(),
                                               fLayers[fLayers.size() - 5]->GetOutput());
      }
      fLayers.back()->Print();
   }
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::FineTune(std::vector<Matrix_t> &input, std::vector<Matrix_t> &testInput,
                                                 std::vector<Matrix_t> &inputLabel, size_t outputUnits,
                                                 size_t testDataBatchSize, Scalar_t learningRate, size_t epochs) -> void
{
   std::vector<Matrix_t> inp1;
   std::vector<Matrix_t> inp2;
   if (fLayers.size() == 0) {
      size_t inputUnits = input[0].GetNrows();

      AddLogisticRegressionLayer(inputUnits, outputUnits, testDataBatchSize, learningRate);
      fLayers.back()->Initialize();
      for (size_t i = 0; i < epochs; i++) {
         fLayers.back()->Backward(inputLabel, inp1, input, inp2);
      }
      fLayers.back()->Forward(input, false);
      fLayers.back()->Print();
   } else {
      size_t inputUnits = fLayers.back()->GetOutputAt(0).GetNrows();
      AddLogisticRegressionLayer(inputUnits, outputUnits, testDataBatchSize, learningRate);
      fLayers.back()->Initialize();
      for (size_t i = 0; i < epochs; i++) {
         fLayers.back()->Backward(inputLabel, inp1, fLayers[fLayers.size() - 2]->GetOutput(), inp2);
      }
      fLayers.back()->Forward(testInput, false);
      fLayers.back()->Print();
   }
}
#endif

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::Backward(const Tensor_t &input, const Matrix_t &groundTruth,
                                                 const Matrix_t &weights) -> void
{
   Matrix_t last_actgrad = fLayers.back()->GetActivationGradientsAt(0);
   Matrix_t last_output = fLayers.back()->GetOutputAt(0);
   evaluateGradients<Architecture_t>(last_actgrad, this->GetLossFunction(), groundTruth,
                                     last_output, weights);

   for (size_t i = fLayers.size() - 1; i > 0; i--) {
      auto &activation_gradient_backward = fLayers[i - 1]->GetActivationGradients();
      auto &activations_backward = fLayers[i - 1]->GetOutput();
      fLayers[i]->Backward(activation_gradient_backward, activations_backward);
   }

   Tensor_t dummy;
   fLayers[0]->Backward(dummy, input);
}

#ifdef USE_PARALLEL_DEEPNET

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::ParallelForward(std::vector<TDeepNet<Architecture_t, Layer_t>> &nets,
                                                        std::vector<TTensorBatch<Architecture_t>> &batches,
                                                        bool applyDropout) -> void
{
   size_t depth = this->GetDepth();

   for (size_t i = 0; i < nets.size(); i++) {
      nets[i].GetLayerAt(0)->Forward(batches[i].GetInput(), applyDropout);
   }

   for (size_t i = 1; i < depth; i++) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayerAt(i)->Forward(nets[j].GetLayerAt(i - 1)->GetOutput(), applyDropout);
      }
   }
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::ParallelBackward(std::vector<TDeepNet<Architecture_t, Layer_t>> &nets,
                                                         std::vector<TTensorBatch<Architecture_t>> &batches,
                                                         Scalar_t learningRate) -> void
{
   std::vector<Matrix_t> inp1;
   std::vector<Matrix_t> inp2;
   size_t depth = this->GetDepth();

   for (size_t i = 0; i < nets.size(); i++) {
      evaluateGradients<Architecture_t>(nets[i].GetLayerAt(depth - 1)->GetActivationGradientsAt(0),
                                        nets[i].GetLossFunction(), batches[i].GetOutput(),
                                        nets[i].GetLayerAt(depth - 1)->GetOutputAt(0), batches[i].GetWeights());
   }

   for (size_t i = depth - 1; i > 0; i--) {
      for (size_t j = 0; j < nets.size(); j++) {
         nets[j].GetLayerAt(i)->Backward(nets[j].GetLayerAt(i - 1)->GetActivationGradients(),
                                         nets[j].GetLayerAt(i - 1)->GetOutput(), inp1, inp2);
      }
   }

   std::vector<Matrix_t> dummy;

   for (size_t i = 0; i < nets.size(); i++) {
      nets[i].GetLayerAt(0)->Backward(dummy, batches[i].GetInput(), inp1, inp2);
   }

   for (size_t i = 0; i < nets.size(); i++) {
      for (size_t j = 0; j < depth; j++) {
         Layer_t *masterLayer = this->GetLayerAt(j);
         Layer_t *layer = nets[i].GetLayerAt(j);

         masterLayer->UpdateWeights(layer->GetWeightGradients(), learningRate);
         layer->CopyWeights(masterLayer->GetWeights());

         masterLayer->UpdateBiases(layer->GetBiasGradients(), learningRate);
         layer->CopyBiases(masterLayer->GetBiases());
      }
   }
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::ParallelBackwardMomentum(std::vector<TDeepNet<Architecture_t, Layer_t>> &nets,
                                                                 std::vector<TTensorBatch<Architecture_t>> &batches,
                                                                 Scalar_t learningRate, Scalar_t momentum) -> void
{
   std::vector<Matrix_t> inp1;
   std::vector<Matrix_t> inp2;
   size_t depth = this->GetDepth();

   for (size_t i = 0; i < nets.size(); i++) {
      evaluateGradients<Architecture_t>(nets[i].GetLayerAt(depth - 1)->GetActivationGradientsAt(0),
                                        nets[i].GetLossFunction(), batches[i].GetOutput(),
                                        nets[i].GetLayerAt(depth - 1)->GetOutputAt(0), batches[i].GetWeights());
   }

   for (size_t i = depth - 1; i > 0; i--) {
      Layer_t *masterLayer = this->GetLayerAt(i);

      for (size_t j = 0; j < nets.size(); j++) {
         Layer_t *layer = nets[j].GetLayerAt(i);

         layer->Backward(nets[j].GetLayerAt(i - 1)->GetActivationGradients(), nets[j].GetLayerAt(i - 1)->GetOutput(),
                         inp1, inp2);
         masterLayer->UpdateWeightGradients(layer->GetWeightGradients(), learningRate / momentum);
         masterLayer->UpdateBiasGradients(layer->GetBiasGradients(), learningRate / momentum);
      }

      masterLayer->UpdateWeightGradients(masterLayer->GetWeightGradients(), 1.0 - momentum);
      masterLayer->UpdateBiasGradients(masterLayer->GetBiasGradients(), 1.0 - momentum);
   }

   std::vector<Matrix_t> dummy;

   Layer_t *masterFirstLayer = this->GetLayerAt(0);
   for (size_t i = 0; i < nets.size(); i++) {
      Layer_t *layer = nets[i].GetLayerAt(0);

      layer->Backward(dummy, batches[i].GetInput(), inp1, inp2);

      masterFirstLayer->UpdateWeightGradients(layer->GetWeightGradients(), learningRate / momentum);
      masterFirstLayer->UpdateBiasGradients(layer->GetBiasGradients(), learningRate / momentum);
   }

   masterFirstLayer->UpdateWeightGradients(masterFirstLayer->GetWeightGradients(), 1.0 - momentum);
   masterFirstLayer->UpdateBiasGradients(masterFirstLayer->GetBiasGradients(), 1.0 - momentum);

   for (size_t i = 0; i < depth; i++) {
      Layer_t *masterLayer = this->GetLayerAt(i);
      masterLayer->Update(1.0);

      for (size_t j = 0; j < nets.size(); j++) {
         Layer_t *layer = nets[j].GetLayerAt(i);

         layer->CopyWeights(masterLayer->GetWeights());
         layer->CopyBiases(masterLayer->GetBiases());
      }
   }
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::ParallelBackwardNestorov(std::vector<TDeepNet<Architecture_t, Layer_t>> &nets,
                                                                 std::vector<TTensorBatch<Architecture_t>> &batches,
                                                                 Scalar_t learningRate, Scalar_t momentum) -> void
{
   std::cout << "Parallel Backward Nestorov" << std::endl;
   std::vector<Matrix_t> inp1;
   std::vector<Matrix_t> inp2;
   size_t depth = this->GetDepth();

   for (size_t i = 0; i < nets.size(); i++) {
      evaluateGradients<Architecture_t>(nets[i].GetLayerAt(depth - 1)->GetActivationGradientsAt(0),
                                        nets[i].GetLossFunction(), batches[i].GetOutput(),
                                        nets[i].GetLayerAt(depth - 1)->GetOutputAt(0), batches[i].GetWeights());
   }

   for (size_t i = depth - 1; i > 0; i--) {
      for (size_t j = 0; j < nets.size(); j++) {
         Layer_t *layer = nets[j].GetLayerAt(i);

         layer->Backward(nets[j].GetLayerAt(i - 1)->GetActivationGradients(), nets[j].GetLayerAt(i - 1)->GetOutput(),
                         inp1, inp2);
      }
   }

   std::vector<Matrix_t> dummy;

   for (size_t i = 0; i < nets.size(); i++) {
      Layer_t *layer = nets[i].GetLayerAt(0);
      layer->Backward(dummy, batches[i].GetInput(), inp1, inp2);
   }

   for (size_t i = 0; i < depth; i++) {
      Layer_t *masterLayer = this->GetLayerAt(i);
      for (size_t j = 0; j < nets.size(); j++) {
         Layer_t *layer = nets[j].GetLayerAt(i);

         layer->CopyWeights(masterLayer->GetWeights());
         layer->CopyBiases(masterLayer->GetBiases());

         layer->UpdateWeights(masterLayer->GetWeightGradients(), 1.0);
         layer->UpdateBiases(masterLayer->GetBiasGradients(), 1.0);
      }

      for (size_t j = 0; j < nets.size(); j++) {
         Layer_t *layer = nets[j].GetLayerAt(i);

         masterLayer->UpdateWeightGradients(layer->GetWeightGradients(), learningRate / momentum);
         masterLayer->UpdateBiasGradients(layer->GetBiasGradients(), learningRate / momentum);
      }

      masterLayer->UpdateWeightGradients(masterLayer->GetWeightGradients(), 1.0 - momentum);
      masterLayer->UpdateBiasGradients(masterLayer->GetBiasGradients(), 1.0 - momentum);

      masterLayer->Update(1.0);
   }
}
#endif

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::Update(Scalar_t learningRate) -> void
{
   for (size_t i = 0; i < fLayers.size(); i++) {
      fLayers[i]->Update(learningRate);
   }
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::Loss(const Matrix_t &groundTruth, const Matrix_t &weights,
                                             bool includeRegularization) const -> Scalar_t
{
   auto loss = evaluate<Architecture_t>(this->GetLossFunction(), groundTruth, fLayers.back()->GetOutputAt(0), weights);

   includeRegularization &= (this->GetRegularization() != ERegularization::kNone);
   if (includeRegularization) {
      loss += RegularizationTerm();
   }

   return loss;
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::Loss(Tensor_t &input, const Matrix_t &groundTruth,
                                             const Matrix_t &weights, bool inTraining, bool includeRegularization)
   -> Scalar_t
{
   Forward(input, inTraining);
   return Loss(groundTruth, weights, includeRegularization);
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::RegularizationTerm() const -> Scalar_t
{
   Scalar_t reg = 0.0;
   for (size_t i = 0; i < fLayers.size(); i++) {
      for (size_t j = 0; j < (fLayers[i]->GetWeights()).size(); j++) {
         reg += regularization<Architecture_t>(fLayers[i]->GetWeightsAt(j), this->GetRegularization());
      }
   }
   return this->GetWeightDecay() * reg;
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::Prediction(Matrix_t &predictions, EOutputFunction f) const -> void
{
   evaluate<Architecture_t>(predictions, f, fLayers.back()->GetOutputAt(0));
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::Prediction(Matrix_t &predictions, Tensor_t &input,
                                                   EOutputFunction f) -> void
{
   Forward(input, false);

   evaluate<Architecture_t>(predictions, f, fLayers.back()->GetOutputAt(0));
}

template <typename Architecture_t, typename Layer_t>
auto TDeepNet<Architecture_t, Layer_t>::Print() const -> void
{
   std::cout << "DEEP NEURAL NETWORK: Depth = " << this->GetDepth();
   std::cout << " Input = ( " << this->GetInputDepth();
   std::cout << ", " << this->GetInputHeight();
   std::cout << ", " << this->GetInputWidth() << " )";
   std::cout << " Batch size = " << this->GetBatchSize();
   std::cout << " Loss function = " << static_cast<char>(this->GetLossFunction()) << std::endl;

   for (size_t i = 0; i < fLayers.size(); i++) {
      std::cout << "\tLayer " << i << "\t";
      fLayers[i]->Print();
   }
}

template <typename Architecture_t, typename Layer_t>
void TDeepNet<Architecture_t, Layer_t>::SetDropoutProbabilities(
   const std::vector<Double_t> &probabilities)
{
   for (size_t i = 0; i < fLayers.size(); i++) {
      if (i < probabilities.size()) {
         fLayers[i]->SetDropoutProbability(probabilities[i]);
      } else {
         fLayers[i]->SetDropoutProbability(1.0);
      }
   }
}

}
}

#endif