// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
 * Package: TMVA                                                                  *
 * Class  : TSGD                                                                  *
 *                                                                                *
 *                                                                                *
 * Description:                                                                   *
 *      Stochastic Batch Gradient Descent Optimizer Class                         *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Ravi Kiran S      <sravikiran0606@gmail.com>  - CERN, Switzerland         *
 *                                                                                *
 * Copyright (c) 2005-2018:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (see tmva/doc/LICENSE)                                                         *
 **********************************************************************************/

#ifndef TMVA_DNN_SGD
#define TMVA_DNN_SGD

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"
#include <vector>

namespace TMVA {
namespace DNN {

/** \class TSGD
 *  Stochastic Batch Gradient Descent Optimizer class
 *
 *  This class implements the Stochastic Batch Gradient Descent Optimizer, with options for applying
 *  momentum and Nesterov momentum.
 */
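// Note on the update rule: classic momentum, as applied in UpdateWeights and
// UpdateBiases below, accumulates a velocity V_t = momentum * V_{t-1} + g_t and
// then steps theta = theta - learningRate * V_t. The Nesterov variant differs
// in that the gradient g_t is evaluated at the look-ahead point
// theta - learningRate * momentum * V_{t-1}.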
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TSGD : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fMomentum; ///< The momentum used for training.
   std::vector<std::vector<Matrix_t>>
      fPastWeightGradients; ///< The momentum-weighted accumulation (velocity) of past weight gradients of the deep net.
   std::vector<std::vector<Matrix_t>>
      fPastBiasGradients; ///< The momentum-weighted accumulation (velocity) of past bias gradients of the deep net.

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum);

   /*! Destructor. */
   ~TSGD() = default;

   /*! Getters */
   Scalar_t GetMomentum() const { return fMomentum; }

   std::vector<std::vector<Matrix_t>> &GetPastWeightGradients() { return fPastWeightGradients; }
   std::vector<Matrix_t> &GetPastWeightGradientsAt(size_t i) { return fPastWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastBiasGradients() { return fPastBiasGradients; }
   std::vector<Matrix_t> &GetPastBiasGradientsAt(size_t i) { return fPastBiasGradients[i]; }
};
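// A minimal usage sketch, assuming a deep net built on a concrete backend such
// as TCpu<Double_t>; the net construction and training loop are illustrative
// and not prescribed by this header:
//
//    TDeepNet<TCpu<Double_t>> net /* = ... layers configured elsewhere ... */;
//    TSGD<TCpu<Double_t>> optimizer(/*learningRate=*/0.01, net, /*momentum=*/0.9);
//    // Per batch: run the forward and backward passes, then perform one
//    // optimizer step, which applies UpdateWeights/UpdateBiases to each layer.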

//
//  The Stochastic Gradient Descent Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TSGD<Architecture_t, Layer_t, DeepNet_t>::TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   size_t layersNSlices = layers.size();
   fPastWeightGradients.resize(layersNSlices);
   fPastBiasGradients.resize(layersNSlices);

   for (size_t i = 0; i < layersNSlices; i++) {
      // Allocate past-gradient (velocity) tensors with the same shapes as the
      // layer's weight tensors, and initialize them to zero.
      Architecture_t::CreateWeightTensors(fPastWeightGradients[i], layers[i]->GetWeights());
      size_t weightsNSlices = fPastWeightGradients[i].size();
      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastWeightGradients[i][j], EInitialization::kZero);
      }

      // Same for the bias tensors.
      Architecture_t::CreateWeightTensors(fPastBiasGradients[i], layers[i]->GetBiases());
      size_t biasesNSlices = fPastBiasGradients[i].size();
      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastBiasGradients[i][j], EInitialization::kZero);
      }
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                             const std::vector<Matrix_t> &weightGradients) -> void
{
   // Fold the current weight gradients into the layer's past weight gradients (velocity):
   // Vt = momentum * Vt-1 + currentGradients
   std::vector<Matrix_t> &currentLayerPastWeightGradients = this->GetPastWeightGradientsAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastWeightGradients.size(); k++) {
      Architecture_t::ConstMult(currentLayerPastWeightGradients[k], this->GetMomentum());
      Architecture_t::ScaleAdd(currentLayerPastWeightGradients[k], weightGradients[k], 1.0);
   }

   // Update the weights:
   // theta = theta - learningRate * Vt
   for (size_t i = 0; i < weights.size(); i++) {
      Architecture_t::ScaleAdd(weights[i], currentLayerPastWeightGradients[i], -this->GetLearningRate());
   }
}
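// Note: unrolling the recurrence above, the velocity is an exponentially
// weighted sum of past gradients, V_t = g_t + m * g_{t-1} + m^2 * g_{t-2} + ...,
// so for a constant gradient g the step approaches learningRate * g / (1 - m);
// e.g. momentum = 0.9 amplifies the effective step by up to a factor of 10.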

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                            const std::vector<Matrix_t> &biasGradients) -> void
{
   // Fold the current bias gradients into the layer's past bias gradients (velocity):
   // Vt = momentum * Vt-1 + currentGradients
   std::vector<Matrix_t> &currentLayerPastBiasGradients = this->GetPastBiasGradientsAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastBiasGradients.size(); k++) {
      Architecture_t::ConstMult(currentLayerPastBiasGradients[k], this->GetMomentum());
      Architecture_t::ScaleAdd(currentLayerPastBiasGradients[k], biasGradients[k], 1.0);
   }

   // Update the biases:
   // theta = theta - learningRate * Vt
   for (size_t i = 0; i < biases.size(); i++) {
      Architecture_t::ScaleAdd(biases[i], currentLayerPastBiasGradients[i], -this->GetLearningRate());
   }
}

} // namespace DNN
} // namespace TMVA

#endif