// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
 * Package: TMVA                                                                  *
 * Class  : TAdam                                                                 *
 *                                                                                *
 * Description:                                                                   *
 *      Adam Optimizer Class                                                      *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Ravi Kiran S      <sravikiran0606@gmail.com>  - CERN, Switzerland         *
 *                                                                                *
 * Copyright (c) 2005-2018:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (see tmva/doc/LICENSE)                                                         *
 **********************************************************************************/

#ifndef TMVA_DNN_ADAM
#define TMVA_DNN_ADAM

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"
#include <vector>

namespace TMVA {
namespace DNN {

/** \class TAdam
 *  Adam Optimizer class
 *
 *  This class represents the Adam Optimizer.
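 *
 *  Following the formulation used in the update methods below (the variant given
 *  before Section 2.1 of Kingma & Ba, https://arxiv.org/abs/1412.6980), at global
 *  step \f$t\f$ the optimizer keeps decaying averages of the past gradients and
 *  squared gradients,
 *  \f$ m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t \f$ and
 *  \f$ v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2 \f$,
 *  and updates each parameter \f$\theta\f$ as
 *  \f$ \theta_t = \theta_{t-1} - \alpha_t \, m_t / (\sqrt{v_t} + \epsilon) \f$,
 *  where \f$ \alpha_t = \eta \, \sqrt{1 - \beta_2^t} / (1 - \beta_1^t) \f$
 *  absorbs the bias corrections into the learning rate \f$\eta\f$.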
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdam : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fBeta1;   ///< The Beta1 constant used by the optimizer.
   Scalar_t fBeta2;   ///< The Beta2 constant used by the optimizer.
   Scalar_t fEpsilon; ///< The smoothing term used to avoid division by zero.

   std::vector<std::vector<Matrix_t>> fFirstMomentWeights; ///< The decaying average of the first moment of the past
                                                           ///< weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>> fFirstMomentBiases;  ///< The decaying average of the first moment of the past
                                                           ///< bias gradients associated with the deep net.

   std::vector<std::vector<Matrix_t>> fSecondMomentWeights; ///< The decaying average of the second moment of the past
                                                            ///< weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>> fSecondMomentBiases;  ///< The decaying average of the second moment of the past
                                                            ///< bias gradients associated with the deep net.

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TAdam(DeepNet_t &deepNet, Scalar_t learningRate = 0.001, Scalar_t beta1 = 0.9, Scalar_t beta2 = 0.999,
         Scalar_t epsilon = 1e-7);

   /*! Destructor. */
   ~TAdam() = default;

   /*! Getters */
   Scalar_t GetBeta1() const { return fBeta1; }
   Scalar_t GetBeta2() const { return fBeta2; }
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetFirstMomentWeights() { return fFirstMomentWeights; }
   std::vector<Matrix_t> &GetFirstMomentWeightsAt(size_t i) { return fFirstMomentWeights[i]; }

   std::vector<std::vector<Matrix_t>> &GetFirstMomentBiases() { return fFirstMomentBiases; }
   std::vector<Matrix_t> &GetFirstMomentBiasesAt(size_t i) { return fFirstMomentBiases[i]; }

   std::vector<std::vector<Matrix_t>> &GetSecondMomentWeights() { return fSecondMomentWeights; }
   std::vector<Matrix_t> &GetSecondMomentWeightsAt(size_t i) { return fSecondMomentWeights[i]; }

   std::vector<std::vector<Matrix_t>> &GetSecondMomentBiases() { return fSecondMomentBiases; }
   std::vector<Matrix_t> &GetSecondMomentBiasesAt(size_t i) { return fSecondMomentBiases[i]; }
};
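
// Usage sketch (illustrative only, not part of this header): assuming a fully
// built TDeepNet `deepNet` for the CPU architecture, the optimizer is
// constructed once and then stepped after each backward pass. Step() is
// inherited from VOptimizer and applies UpdateWeights/UpdateBiases to every
// layer; the global step entering the bias correction must be advanced first,
// since at t = 0 the denominator 1 - beta1^t would vanish.
//
//    using Architecture_t = TMVA::DNN::TCpu<Double_t>;
//    TMVA::DNN::TAdam<Architecture_t> adam(deepNet, /*learningRate=*/1.E-3);
//    // ... forward pass, loss evaluation, backward pass ...
//    adam.IncrementGlobalStep(); // advance t used in the bias correction
//    adam.Step();                // Adam update of all weights and biases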

//
//
//  The Adam Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdam<Architecture_t, Layer_t, DeepNet_t>::TAdam(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t beta1,
                                                 Scalar_t beta2, Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fBeta1(beta1), fBeta2(beta2),
     fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fFirstMomentWeights.resize(layersNSlices);
   fFirstMomentBiases.resize(layersNSlices);
   fSecondMomentWeights.resize(layersNSlices);
   fSecondMomentBiases.resize(layersNSlices);

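   // Allocate the first- and second-moment accumulators with the same shapes as
   // each layer's weights and biases, and start them at zero as the algorithm
   // prescribes.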
   for (size_t i = 0; i < layersNSlices; i++) {

      Architecture_t::CreateWeightTensors(fFirstMomentWeights[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fSecondMomentWeights[i], layers[i]->GetWeights());

      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fFirstMomentWeights[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fSecondMomentWeights[i][j], EInitialization::kZero);
      }

      Architecture_t::CreateWeightTensors(fFirstMomentBiases[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fSecondMomentBiases[i], layers[i]->GetBiases());

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fFirstMomentBiases[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fSecondMomentBiases[i][j], EInitialization::kZero);
      }
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                              const std::vector<Matrix_t> &weightGradients) -> void
{
   // Update of the weights using the Adam algorithm. We use the formulation given
   // before Section 2.1 of the original paper, "Adam: A Method for Stochastic
   // Optimization", D. Kingma and J. Ba, https://arxiv.org/abs/1412.6980.

   std::vector<Matrix_t> &currentLayerFirstMomentWeights = this->GetFirstMomentWeightsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerSecondMomentWeights = this->GetSecondMomentWeightsAt(layerIndex);

   // alpha = learningRate * sqrt(1 - beta2^t) / (1 - beta1^t)
   Scalar_t alpha = (this->GetLearningRate()) * (sqrt(1 - pow(this->GetBeta2(), this->GetGlobalStep()))) /
                    (1 - pow(this->GetBeta1(), this->GetGlobalStep()));
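   // The bias corrections for the first and second moments are folded into this
   // single step size, so the raw moment accumulators can be used directly below.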

   // Adam update of the first and second moments of the weights
   for (size_t i = 0; i < weights.size(); i++) {
      // Mt = beta1 * Mt-1 + (1 - beta1) * WeightGradients
      Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentWeights[i], weightGradients[i], this->GetBeta1());
      // Vt = beta2 * Vt-1 + (1 - beta2) * WeightGradients^2
      Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentWeights[i], weightGradients[i], this->GetBeta2());
      // Weight = Weight - alpha * Mt / (sqrt(Vt) + epsilon)
      Architecture_t::AdamUpdate(weights[i], currentLayerFirstMomentWeights[i], currentLayerSecondMomentWeights[i],
                                 alpha, this->GetEpsilon());
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdam<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                             const std::vector<Matrix_t> &biasGradients) -> void
{
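   // Same Adam formulation as in UpdateWeights, applied to the bias terms.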
   std::vector<Matrix_t> &currentLayerFirstMomentBiases = this->GetFirstMomentBiasesAt(layerIndex);
   std::vector<Matrix_t> &currentLayerSecondMomentBiases = this->GetSecondMomentBiasesAt(layerIndex);

   // alpha = learningRate * sqrt(1 - beta2^t) / (1 - beta1^t)
   Scalar_t alpha = (this->GetLearningRate()) * (sqrt(1 - pow(this->GetBeta2(), this->GetGlobalStep()))) /
                    (1 - pow(this->GetBeta1(), this->GetGlobalStep()));

   // Adam update of the first and second moments of the biases
   for (size_t i = 0; i < biases.size(); i++) {
      // Mt = beta1 * Mt-1 + (1 - beta1) * BiasGradients
      Architecture_t::AdamUpdateFirstMom(currentLayerFirstMomentBiases[i], biasGradients[i], this->GetBeta1());
      // Vt = beta2 * Vt-1 + (1 - beta2) * BiasGradients^2
      Architecture_t::AdamUpdateSecondMom(currentLayerSecondMomentBiases[i], biasGradients[i], this->GetBeta2());
      // theta = theta - alpha * Mt / (sqrt(Vt) + epsilon)
      Architecture_t::AdamUpdate(biases[i], currentLayerFirstMomentBiases[i], currentLayerSecondMomentBiases[i],
                                 alpha, this->GetEpsilon());
   }
}

} // namespace DNN
} // namespace TMVA

#endif