// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
 * Package: TMVA                                                                  *
 * Class  : TRMSProp                                                              *
 *                                                                                *
 * Description:                                                                   *
 *      RMSProp Optimizer Class                                                   *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Ravi Kiran S      <sravikiran0606@gmail.com>  - CERN, Switzerland         *
 *                                                                                *
 * Copyright (c) 2005-2018:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (see tmva/doc/LICENSE)                                                         *
 **********************************************************************************/

#ifndef TMVA_DNN_RMSPROP
#define TMVA_DNN_RMSPROP

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"
#include <vector>

namespace TMVA {
namespace DNN {

/** \class TRMSProp
 *  RMSProp Optimizer class
 *
 *  This class represents the RMSProp Optimizer with options for applying momentum.
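 *
 *  The update rule, as implemented in UpdateWeights and UpdateBiases below
 *  (g_t denotes the current weight or bias gradient), is:
 *
 *      V_t     = rho * V_{t-1} + (1 - rho) * g_t^2
 *      W_t     = momentum * W_{t-1} + learningRate * g_t / sqrt(V_t + epsilon)
 *      theta_t = theta_{t-1} - W_t
 *
 *  A minimal usage sketch (illustrative only; it assumes `net` is an already
 *  constructed DeepNet_t and omits the surrounding training loop):
 *  \code
 *  // illustrative: constructed with the default hyperparameters declared below
 *  TRMSProp<Architecture_t> optimizer(net, 0.001, 0.0, 0.9, 1e-7);
 *  \endcode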
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TRMSProp : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fMomentum; ///< The momentum used for training.
   Scalar_t fRho;      ///< The Rho constant used by the optimizer.
   Scalar_t fEpsilon;  ///< The smoothing term used to avoid division by zero.
   std::vector<std::vector<Matrix_t>>
      fPastSquaredWeightGradients; ///< The sum of the square of the past weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>>
      fPastSquaredBiasGradients; ///< The sum of the square of the past bias gradients associated with the deep net.

   std::vector<std::vector<Matrix_t>> fWeightUpdates; ///< The accumulated past weight updates used to update the weights.
   std::vector<std::vector<Matrix_t>> fBiasUpdates;   ///< The accumulated past bias updates used to update the biases.
   std::vector<std::vector<Matrix_t>>
      fWorkWeightTensor1; ///< Working tensor used to keep a temporary copy of weights or weight gradients.
   std::vector<std::vector<Matrix_t>>
      fWorkBiasTensor1; ///< Working tensor used to keep a temporary copy of biases or bias gradients.
   std::vector<std::vector<Matrix_t>>
      fWorkWeightTensor2; ///< Working tensor used to keep a temporary copy of weights or weight gradients.
   std::vector<std::vector<Matrix_t>>
      fWorkBiasTensor2; ///< Working tensor used to keep a temporary copy of biases or bias gradients.

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate = 0.001, Scalar_t momentum = 0.0, Scalar_t rho = 0.9,
            Scalar_t epsilon = 1e-7);

   /*! Destructor. */
   ~TRMSProp() = default;

   /*! Getters */
   Scalar_t GetMomentum() const { return fMomentum; }
   Scalar_t GetRho() const { return fRho; }
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetWeightUpdates() { return fWeightUpdates; }
   std::vector<Matrix_t> &GetWeightUpdatesAt(size_t i) { return fWeightUpdates[i]; }

   std::vector<std::vector<Matrix_t>> &GetBiasUpdates() { return fBiasUpdates; }
   std::vector<Matrix_t> &GetBiasUpdatesAt(size_t i) { return fBiasUpdates[i]; }
};

//
//
//  The RMSProp Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TRMSProp<Architecture_t, Layer_t, DeepNet_t>::TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t momentum,
                                                       Scalar_t rho, Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum), fRho(rho),
     fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fPastSquaredWeightGradients.resize(layersNSlices);
   fPastSquaredBiasGradients.resize(layersNSlices);
   fWeightUpdates.resize(layersNSlices);
   fBiasUpdates.resize(layersNSlices);
   fWorkWeightTensor1.resize(layersNSlices);
   fWorkBiasTensor1.resize(layersNSlices);
   fWorkWeightTensor2.resize(layersNSlices);
   fWorkBiasTensor2.resize(layersNSlices);

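   // For each layer, create accumulator and work tensors with the same shapes as the
   // layer's weight and bias matrices, and zero-initialize the running accumulators.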
   for (size_t i = 0; i < layersNSlices; i++) {
      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWeightUpdates[i], layers[i]->GetWeights());

      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fWeightUpdates[i][j], EInitialization::kZero);
      }

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredBiasGradients[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fBiasUpdates[i], layers[i]->GetBiases());

      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fBiasUpdates[i][j], EInitialization::kZero);
      }
      Architecture_t::CreateWeightTensors(fWorkWeightTensor1[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor1[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fWorkWeightTensor2[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor2[i], layers[i]->GetBiases());
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                 const std::vector<Matrix_t> &weightGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerWeightUpdates = this->GetWeightUpdatesAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastSquaredWeightGradients.size(); k++) {

      // accumulation matrix used for temporarily storing the current accumulation
      auto &accumulation = fWorkWeightTensor1[layerIndex][k];
      auto &currentSquaredWeightGradients = fWorkWeightTensor2[layerIndex][k];

      // Vt = rho * Vt-1 + (1 - rho) * currentSquaredWeightGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
      Architecture_t::SquareElementWise(currentSquaredWeightGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[k], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightGradients[k], accumulation);

      // Wt = momentum * Wt-1 + (learningRate * currentWeightGradients) / (sqrt(Vt + epsilon))
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &dummy = fWorkWeightTensor2[layerIndex][k]; // reuse the working tensor
      Architecture_t::Copy(dummy, currentLayerPastSquaredWeightGradients[k]);
      Architecture_t::ConstAdd(dummy, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy);
      Architecture_t::ReciprocalElementWise(dummy);
      Architecture_t::Hadamard(dummy, weightGradients[k]);

      Architecture_t::ScaleAdd(accumulation, currentLayerWeightUpdates[k], this->GetMomentum());
      Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
      Architecture_t::Copy(currentLayerWeightUpdates[k], accumulation);
   }

   // updating the weights
   // theta = theta - Wt
   for (size_t i = 0; i < weights.size(); i++) {
      Architecture_t::ScaleAdd(weights[i], currentLayerWeightUpdates[i], -1.0);
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerBiasUpdates = this->GetBiasUpdatesAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastSquaredBiasGradients.size(); k++) {

      // accumulation matrix used for temporarily storing the current accumulation
      auto &accumulation = fWorkBiasTensor1[layerIndex][k];
      auto &currentSquaredBiasGradients = fWorkBiasTensor2[layerIndex][k];

      // Vt = rho * Vt-1 + (1 - rho) * currentSquaredBiasGradients
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
      Architecture_t::SquareElementWise(currentSquaredBiasGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[k], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasGradients[k], accumulation);

      // Wt = momentum * Wt-1 + (learningRate * currentBiasGradients) / (sqrt(Vt + epsilon))
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &dummy = fWorkBiasTensor2[layerIndex][k]; // reuse the working tensor

      Architecture_t::Copy(dummy, currentLayerPastSquaredBiasGradients[k]);
      Architecture_t::ConstAdd(dummy, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy);
      Architecture_t::ReciprocalElementWise(dummy);
      Architecture_t::Hadamard(dummy, biasGradients[k]);

      Architecture_t::ScaleAdd(accumulation, currentLayerBiasUpdates[k], this->GetMomentum());
      Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
      Architecture_t::Copy(currentLayerBiasUpdates[k], accumulation);
   }

   // updating the biases
   // theta = theta - Wt
   for (size_t i = 0; i < biases.size(); i++) {
      Architecture_t::ScaleAdd(biases[i], currentLayerBiasUpdates[i], -1.0);
   }
}

} // namespace DNN
} // namespace TMVA

#endif