// @(#)root/tmva/tmva/dnn:$Id$
// Author: Ravi Kiran S

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
 * Package: TMVA                                                                  *
 * Class  : TAdagrad                                                              *
 *                                                                                *
 * Description:                                                                   *
 *      Adagrad Optimizer Class                                                   *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Ravi Kiran S      <sravikiran0606@gmail.com>  - CERN, Switzerland         *
 *                                                                                *
 * Copyright (c) 2005-2018:                                                       *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *      U. of Bonn, Germany                                                       *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (see tmva/doc/LICENSE)                                                         *
 **********************************************************************************/

#ifndef TMVA_DNN_ADAGRAD
#define TMVA_DNN_ADAGRAD

#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"
#include "TMatrix.h"

#include <cassert> // assert() is used in the update methods below
#include <vector>

namespace TMVA {
namespace DNN {

/** \class TAdagrad
 *  Adagrad Optimizer class
 *
 *  This class implements the Adagrad optimizer, which adapts the step size of
 *  each parameter individually using the accumulated sum of its past squared
 *  gradients.
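 *
 *  With learning rate \f$\eta\f$, smoothing term \f$\epsilon\f$ and current
 *  gradient \f$g_t\f$, the update implemented below accumulates
 *  \f$G_t = G_{t-1} + g_t \odot g_t\f$ and applies, element-wise,
 *  \f$\theta_{t+1} = \theta_t - \eta\, g_t / \sqrt{G_t + \epsilon}\f$.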
 */
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdagrad : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fEpsilon; ///< The smoothing term used to avoid division by zero.

   std::vector<std::vector<Matrix_t>>
      fPastSquaredWeightGradients; ///< The sum of the square of the past weight gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>>
      fPastSquaredBiasGradients; ///< The sum of the square of the past bias gradients associated with the deep net.
   std::vector<std::vector<Matrix_t>>
      fWorkWeightTensor; ///< working tensor used to keep a temporary copy of weights or weight gradients
   std::vector<std::vector<Matrix_t>>
      fWorkBiasTensor; ///< working tensor used to keep a temporary copy of biases or bias gradients

   /*! Update the weights, given the current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Update the biases, given the current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate = 0.01, Scalar_t epsilon = 1e-8);

   /*! Destructor. */
   ~TAdagrad() = default;

   /*! Getters */
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }
};
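
// A minimal usage sketch (illustrative, not part of the original header):
// assuming a concrete architecture such as TCpu<Double_t> and a fully
// constructed deep net whose layer gradients have been filled by a backward
// pass, one Adagrad step can be applied via the Step() method inherited from
// VOptimizer:
//
//    TAdagrad<TCpu<Double_t>> optimizer(deepNet, /*learningRate=*/0.01, /*epsilon=*/1e-8);
//    optimizer.Step(); // updates the weights and biases of every layer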

//
//
//  The Adagrad Optimizer Class - Implementation
//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdagrad<Architecture_t, Layer_t, DeepNet_t>::TAdagrad(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fPastSquaredWeightGradients.resize(layersNSlices);
   fPastSquaredBiasGradients.resize(layersNSlices);
   fWorkWeightTensor.resize(layersNSlices);
   fWorkBiasTensor.resize(layersNSlices);

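   // For every layer, allocate the accumulators of the past squared gradients
   // and the work tensors, and zero-initialize the accumulators.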
   for (size_t i = 0; i < layersNSlices; i++) {
      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      // the weight and weight gradient tensors have the same dimensions, so
      // the weights serve as the shape template for the accumulators
      Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());

      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
      }

      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredBiasGradients[i], layers[i]->GetBiases());

      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
      }

      Architecture_t::CreateWeightTensors(fWorkWeightTensor[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor[i], layers[i]->GetBiases());
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                 const std::vector<Matrix_t> &weightGradients) -> void
{
   auto &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);

   const size_t weightsNSlices = weights.size();
   assert(currentLayerPastSquaredWeightGradients.size() == weightsNSlices);

   for (size_t i = 0; i < weightsNSlices; i++) {

      // accumulate: Vt = Vt-1 + g^2
      auto &currentSquaredWeightGradients = fWorkWeightTensor[layerIndex][i];
      Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredWeightGradients);
      Architecture_t::ScaleAdd(currentLayerPastSquaredWeightGradients[i], currentSquaredWeightGradients, 1.0);

      // update the weights:
      // theta = theta - learningRate * g / sqrt(Vt + epsilon)

      auto &currentWeightUpdates = fWorkWeightTensor[layerIndex][i]; // reuse the work tensor for the weight updates now
      Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]); // Vt
      Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());                    // Vt + epsilon
      Architecture_t::SqrtElementWise(currentWeightUpdates);                                 // sqrt(Vt + epsilon)
      Architecture_t::ReciprocalElementWise(currentWeightUpdates);                           // 1 / sqrt(Vt + epsilon)
      Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);                    // g / sqrt(Vt + epsilon)
      Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());  // theta -= lr * update
   }
}

//_________________________________________________________________________________________________
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdagrad<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);

   const size_t biasesNSlices = biases.size();
   assert(currentLayerPastSquaredBiasGradients.size() == biasesNSlices);
   for (size_t i = 0; i < biasesNSlices; i++) {

      // accumulate: Vt = Vt-1 + g^2
      auto &currentSquaredBiasGradients = fWorkBiasTensor[layerIndex][i];
      Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredBiasGradients);
      Architecture_t::ScaleAdd(currentLayerPastSquaredBiasGradients[i], currentSquaredBiasGradients, 1.0);

      // update the biases:
      // theta = theta - learningRate * g / sqrt(Vt + epsilon)

      auto &currentBiasUpdates = fWorkBiasTensor[layerIndex][i]; // reuse the work tensor for the bias updates now
      Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]); // Vt
      Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());                  // Vt + epsilon
      Architecture_t::SqrtElementWise(currentBiasUpdates);                               // sqrt(Vt + epsilon)
      Architecture_t::ReciprocalElementWise(currentBiasUpdates);                         // 1 / sqrt(Vt + epsilon)
      Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);                    // g / sqrt(Vt + epsilon)
      Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate()); // theta -= lr * update
   }
}

} // namespace DNN
} // namespace TMVA

#endif