#ifndef TMVA_DNN_SGD
#define TMVA_DNN_SGD

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"
#include <vector>

namespace TMVA {
namespace DNN {
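
/** \class TSGD
 *  Stochastic gradient descent optimizer with (classical) momentum.
 *
 *  For every layer the optimizer keeps the accumulated past weight and bias
 *  gradients. On each update the past gradients are scaled by the momentum, the
 *  current gradients are added to them, and the parameters are then moved by the
 *  accumulated gradients scaled by minus the learning rate. With a momentum of
 *  zero this reduces to plain stochastic gradient descent.
 */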
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TSGD : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fMomentum;                                        ///< The momentum used for training.
   std::vector<std::vector<Matrix_t>> fPastWeightGradients;   ///< The accumulated past weight gradients (one vector of matrices per layer).
   std::vector<std::vector<Matrix_t>> fPastBiasGradients;     ///< The accumulated past bias gradients (one vector of matrices per layer).

   /*! Updates the weights of the layer at layerIndex, given its current weight gradients. */
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /*! Updates the biases of the layer at layerIndex, given its current bias gradients. */
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /*! Constructor. */
   TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum);

   /*! Destructor. */
   ~TSGD() = default;

   /*! Getters */
   Scalar_t GetMomentum() const { return fMomentum; }

   std::vector<std::vector<Matrix_t>> &GetPastWeightGradients() { return fPastWeightGradients; }
   std::vector<Matrix_t> &GetPastWeightGradientsAt(size_t i) { return fPastWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastBiasGradients() { return fPastBiasGradients; }
   std::vector<Matrix_t> &GetPastBiasGradientsAt(size_t i) { return fPastBiasGradients[i]; }
};
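
// Usage sketch (illustrative only, not taken from this header): assuming a deep net
// built for a concrete architecture such as TCpu<Double_t>, the optimizer is
// constructed with a learning rate and a momentum, and the training loop then lets
// the VOptimizer base class apply one update per batch through the UpdateWeights /
// UpdateBiases implementations below:
//
//    using Architecture_t = TMVA::DNN::TCpu<Double_t>;
//    using DeepNet_t      = TMVA::DNN::TDeepNet<Architecture_t>;
//    DeepNet_t net(/* batch size, input and output shapes, loss, ... */);
//    // ... add layers and initialize the network ...
//    TMVA::DNN::TSGD<Architecture_t> optimizer(/*learningRate=*/0.01, net, /*momentum=*/0.9);
//    // per batch: forward pass, backward pass, then one optimizer step.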

// Constructor: allocates the past weight and bias gradient tensors for every
// layer of the network and initializes them to zero.
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TSGD<Architecture_t, Layer_t, DeepNet_t>::TSGD(Scalar_t learningRate, DeepNet_t &deepNet, Scalar_t momentum)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   size_t layersNSlices = layers.size();
   fPastWeightGradients.resize(layersNSlices);
   fPastBiasGradients.resize(layersNSlices);

   for (size_t i = 0; i < layersNSlices; i++) {
      // Create the past weight gradient tensors with the same shapes as the
      // layer's weights and set them to zero.
      Architecture_t::CreateWeightTensors(fPastWeightGradients[i], layers[i]->GetWeights());
      size_t weightsNSlices = fPastWeightGradients[i].size();
      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastWeightGradients[i][j], EInitialization::kZero);
      }

      // Same for the past bias gradients.
      Architecture_t::CreateWeightTensors(fPastBiasGradients[i], layers[i]->GetBiases());
      size_t biasesNSlices = fPastBiasGradients[i].size();
      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastBiasGradients[i][j], EInitialization::kZero);
      }
   }
}

// Updates the weights of the layer at layerIndex: the current weight gradients are
// folded into the stored past gradients (scaled by the momentum), and the weights
// are then moved along the negative accumulated gradient, scaled by the learning rate.
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                             const std::vector<Matrix_t> &weightGradients) -> void
{
   // v = momentum * v + g : accumulate the current gradients into the past gradients.
   std::vector<Matrix_t> &currentLayerPastWeightGradients = this->GetPastWeightGradientsAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastWeightGradients.size(); k++) {
      Architecture_t::ConstMult(currentLayerPastWeightGradients[k], this->GetMomentum());
      Architecture_t::ScaleAdd(currentLayerPastWeightGradients[k], weightGradients[k], 1.0);
   }

   // w = w - learningRate * v : update the weights.
   for (size_t i = 0; i < weights.size(); i++) {
      Architecture_t::ScaleAdd(weights[i], currentLayerPastWeightGradients[i], -this->GetLearningRate());
   }
}
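
// Worked example of the update above (illustrative numbers): with momentum = 0.9,
// learningRate = 0.1, a constant gradient g = 1 and past gradients starting at zero,
//    step 1: v = 0.9 * 0.0 + 1 = 1.0,   w -= 0.1 * 1.0
//    step 2: v = 0.9 * 1.0 + 1 = 1.9,   w -= 0.1 * 1.9
// so gradients that keep pointing in the same direction accumulate, up to a factor
// of 1 / (1 - momentum) relative to plain SGD.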

// Updates the biases of the layer at layerIndex, using the same momentum scheme as
// UpdateWeights.
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TSGD<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                            const std::vector<Matrix_t> &biasGradients) -> void
{
   // v = momentum * v + g : accumulate the current gradients into the past gradients.
   std::vector<Matrix_t> &currentLayerPastBiasGradients = this->GetPastBiasGradientsAt(layerIndex);

   for (size_t k = 0; k < currentLayerPastBiasGradients.size(); k++) {
      Architecture_t::ConstMult(currentLayerPastBiasGradients[k], this->GetMomentum());
      Architecture_t::ScaleAdd(currentLayerPastBiasGradients[k], biasGradients[k], 1.0);
   }

   // b = b - learningRate * v : update the biases.
   for (size_t i = 0; i < biases.size(); i++) {
      Architecture_t::ScaleAdd(biases[i], currentLayerPastBiasGradients[i], -this->GetLearningRate());
   }
}

} // namespace DNN
} // namespace TMVA

#endif