// NOTE(review): the original file header (ROOT/TMVA copyright and author banner)
// was destroyed by an indexing/extraction artifact ("File indexing completed ..."
// followed by bare line numbers). Restore the standard banner from upstream ROOT
// (tmva/tmva/inc/TMVA/DNN/RMSProp.h) before committing.
#ifndef TMVA_DNN_RMSPROP
#define TMVA_DNN_RMSPROP

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

#include <vector>

namespace TMVA {
namespace DNN {

0043 template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
0044 typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
0045 class TRMSProp : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
0046 public:
0047 using Matrix_t = typename Architecture_t::Matrix_t;
0048 using Scalar_t = typename Architecture_t::Scalar_t;
0049
0050 protected:
0051 Scalar_t fMomentum;
0052 Scalar_t fRho;
0053 Scalar_t fEpsilon;
0054 std::vector<std::vector<Matrix_t>>
0055 fPastSquaredWeightGradients;
0056 std::vector<std::vector<Matrix_t>>
0057 fPastSquaredBiasGradients;
0058
0059 std::vector<std::vector<Matrix_t>> fWeightUpdates;
0060 std::vector<std::vector<Matrix_t>> fBiasUpdates;
0061 std::vector<std::vector<Matrix_t>>
0062 fWorkWeightTensor1;
0063 std::vector<std::vector<Matrix_t>>
0064 fWorkBiasTensor1;
0065 std::vector<std::vector<Matrix_t>>
0066 fWorkWeightTensor2;
0067 std::vector<std::vector<Matrix_t>>
0068 fWorkBiasTensor2;
0069
0070
0071 void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);
0072
0073
0074 void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);
0075
0076 public:
0077
0078 TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate = 0.001, Scalar_t momentum = 0.0, Scalar_t rho = 0.9,
0079 Scalar_t epsilon = 1e-7);
0080
0081
0082 ~TRMSProp() = default;
0083
0084
0085 Scalar_t GetMomentum() const { return fMomentum; }
0086 Scalar_t GetRho() const { return fRho; }
0087 Scalar_t GetEpsilon() const { return fEpsilon; }
0088
0089 std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
0090 std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }
0091
0092 std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
0093 std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }
0094
0095 std::vector<std::vector<Matrix_t>> &GetWeightUpdates() { return fWeightUpdates; }
0096 std::vector<Matrix_t> &GetWeightUpdatesAt(size_t i) { return fWeightUpdates[i]; }
0097
0098 std::vector<std::vector<Matrix_t>> &GetBiasUpdates() { return fBiasUpdates; }
0099 std::vector<Matrix_t> &GetBiasUpdatesAt(size_t i) { return fBiasUpdates[i]; }
0100 };
0106 template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
0107 TRMSProp<Architecture_t, Layer_t, DeepNet_t>::TRMSProp(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t momentum,
0108 Scalar_t rho, Scalar_t epsilon)
0109 : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fMomentum(momentum), fRho(rho),
0110 fEpsilon(epsilon)
0111 {
0112 std::vector<Layer_t *> &layers = deepNet.GetLayers();
0113 const size_t layersNSlices = layers.size();
0114 fPastSquaredWeightGradients.resize(layersNSlices);
0115 fPastSquaredBiasGradients.resize(layersNSlices);
0116 fWeightUpdates.resize(layersNSlices);
0117 fBiasUpdates.resize(layersNSlices);
0118 fWorkWeightTensor1.resize(layersNSlices);
0119 fWorkBiasTensor1.resize(layersNSlices);
0120 fWorkWeightTensor2.resize(layersNSlices);
0121 fWorkBiasTensor2.resize(layersNSlices);
0122
0123 for (size_t i = 0; i < layersNSlices; i++) {
0124 const size_t weightsNSlices = (layers[i]->GetWeights()).size();
0125
0126 Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());
0127 Architecture_t::CreateWeightTensors(fWeightUpdates[i], layers[i]->GetWeights());
0128
0129 for (size_t j = 0; j < weightsNSlices; j++) {
0130 initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
0131 initialize<Architecture_t>(fWeightUpdates[i][j], EInitialization::kZero);
0132 }
0133
0134 const size_t biasesNSlices = (layers[i]->GetBiases()).size();
0135
0136 Architecture_t::CreateWeightTensors( fPastSquaredBiasGradients[i], layers[i]->GetBiases());
0137 Architecture_t::CreateWeightTensors( fBiasUpdates[i], layers[i]->GetBiases());
0138
0139 for (size_t j = 0; j < biasesNSlices; j++) {
0140 initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
0141 initialize<Architecture_t>(fBiasUpdates[i][j], EInitialization::kZero);
0142 }
0143 Architecture_t::CreateWeightTensors(fWorkWeightTensor1[i], layers[i]->GetWeights());
0144 Architecture_t::CreateWeightTensors(fWorkBiasTensor1[i], layers[i]->GetBiases());
0145 Architecture_t::CreateWeightTensors(fWorkWeightTensor2[i], layers[i]->GetWeights());
0146 Architecture_t::CreateWeightTensors(fWorkBiasTensor2[i], layers[i]->GetBiases());
0147 }
0148 }
0151 template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
0152 auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
0153 const std::vector<Matrix_t> &weightGradients) -> void
0154 {
0155 std::vector<Matrix_t> ¤tLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
0156 std::vector<Matrix_t> ¤tLayerWeightUpdates = this->GetWeightUpdatesAt(layerIndex);
0157
0158 for (size_t k = 0; k < currentLayerPastSquaredWeightGradients.size(); k++) {
0159
0160
0161 auto &accumulation = fWorkWeightTensor1[layerIndex][k];
0162 auto ¤tSquaredWeightGradients = fWorkWeightTensor2[layerIndex][k];
0163
0164
0165 initialize<Architecture_t>(accumulation, EInitialization::kZero);
0166
0167 Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[k]);
0168 Architecture_t::SquareElementWise(currentSquaredWeightGradients);
0169 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[k], this->GetRho());
0170 Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
0171 Architecture_t::Copy(currentLayerPastSquaredWeightGradients[k], accumulation);
0172
0173
0174 initialize<Architecture_t>(accumulation, EInitialization::kZero);
0175 auto &dummy = fWorkWeightTensor2[layerIndex][k];
0176 Architecture_t::Copy(dummy, currentLayerPastSquaredWeightGradients[k]);
0177 Architecture_t::ConstAdd(dummy, this->GetEpsilon());
0178 Architecture_t::SqrtElementWise(dummy);
0179 Architecture_t::ReciprocalElementWise(dummy);
0180 Architecture_t::Hadamard(dummy, weightGradients[k]);
0181
0182 Architecture_t::ScaleAdd(accumulation, currentLayerWeightUpdates[k], this->GetMomentum());
0183 Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
0184 Architecture_t::Copy(currentLayerWeightUpdates[k], accumulation);
0185 }
0186
0187
0188
0189 for (size_t i = 0; i < weights.size(); i++) {
0190 Architecture_t::ScaleAdd(weights[i], currentLayerWeightUpdates[i], -1.0);
0191 }
0192 }
0195 template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
0196 auto TRMSProp<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
0197 const std::vector<Matrix_t> &biasGradients) -> void
0198 {
0199 std::vector<Matrix_t> ¤tLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
0200 std::vector<Matrix_t> ¤tLayerBiasUpdates = this->GetBiasUpdatesAt(layerIndex);
0201
0202 for (size_t k = 0; k < currentLayerPastSquaredBiasGradients.size(); k++) {
0203
0204
0205 auto &accumulation = fWorkBiasTensor1[layerIndex][k];
0206 auto ¤tSquaredBiasGradients = fWorkBiasTensor2[layerIndex][k];
0207
0208
0209 initialize<Architecture_t>(accumulation, EInitialization::kZero);
0210 Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[k]);
0211 Architecture_t::SquareElementWise(currentSquaredBiasGradients);
0212 Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[k], this->GetRho());
0213 Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
0214 Architecture_t::Copy(currentLayerPastSquaredBiasGradients[k], accumulation);
0215
0216
0217 initialize<Architecture_t>(accumulation, EInitialization::kZero);
0218 auto &dummy = fWorkBiasTensor2[layerIndex][k];
0219
0220 Architecture_t::Copy(dummy, currentLayerPastSquaredBiasGradients[k]);
0221 Architecture_t::ConstAdd(dummy, this->GetEpsilon());
0222 Architecture_t::SqrtElementWise(dummy);
0223 Architecture_t::ReciprocalElementWise(dummy);
0224 Architecture_t::Hadamard(dummy, biasGradients[k]);
0225
0226 Architecture_t::ScaleAdd(accumulation, currentLayerBiasUpdates[k], this->GetMomentum());
0227 Architecture_t::ScaleAdd(accumulation, dummy, this->GetLearningRate());
0228 Architecture_t::Copy(currentLayerBiasUpdates[k], accumulation);
0229 }
0230
0231
0232
0233 for (size_t i = 0; i < biases.size(); i++) {
0234 Architecture_t::ScaleAdd(biases[i], currentLayerBiasUpdates[i], -1.0);
0235 }
0236 }

} // namespace DNN
} // namespace TMVA

#endif