#ifndef TMVA_DNN_ADADELTA
#define TMVA_DNN_ADADELTA

#include "TMatrix.h"
#include "TMVA/DNN/Optimizer.h"
#include "TMVA/DNN/Functions.h"

#include <cassert>
#include <vector>

namespace TMVA {
namespace DNN {

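/** \class TAdadelta
 *  Adadelta Optimizer class
 *
 *  This class represents the Adadelta optimizer (Zeiler, "ADADELTA: An
 *  Adaptive Learning Rate Method", arXiv:1212.5701). It keeps two
 *  exponentially decaying averages per parameter: one of the squared
 *  gradients, E[g^2], and one of the squared parameter updates, E[dx^2].
 *  Each step computes (see UpdateWeights/UpdateBiases below)
 *
 *     E[g^2]_t  = rho * E[g^2]_{t-1}  + (1 - rho) * g_t^2
 *     dx_t      = sqrt(E[dx^2]_{t-1} + epsilon) / sqrt(E[g^2]_t + epsilon) * g_t
 *     E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * dx_t^2
 *
 *  and applies theta_t = theta_{t-1} - learningRate * dx_t. With the default
 *  learningRate = 1 this reduces to the canonical Adadelta update.
 */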
template <typename Architecture_t, typename Layer_t = VGeneralLayer<Architecture_t>,
          typename DeepNet_t = TDeepNet<Architecture_t, Layer_t>>
class TAdadelta : public VOptimizer<Architecture_t, Layer_t, DeepNet_t> {
public:
   using Matrix_t = typename Architecture_t::Matrix_t;
   using Scalar_t = typename Architecture_t::Scalar_t;

protected:
   Scalar_t fRho;     ///< Decay constant of the exponential moving averages.
   Scalar_t fEpsilon; ///< Offset added under the square roots for numerical stability.

   std::vector<std::vector<Matrix_t>> fPastSquaredWeightGradients; ///< Decaying average of squared past weight gradients, per layer.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasGradients;   ///< Decaying average of squared past bias gradients, per layer.
   std::vector<std::vector<Matrix_t>> fPastSquaredWeightUpdates;   ///< Decaying average of squared past weight updates, per layer.
   std::vector<std::vector<Matrix_t>> fPastSquaredBiasUpdates;     ///< Decaying average of squared past bias updates, per layer.

   std::vector<std::vector<Matrix_t>> fWorkWeightTensor1; ///< Scratch tensors used internally by the weight updates.
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor1;   ///< Scratch tensors used internally by the bias updates.
   std::vector<std::vector<Matrix_t>> fWorkWeightTensor2;
   std::vector<std::vector<Matrix_t>> fWorkBiasTensor2;

   /// Update the weights of the given layer from its weight gradients.
   void UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights, const std::vector<Matrix_t> &weightGradients);

   /// Update the biases of the given layer from its bias gradients.
   void UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases, const std::vector<Matrix_t> &biasGradients);

public:
   /// Constructor.
   TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate = 1.0, Scalar_t rho = 0.95, Scalar_t epsilon = 1e-8);

   /// Destructor.
   ~TAdadelta() = default;

   /// Getters.
   Scalar_t GetRho() const { return fRho; }
   Scalar_t GetEpsilon() const { return fEpsilon; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightGradients() { return fPastSquaredWeightGradients; }
   std::vector<Matrix_t> &GetPastSquaredWeightGradientsAt(size_t i) { return fPastSquaredWeightGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasGradients() { return fPastSquaredBiasGradients; }
   std::vector<Matrix_t> &GetPastSquaredBiasGradientsAt(size_t i) { return fPastSquaredBiasGradients[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredWeightUpdates() { return fPastSquaredWeightUpdates; }
   std::vector<Matrix_t> &GetPastSquaredWeightUpdatesAt(size_t i) { return fPastSquaredWeightUpdates[i]; }

   std::vector<std::vector<Matrix_t>> &GetPastSquaredBiasUpdates() { return fPastSquaredBiasUpdates; }
   std::vector<Matrix_t> &GetPastSquaredBiasUpdatesAt(size_t i) { return fPastSquaredBiasUpdates[i]; }
};
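
// A minimal usage sketch, under assumptions: `deepNet` is a fully built
// TDeepNet, the surrounding training loop fills the gradients via the
// backward pass, and `Architecture_t` is a concrete backend such as
// TMVA::DNN::TCpu<Double_t>.
//
//    TAdadelta<Architecture_t> optimizer(deepNet, /*learningRate=*/1.0,
//                                        /*rho=*/0.95, /*epsilon=*/1e-8);
//    // after each forward/backward pass over a batch:
//    optimizer.Step(); // inherited from VOptimizer; calls
//                      // UpdateWeights/UpdateBiases for every layer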
template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
TAdadelta<Architecture_t, Layer_t, DeepNet_t>::TAdadelta(DeepNet_t &deepNet, Scalar_t learningRate, Scalar_t rho,
                                                         Scalar_t epsilon)
   : VOptimizer<Architecture_t, Layer_t, DeepNet_t>(learningRate, deepNet), fRho(rho), fEpsilon(epsilon)
{
   std::vector<Layer_t *> &layers = deepNet.GetLayers();
   const size_t layersNSlices = layers.size();
   fPastSquaredWeightGradients.resize(layersNSlices);
   fPastSquaredBiasGradients.resize(layersNSlices);
   fPastSquaredWeightUpdates.resize(layersNSlices);
   fPastSquaredBiasUpdates.resize(layersNSlices);
   fWorkWeightTensor1.resize(layersNSlices);
   fWorkBiasTensor1.resize(layersNSlices);
   fWorkWeightTensor2.resize(layersNSlices);
   fWorkBiasTensor2.resize(layersNSlices);

   for (size_t i = 0; i < layersNSlices; i++) {
      // allocate the moving averages of squared gradients and squared updates
      // with the same shapes as the layer's weight matrices, and zero them
      const size_t weightsNSlices = (layers[i]->GetWeights()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredWeightGradients[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fPastSquaredWeightUpdates[i], layers[i]->GetWeights());

      for (size_t j = 0; j < weightsNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredWeightGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fPastSquaredWeightUpdates[i][j], EInitialization::kZero);
      }

      // same for the bias matrices
      const size_t biasesNSlices = (layers[i]->GetBiases()).size();

      Architecture_t::CreateWeightTensors(fPastSquaredBiasGradients[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fPastSquaredBiasUpdates[i], layers[i]->GetBiases());

      for (size_t j = 0; j < biasesNSlices; j++) {
         initialize<Architecture_t>(fPastSquaredBiasGradients[i][j], EInitialization::kZero);
         initialize<Architecture_t>(fPastSquaredBiasUpdates[i][j], EInitialization::kZero);
      }

      // scratch tensors reused by UpdateWeights/UpdateBiases to avoid
      // allocating temporaries at every optimization step
      Architecture_t::CreateWeightTensors(fWorkWeightTensor1[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor1[i], layers[i]->GetBiases());
      Architecture_t::CreateWeightTensors(fWorkWeightTensor2[i], layers[i]->GetWeights());
      Architecture_t::CreateWeightTensors(fWorkBiasTensor2[i], layers[i]->GetBiases());
   }
}

template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateWeights(size_t layerIndex, std::vector<Matrix_t> &weights,
                                                                  const std::vector<Matrix_t> &weightGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredWeightGradients = this->GetPastSquaredWeightGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerPastSquaredWeightUpdates = this->GetPastSquaredWeightUpdatesAt(layerIndex);

   const size_t weightsNSlices = weights.size();
   assert(currentLayerPastSquaredWeightGradients.size() == weightsNSlices);

   for (size_t i = 0; i < weightsNSlices; i++) {
      // work tensors reused across steps to avoid allocating temporaries
      auto &accumulation = fWorkWeightTensor1[layerIndex][i];
      auto &currentSquaredWeightGradients = fWorkWeightTensor2[layerIndex][i];

      // E[g^2]_t = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      Architecture_t::Copy(currentSquaredWeightGradients, weightGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredWeightGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightGradients[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightGradients[i], accumulation);

      // dummy1 = sqrt(E[dx^2]_{t-1} + epsilon); accumulation is no longer
      // needed at this point, so its storage is reused
      auto &dummy1 = fWorkWeightTensor1[layerIndex][i];
      Architecture_t::Copy(dummy1, currentLayerPastSquaredWeightUpdates[i]);
      Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy1);

      // dx_t = sqrt(E[dx^2]_{t-1} + epsilon) / sqrt(E[g^2]_t + epsilon) * g_t
      auto &currentWeightUpdates = fWorkWeightTensor2[layerIndex][i];
      Architecture_t::Copy(currentWeightUpdates, currentLayerPastSquaredWeightGradients[i]);
      Architecture_t::ConstAdd(currentWeightUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentWeightUpdates);
      Architecture_t::ReciprocalElementWise(currentWeightUpdates);
      Architecture_t::Hadamard(currentWeightUpdates, weightGradients[i]);
      Architecture_t::Hadamard(currentWeightUpdates, dummy1);

      // theta_t = theta_{t-1} - learningRate * dx_t
      Architecture_t::ScaleAdd(weights[i], currentWeightUpdates, -this->GetLearningRate());

      // E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * dx_t^2
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &currentSquaredWeightUpdates = fWorkWeightTensor2[layerIndex][i];
      Architecture_t::Copy(currentSquaredWeightUpdates, currentWeightUpdates);
      Architecture_t::SquareElementWise(currentSquaredWeightUpdates);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredWeightUpdates[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredWeightUpdates, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredWeightUpdates[i], accumulation);
   }
}
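
// UpdateBiases applies the same three recurrences to the bias matrices,
// using the bias-shaped moving averages and work tensors.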

template <typename Architecture_t, typename Layer_t, typename DeepNet_t>
auto TAdadelta<Architecture_t, Layer_t, DeepNet_t>::UpdateBiases(size_t layerIndex, std::vector<Matrix_t> &biases,
                                                                 const std::vector<Matrix_t> &biasGradients) -> void
{
   std::vector<Matrix_t> &currentLayerPastSquaredBiasGradients = this->GetPastSquaredBiasGradientsAt(layerIndex);
   std::vector<Matrix_t> &currentLayerPastSquaredBiasUpdates = this->GetPastSquaredBiasUpdatesAt(layerIndex);

   const size_t biasesNSlices = biases.size();
   assert(currentLayerPastSquaredBiasGradients.size() == biasesNSlices);

   for (size_t i = 0; i < biasesNSlices; i++) {
      // E[g^2]_t = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
      auto &accumulation = fWorkBiasTensor1[layerIndex][i];
      initialize<Architecture_t>(accumulation, EInitialization::kZero);

      auto &currentSquaredBiasGradients = fWorkBiasTensor2[layerIndex][i];
      Architecture_t::Copy(currentSquaredBiasGradients, biasGradients[i]);
      Architecture_t::SquareElementWise(currentSquaredBiasGradients);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasGradients[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasGradients, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasGradients[i], accumulation);

      // dummy1 = sqrt(E[dx^2]_{t-1} + epsilon); accumulation is no longer
      // needed at this point, so its storage is reused
      auto &dummy1 = fWorkBiasTensor1[layerIndex][i];
      Architecture_t::Copy(dummy1, currentLayerPastSquaredBiasUpdates[i]);
      Architecture_t::ConstAdd(dummy1, this->GetEpsilon());
      Architecture_t::SqrtElementWise(dummy1);

      // dx_t = sqrt(E[dx^2]_{t-1} + epsilon) / sqrt(E[g^2]_t + epsilon) * g_t
      auto &currentBiasUpdates = fWorkBiasTensor2[layerIndex][i];
      Architecture_t::Copy(currentBiasUpdates, currentLayerPastSquaredBiasGradients[i]);
      Architecture_t::ConstAdd(currentBiasUpdates, this->GetEpsilon());
      Architecture_t::SqrtElementWise(currentBiasUpdates);
      Architecture_t::ReciprocalElementWise(currentBiasUpdates);
      Architecture_t::Hadamard(currentBiasUpdates, biasGradients[i]);
      Architecture_t::Hadamard(currentBiasUpdates, dummy1);

      // theta_t = theta_{t-1} - learningRate * dx_t
      Architecture_t::ScaleAdd(biases[i], currentBiasUpdates, -this->GetLearningRate());

      // E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * dx_t^2
      initialize<Architecture_t>(accumulation, EInitialization::kZero);
      auto &currentSquaredBiasUpdates = fWorkBiasTensor2[layerIndex][i];
      Architecture_t::Copy(currentSquaredBiasUpdates, currentBiasUpdates);
      Architecture_t::SquareElementWise(currentSquaredBiasUpdates);
      Architecture_t::ScaleAdd(accumulation, currentLayerPastSquaredBiasUpdates[i], this->GetRho());
      Architecture_t::ScaleAdd(accumulation, currentSquaredBiasUpdates, 1 - (this->GetRho()));
      Architecture_t::Copy(currentLayerPastSquaredBiasUpdates[i], accumulation);
   }
}

} // namespace DNN
} // namespace TMVA

#endif