File indexing completed on 2025-01-18 10:10:53
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 #ifndef TMVA_DNN_ARCHITECTURES_CUDNN
0019 #define TMVA_DNN_ARCHITECTURES_CUDNN
0020
0021 #include "RConfigure.h" // for definition of R__HAS_CUDNN
0022
0023 #ifndef R__HAS_CUDNN
0024 #error This file can be compiled only when cudnn is available in ROOT
0025 #else
0026
0027 #include "cudnn.h"
0028
0029 #include "TMVA/DNN/Functions.h"
0030 #include "TMVA/DNN/CNN/ContextHandles.h"
0031
0032 #include "TMVA/DNN/BatchNormLayer.h"
0033 #include "TMVA/DNN/CNN/ConvLayer.h"
0034 #include "TMVA/DNN/CNN/MaxPoolLayer.h"
0035 #include "TMVA/DNN/RNN/RNNLayer.h"
0036 #include "TMVA/DNN/RNN/LSTMLayer.h"
0037 #include "TMVA/DNN/RNN/GRULayer.h"
0038
0039
0040 #include "Cuda/CudaBuffers.h"
0041 #include "Cuda/CudaTensor.h"
0042 #include "TMVA/DNN/TensorDataLoader.h"
0043 #include <utility>
0044 #include <vector>
0045 #include <string>
0046
0047 #include "TMVA/DNN/Architectures/Cuda.h"
0048
0049 class TRandom;
0050
0051 namespace TMVA
0052 {
0053 namespace DNN
0054 {
0055
0056 struct TCudnnEmptyDescriptor {};
0057
0058
0059
0060
0061
0062
0063
0064
0065
0066 template<typename AFloat = Float_t>
0067 class TCudnn
0068 {
0069 private:
0070 static TRandom * fgRandomGen;
0071 public:
0072
0073 using Scalar_t = AFloat;
0074 using Matrix_t = TCudaTensor<AFloat>;
0075 using Tensor_t = TCudaTensor<AFloat>;
0076 using DeviceBuffer_t = TCudaDeviceBuffer<AFloat>;
0077 using HostBuffer_t = TCudaHostBuffer<AFloat>;
0078
0079
0080 using ActivationDescriptor_t = cudnnActivationDescriptor_t;
0081 using ConvolutionDescriptor_t = cudnnConvolutionDescriptor_t;
0082 using DropoutDescriptor_t = cudnnDropoutDescriptor_t;
0083 using FilterDescriptor_t = cudnnFilterDescriptor_t;
0084
0085 using PoolingDescriptor_t = cudnnPoolingDescriptor_t;
0086
0087 using AlgorithmForward_t = cudnnConvolutionFwdAlgo_t;
0088 using AlgorithmBackward_t = cudnnConvolutionBwdDataAlgo_t;
0089 using AlgorithmHelper_t = cudnnConvolutionBwdFilterAlgo_t;
0090 using AlgorithmDataType_t = cudnnDataType_t;
0091 using ReduceTensorDescriptor_t = cudnnReduceTensorDescriptor_t;
0092 using TensorDescriptor_t = cudnnTensorDescriptor_t;
0093 using RecurrentDescriptor_t = cudnnRNNDescriptor_t;
0094 #if (CUDNN_VERSION >= 8000)
0095 using RNNDataDescriptor_t = cudnnRNNDataDescriptor_t;
0096 #else
0097 using RNNDataDescriptor_t = TCudnnEmptyDescriptor;
0098 #endif
0099 using EmptyDescriptor_t = TCudnnEmptyDescriptor;
0100
0101 using BNormLayer_t = TBatchNormLayer<TCudnn<AFloat>>;
0102 using BNormDescriptors_t = TDNNGenDescriptors<BNormLayer_t>;
0103
0104 using ConvLayer_t = CNN::TConvLayer<TCudnn<AFloat>>;
0105 using ConvDescriptors_t = CNN::TCNNDescriptors<ConvLayer_t>;
0106 using ConvWorkspace_t = CNN::TCNNWorkspace<ConvLayer_t>;
0107 using PoolingLayer_t = CNN::TMaxPoolLayer<TCudnn<AFloat>>;
0108 using PoolingDescriptors_t = CNN::TCNNDescriptors<PoolingLayer_t>;
0109 using PoolingWorkspace_t = CNN::TCNNWorkspace<PoolingLayer_t>;
0110
0111 using RNNLayer_t = RNN::TBasicRNNLayer<TCudnn<AFloat>>;
0112 using RNNDescriptors_t = RNN::TRNNDescriptors<TCudnn<AFloat>>;
0113 using RNNWorkspace_t = RNN::TRNNWorkspace<TCudnn<AFloat>>;
0114
0115 using LSTMLayer_t = RNN::TBasicLSTMLayer<TCudnn<AFloat>>;
0116
0117
0118
0119 using GRULayer_t = RNN::TBasicGRULayer<TCudnn<AFloat>>;
0120
0121
0122
0123
0124
0125
0126
0127
0128 struct CNNOptions {
0129
0130 static int ConvFwdAlgorithm;
0131 static int ConvBwdDataAlgorithm;
0132 static int ConvBwdFilterAlgorithm;
0133
0134 static Long_t ConvMaxWorkspaceSize;
0135 };
0136
0137 static TMVA::Experimental::MemoryLayout GetTensorLayout() { return TMVA::Experimental::MemoryLayout::RowMajor; }
0138
0139
0140 static Tensor_t CreateTensor(size_t n, size_t c, size_t h, size_t w) {
0141 return Tensor_t( {n,c,h,w}, GetTensorLayout(), 0, 0);
0142 }
0143
0144 static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t h, size_t w) {
0145 return Tensor_t( buffer, {n,c,h,w}, GetTensorLayout(), 0, 0);
0146 }
0147
0148 static Tensor_t CreateTensor(size_t n, size_t c, size_t w)
0149 {
0150 return Tensor_t({n, c, w}, GetTensorLayout(), 0, 0);
0151 }
0152
0153 static Tensor_t CreateTensor(DeviceBuffer_t buffer, size_t n, size_t c, size_t w)
0154 {
0155 return Tensor_t(buffer, {n, c, w}, GetTensorLayout(), 0, 0);
0156 }
0157
0158 static bool IsCudnn() { return true; }
0159
0160
0161
0162 static void CreateWeightTensors( std::vector<Matrix_t> & newWeights, const std::vector<Matrix_t> & weights) {
0163 if (!newWeights.empty()) newWeights.clear();
0164 size_t n = weights.size();
0165 for (size_t i = 0; i < n; ++i)
0166 newWeights.emplace_back( weights[i].GetShape(), weights[i].GetLayout(), 0, 0);
0167 }
0168
0169
0170
0171
0172
0173 static void InitializeBNormDescriptors(TDescriptors * & descriptors,
0174 BNormLayer_t *L = nullptr);
0175
0176 static void InitializeConvDescriptors(TDescriptors * & descriptors,
0177 ConvLayer_t *L = nullptr);
0178
0179 static void InitializePoolDescriptors(TDescriptors * & descriptors,
0180 PoolingLayer_t *L = nullptr);
0181
0182 static void InitializeRNNDescriptors(TDescriptors *&descriptors, RNNLayer_t *layer)
0183 {
0184 InitializeRecurrentDescriptors<RNNLayer_t>(descriptors, layer);
0185 }
0186 static void InitializeLSTMDescriptors(TDescriptors *&descriptors, LSTMLayer_t *layer) {
0187 InitializeRecurrentDescriptors<LSTMLayer_t>(descriptors, layer);
0188 }
0189 static void InitializeGRUDescriptors(TDescriptors *&descriptors, GRULayer_t *layer) {
0190 InitializeRecurrentDescriptors<GRULayer_t>(descriptors, layer);
0191 }
0192 template<typename RNNLayer>
0193 static void InitializeRecurrentDescriptors(TDescriptors *&descriptors, RNNLayer *L);
0194
0195
0196
0197 static void InitializeActivationDescriptor(ActivationDescriptor_t & descriptors, EActivationFunction activFunc, double coef = 0.0);
0198
0199 static void ReleaseConvDescriptors(TDescriptors * descriptors );
0200 static void ReleasePoolDescriptors(TDescriptors * descriptors );
0201 static void ReleaseRNNDescriptors(TDescriptors *descriptors);
0202 static void ReleaseBNormDescriptors(TDescriptors * descriptors );
0203 static void ReleaseDescriptor(EmptyDescriptor_t & emptyDescr) {}
0204 static void ReleaseDescriptor(ActivationDescriptor_t & activationDescr);
0205 static void ReleaseDescriptor(ConvolutionDescriptor_t & convolutionDescr);
0206 static void ReleaseDescriptor(DropoutDescriptor_t & dropoutDescr);
0207 static void ReleaseDescriptor(FilterDescriptor_t & filterDescr);
0208 static void ReleaseDescriptor(PoolingDescriptor_t & poolingDescr);
0209 static void ReleaseDescriptor(TensorDescriptor_t & tensorDescr);
0210
0211
0212 static void InitializeConvWorkspace(TWorkspace * & workspace,
0213 TDescriptors * & descriptors,
0214 const DNN::CNN::TConvParams & params,
0215 ConvLayer_t *L = nullptr);
0216 static void InitializePoolDropoutWorkspace(TWorkspace * & workspace,
0217 TDescriptors * & descriptors,
0218 const DNN::CNN::TConvParams & params,
0219 PoolingLayer_t *L = nullptr);
0220
0221 static void InitializeRNNWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, RNNLayer_t *layer)
0222 {
0223 InitializeRecurrentWorkspace<RNNLayer_t>(workspace, descriptors, layer);
0224 }
0225 static void InitializeLSTMWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, LSTMLayer_t *layer)
0226 {
0227 InitializeRecurrentWorkspace<LSTMLayer_t>(workspace, descriptors, layer);
0228 }
0229 static void InitializeGRUWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors, GRULayer_t *layer)
0230 {
0231 InitializeRecurrentWorkspace<GRULayer_t>(workspace, descriptors, layer);
0232 }
0233 template<typename RNNLayer>
0234 static void InitializeRecurrentWorkspace(TWorkspace *&workspace, TDescriptors *&descriptors,
0235 RNNLayer *layer);
0236
0237 static void FreeConvWorkspace(TWorkspace * workspace);
0238 static void FreePoolDropoutWorkspace(TWorkspace * workspace);
0239 static void FreeRNNWorkspace(TWorkspace *workspace);
0240
0241
0242 static void InitializeRNNTensors(RNNLayer_t *layer) { InitializeRecurrentTensors<RNNLayer_t>(layer); }
0243 static void InitializeLSTMTensors(LSTMLayer_t *layer) { InitializeRecurrentTensors<LSTMLayer_t>(layer); }
0244 static void InitializeGRUTensors(GRULayer_t *layer) { InitializeRecurrentTensors<GRULayer_t>(layer); }
0245 template <typename RNNLayer>
0246 static void InitializeRecurrentTensors(RNNLayer *layer);
0247
0248
0249
0250
0251
0252
0253
0254
0255
0256
0257
0258
0259
0260 static void MultiplyTranspose(Tensor_t &output, const Tensor_t &input, const Matrix_t &weights);
0261
0262
0263 static void AddRowWise(Tensor_t &output,const Matrix_t &biases);
0264
0265
0266
0267
0268
0269
0270
0271
0272
0273
0274
0275
0276
0277
0278 static void Backward(Tensor_t & activationGradientsBackward,
0279 Matrix_t & weightGradients,
0280 Matrix_t & biasGradients,
0281 Tensor_t & df,
0282 const Tensor_t & activationGradients,
0283 const Matrix_t & weights,
0284 const Tensor_t & activationBackward);
0285
0286
0287 static void ScaleAdd(Tensor_t & A, const Tensor_t & B,
0288 Scalar_t alpha = 1.0,
0289 Scalar_t beta = 1.0);
0290
0291
0292 static void Copy(Tensor_t & A, const Tensor_t & B);
0293
0294
0295 template<typename ATensor_t>
0296 static void CopyDiffArch(Tensor_t & A,
0297 const ATensor_t & B);
0298
0299 template <typename ATensor_t>
0300 static void CopyWeightsDiffArch(Tensor_t &A, const ATensor_t &B);
0301
0302
0303 static void CopyDiffArch(Tensor_t A, const Tensor_t & B ) { Copy(A,B); }
0304
0305
0306 template<typename AMatrix_t>
0307 static void CopyDiffArch(std::vector<Tensor_t> & A,
0308 const std::vector<AMatrix_t> & B);
0309
0310
0311
0312
0313
0314
0315
0316
0317
0318
0319
0320
0321
0322
0323 static void Identity(Tensor_t & X) {}
0324 static void IdentityDerivative(Tensor_t & dX, Tensor_t& X,
0325 Tensor_t & Y, Tensor_t & dY,
0326 ActivationDescriptor_t activationDescr,
0327 const AFloat alpha = 1,
0328 const AFloat beta = 1) {}
0329
0330 static void ActivationFunctionForward(Tensor_t & X, EActivationFunction activFunct,
0331 const ActivationDescriptor_t activationDescr,
0332 const double coef = 0.0, const AFloat alpha = 1,
0333 const AFloat beta = 0);
0334
0335
0336 static void ActivationFunctionForward(Tensor_t &Y, const Tensor_t & X, EActivationFunction activFunct,
0337 const ActivationDescriptor_t activationDescr, const double coef = 0.0,
0338 const AFloat alpha = 1, const AFloat beta = 0);
0339
0340
0341 static void ActivationFunctionBackward(Tensor_t & dX, const Tensor_t & Y,
0342 const Tensor_t & dY, const Tensor_t & X,
0343 EActivationFunction activFunct,
0344 const ActivationDescriptor_t activationDescr,
0345 const AFloat alpha = 1,
0346 const AFloat beta = 0);
0347
0348
0349
0350
0351
0352
0353
0354 static void Relu(Tensor_t &) {}
0355 static void Sigmoid(Tensor_t &) {}
0356 static void Tanh(Tensor_t &) {}
0357 static void FastTanh(Tensor_t &) {}
0358 static void SymmetricRelu(Tensor_t &) {}
0359 static void SoftSign(Tensor_t &) {}
0360 static void Gauss(Tensor_t &) {}
0361
0362 static void IdentityDerivative(Tensor_t &, const Tensor_t &) {}
0363 static void ReluDerivative(Tensor_t &, const Tensor_t &) {}
0364 static void SigmoidDerivative(Tensor_t &, const Tensor_t &) {}
0365 static void TanhDerivative(Tensor_t &, const Tensor_t &) {}
0366 static void FastTanhDerivative(Tensor_t &, const Tensor_t &) {}
0367 static void SymmetricReluDerivative(Tensor_t & , const Tensor_t & ) {}
0368 static void SoftSignDerivative(Tensor_t & , const Tensor_t & ) {}
0369 static void GaussDerivative(Tensor_t & , const Tensor_t & ) {}
0370
0371
0372
0373
0374
0375
0376
0377
0378
0379
0380
0381
0382
0383
0384
0385
0386 static Scalar_t MeanSquaredError(const Matrix_t &Y, const Matrix_t &output,
0387 const Matrix_t &weights);
0388 static void MeanSquaredErrorGradients(Matrix_t &dY, const Matrix_t &Y,
0389 const Matrix_t &output, const Matrix_t &weights);
0390
0391
0392
0393 static Scalar_t CrossEntropy(const Matrix_t &Y, const Matrix_t &output,
0394 const Matrix_t &weights);
0395
0396 static void CrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
0397 const Matrix_t &output, const Matrix_t &weights);
0398
0399
0400
0401 static Scalar_t SoftmaxCrossEntropy(const Matrix_t &Y, const Matrix_t &output,
0402 const Matrix_t &weights);
0403 static void SoftmaxCrossEntropyGradients(Matrix_t &dY, const Matrix_t &Y,
0404 const Matrix_t &output, const Matrix_t &weights);
0405
0406
0407
0408
0409
0410
0411
0412
0413
0414
0415
0416
0417
0418
0419
0420 static void Sigmoid(Matrix_t &YHat,
0421 const Matrix_t & );
0422 static void Softmax(Matrix_t &YHat,
0423 const Matrix_t & );
0424
0425
0426
0427
0428
0429
0430
0431
0432
0433
0434
0435
0436
0437
0438
0439 static void DropoutForward(Tensor_t & A,
0440 TDescriptors * descriptors,
0441 TWorkspace * workspace,
0442 Scalar_t p);
0443
0444 static void DropoutBackward(Tensor_t & A,
0445 TDescriptors * descriptors,
0446 TWorkspace * workspace);
0447
0448
0449
0450
0451
0452
0453
0454
0455
0456
0457
0458
0459
0460
0461
0462
0463
0464 static void BatchNormLayerForwardTraining(int axis, const Tensor_t &x, Tensor_t &y, Matrix_t &gamma, Matrix_t &beta,
0465 Matrix_t &mean, Matrix_t &, Matrix_t &iVariance, Matrix_t &runningMeans,
0466 Matrix_t &runningVars, Scalar_t nTrainedBatches, Scalar_t momentum,
0467 Scalar_t epsilon, const TensorDescriptor_t &bnParDescriptor);
0468
0469
0470
0471
0472 static void BatchNormLayerForwardInference(int axis, const Tensor_t &x, Matrix_t &gamma, Matrix_t &beta,
0473 Tensor_t &y, const Matrix_t &runningMeans,
0474 const Matrix_t &runningVars, Scalar_t epsilon,
0475 const TensorDescriptor_t &);
0476
0477 static void BatchNormLayerBackward(int axis, const Tensor_t &x, const Tensor_t &dy, Tensor_t &dx,
0478 Matrix_t &gamma,
0479 Matrix_t &dgamma, Matrix_t &dbeta, const Matrix_t &mean, const Matrix_t &variance,
0480 const Matrix_t &iVariance, Scalar_t epsilon, const TensorDescriptor_t &);
0481
0482
0483
0484
0485
0486
0487
0488
0489
0490
0491
0492
0493
0494
0495 static Scalar_t L1Regularization(const Matrix_t &W)
0496 {
0497 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
0498 return TCuda<AFloat>::L1Regularization(mW);
0499 }
0500 static void AddL1RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
0501 {
0502 TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
0503 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
0504 return TCuda<AFloat>::AddL1RegularizationGradients(mA, mW, weightDecay);
0505 }
0506
0507 static Scalar_t L2Regularization(const Matrix_t &W)
0508 {
0509 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
0510 return TCuda<AFloat>::L2Regularization(mW);
0511 }
0512 static void AddL2RegularizationGradients(Matrix_t &A, const Matrix_t &W, Scalar_t weightDecay)
0513 {
0514 TCudaMatrix<AFloat> mA(A.GetDeviceBuffer(), A.GetSize(), 1);
0515 TCudaMatrix<AFloat> mW(W.GetDeviceBuffer(), W.GetSize(), 1);
0516 return TCuda<AFloat>::AddL1RegularizationGradients(mA, mW, weightDecay);
0517 }
0518
0519
0520
0521
0522
0523
0524
0525
0526
0527
0528
0529
0530
0531
0532 static void InitializeGauss(Matrix_t &A);
0533 static void InitializeUniform(Matrix_t &A);
0534 static void InitializeIdentity(Matrix_t &A);
0535 static void InitializeZero(Matrix_t &A);
0536 static void InitializeGlorotNormal(Matrix_t &A);
0537 static void InitializeGlorotUniform(Matrix_t &A);
0538
0539
0540
0541 static TRandom &GetRandomGenerator();
0542
0543
0544 static void SetRandomSeed(size_t seed);
0545
0546
0547
0548
0549
0550
0551
0552
0553
0554
0555
0556
0557
0558 static void Dropout(Tensor_t &A, Scalar_t p) {}
0559
0560
0561
0562
0563
0564
0565
0566
0567
0568
0569
0570
0571
0572 static void AddConvBiases(Matrix_t &output, const Matrix_t &biases);
0573
0574
0575
0576 static void PrepareInternals(Tensor_t &) {}
0577
0578
0579 static void ConvLayerForward(Tensor_t &output,
0580 Tensor_t &inputActivationFunc,
0581 const Tensor_t &input, const Matrix_t &weights, const Matrix_t &biases,
0582 const DNN::CNN::TConvParams ¶ms, EActivationFunction activFunc,
0583 Tensor_t & , const ConvDescriptors_t &descriptors,
0584 ConvWorkspace_t &workspace);
0585
0586
0587
0588
0589
0590
0591
0592
0593
0594
0595
0596
0597
0598
0599
0600 static void ConvLayerBackward(Tensor_t &activationGradientsBackward, Matrix_t &weightGradients,
0601 Matrix_t &biasGradients, Tensor_t &inputActivation, Tensor_t &activationGradients,
0602 const Matrix_t &weights, const Tensor_t &activationBackward,
0603 const Tensor_t &outputTensor, EActivationFunction activFunc,
0604 const ConvDescriptors_t &descriptors, ConvWorkspace_t &workspace, size_t ,
0605 size_t , size_t , size_t , size_t ,
0606 size_t , size_t , size_t ,
0607 size_t , size_t );
0608
0609
0610
0611
0612
0613
0614
0615
0616
0617
0618
0619
0620
0621
0622 static void Downsample(Tensor_t &A, Tensor_t & , const Tensor_t &C, const PoolingDescriptors_t &descriptors,
0623 PoolingWorkspace_t &workspace, size_t imgHeight, size_t imgWidth, size_t fltHeight,
0624 size_t fltWidth, size_t strideRows, size_t strideCols);
0625
0626
0627
0628
0629
0630
0631
0632
0633
0634 static void MaxPoolLayerBackward(Tensor_t &activationGradientsBackward, const Tensor_t &activationGradients,
0635 const Tensor_t & , const Tensor_t &inputActivation,
0636 const Tensor_t &outputTensor, const PoolingDescriptors_t &descriptors,
0637 PoolingWorkspace_t &workspace, size_t imgHeight, size_t imgWidth, size_t fltHeight,
0638 size_t fltWidth, size_t strideRows, size_t strideCols, size_t nLocalViews);
0639
0640
0641
0642
0643
0644
0645
0646
0647
0648
0649
0650
0651
0652
0653
0654
0655 static void Flatten(Tensor_t &A, const Tensor_t &B);
0656
0657
0658
0659 static void Deflatten(Tensor_t &A, const Tensor_t &B);
0660
0661
0662 static void Rearrange(Tensor_t &out, const Tensor_t &in);
0663
0664
0665 static void RNNForward(const Tensor_t &x, const Tensor_t &hx, const Tensor_t &cx, const Tensor_t &weights,
0666 Tensor_t &y, Tensor_t &hy, Tensor_t &cy, const RNNDescriptors_t &descr,
0667 RNNWorkspace_t &workspace, bool isTraining);
0668
0669 static void RNNBackward(const Tensor_t &x, const Tensor_t &hx, const Tensor_t &cx, const Tensor_t &y, const Tensor_t &dy,
0670 const Tensor_t &dhy, const Tensor_t &dcy, const Tensor_t &weights, Tensor_t &dx, Tensor_t &dhx,
0671 Tensor_t &dcx, Tensor_t &dw, const RNNDescriptors_t &desc, RNNWorkspace_t &workspace);
0672
0673
0674
0675
0676 static Matrix_t &RecurrentLayerBackward(Matrix_t &state_gradients_backward,
0677 Matrix_t & ,
0678 Matrix_t & , Matrix_t & ,
0679 Matrix_t & ,
0680 const Matrix_t & ,
0681 const Matrix_t & ,
0682 const Matrix_t & ,
0683 const Matrix_t & ,
0684 Matrix_t & )
0685 {
0686 return state_gradients_backward;
0687 }
0688 static Matrix_t &LSTMLayerBackward(
0689 Matrix_t & state_gradients_backward , Matrix_t & ,
0690 Matrix_t & , Matrix_t & ,
0691 Matrix_t & , Matrix_t & ,
0692 Matrix_t & , Matrix_t & ,
0693 Matrix_t & ,
0694 Matrix_t & , Matrix_t & ,
0695 Matrix_t & , Matrix_t & ,
0696 Matrix_t & , Matrix_t & , Matrix_t & ,
0697 Matrix_t & , Matrix_t & ,
0698 const Matrix_t & , const Matrix_t & ,
0699 const Matrix_t & , const Matrix_t & ,
0700 const Matrix_t & , const Matrix_t & ,
0701 const Matrix_t & , const Matrix_t & ,
0702 const Matrix_t & , const Matrix_t & ,
0703 const Matrix_t & , const Matrix_t & ,
0704 const Matrix_t & , const Matrix_t & ,
0705 const Matrix_t & , Matrix_t & ,
0706 Matrix_t & , Matrix_t & )
0707 {
0708 return state_gradients_backward;
0709 }
0710
0711
0712 static Matrix_t &GRULayerBackward(
0713 Matrix_t & state_gradients_backward, Matrix_t & ,
0714 Matrix_t & , Matrix_t & ,
0715 Matrix_t & , Matrix_t & ,
0716 Matrix_t & , Matrix_t & ,
0717 Matrix_t & , Matrix_t & ,
0718 Matrix_t & , Matrix_t & , Matrix_t & ,
0719 const Matrix_t & , const Matrix_t & ,
0720 const Matrix_t & , const Matrix_t & ,
0721 const Matrix_t & , const Matrix_t & ,
0722 const Matrix_t & , const Matrix_t & ,
0723 const Matrix_t & , const Matrix_t & ,
0724 const Matrix_t & , Matrix_t & , bool)
0725 {
0726 return state_gradients_backward;
0727 }
0728
0729
0730
0731
0732
0733
0734
0735
0736
0737
0738
0739
0740
0741
0742
0743
0744
0745 static void Hadamard(Tensor_t &A, const Tensor_t &B)
0746 {
0747 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), 1, A.GetSize());
0748 TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), 1, B.GetSize());
0749 assert(A.GetSize() == B.GetSize());
0750 TCuda<AFloat>::Hadamard(tmpA, tmpB);
0751 }
0752
0753
0754
0755
0756
0757
0758
0759
0760
0761 static Scalar_t Sum(const Matrix_t &A, Scalar_t alpha = 1.0, Scalar_t beta = 0.0);
0762
0763
0764
0765
0766
0767
0768
0769 static void ConstAdd(Matrix_t &A, Scalar_t beta) {
0770 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
0771 TCuda<AFloat>::ConstAdd(tmp,beta);
0772 }
0773
0774
0775
0776
0777 static void ConstMult(Matrix_t &A, Scalar_t beta) {
0778 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
0779 TCuda<AFloat>::ConstMult(tmp,beta);
0780 }
0781
0782
0783
0784
0785 static void ReciprocalElementWise(Matrix_t &A) {
0786 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
0787 TCuda<AFloat>::ReciprocalElementWise(tmp);
0788 }
0789
0790
0791
0792
0793 static void SquareElementWise(Matrix_t &A) {
0794 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
0795 TCuda<AFloat>::SquareElementWise(tmp);
0796 }
0797
0798
0799
0800
0801
0802 static void SqrtElementWise(Matrix_t &A) {
0803 TCudaMatrix<AFloat> tmp(A.GetDeviceBuffer(), 1, A.GetSize());
0804 TCuda<AFloat>::SqrtElementWise(tmp);
0805 }
0806
0807
0808 static void AdamUpdate(Matrix_t & A, const Matrix_t & M, const Matrix_t & V, Scalar_t alpha, Scalar_t eps) {
0809 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
0810 TCudaMatrix<AFloat> tmpM(M.GetDeviceBuffer(), M.GetSize(),1);
0811 TCudaMatrix<AFloat> tmpV(V.GetDeviceBuffer(), V.GetSize(),1);
0812 TCuda<AFloat>::AdamUpdate(tmpA, tmpM, tmpV,alpha, eps);
0813 }
0814 static void AdamUpdateFirstMom(Matrix_t & A, const Matrix_t & B, Scalar_t beta) {
0815 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
0816 TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(),1);
0817 TCuda<AFloat>::AdamUpdateFirstMom(tmpA, tmpB, beta);
0818 }
0819 static void AdamUpdateSecondMom(Matrix_t & A, const Matrix_t & B, Scalar_t beta) {
0820 TCudaMatrix<AFloat> tmpA(A.GetDeviceBuffer(), A.GetSize(),1);
0821 TCudaMatrix<AFloat> tmpB(B.GetDeviceBuffer(), B.GetSize(),1);
0822 TCuda<AFloat>::AdamUpdateSecondMom(tmpA, tmpB, beta);
0823 }
0824
0825
0826 static void PrintTensor( const Tensor_t & A, const std::string name = "tensor", bool = true);
0827
0828 static void PrintTensor4dDescriptor(TensorDescriptor_t descriptor);
0829 static void PrintTensorNdDescriptor(TensorDescriptor_t descriptor, int n = 10);
0830
0831
0832
0833
0834
0835
0836
0837
0838 static void SumRows(Matrix_t &B, const Matrix_t &A);
0839 };
0840
0841
0842
0843 template <typename AFloat>
0844 template <typename ATensor>
0845 void TCudnn<AFloat>::CopyDiffArch(TCudaTensor<AFloat> &B,
0846 const ATensor &A)
0847 {
0848
0849
0850
0851
0852 if (B.GetLayout() == GetTensorLayout()) {
0853 if ( B.GetShape().size() == 4) {
0854 assert(B.GetShape().size() == 4);
0855 size_t firstSize = (A.GetLayout() == GetTensorLayout()) ? A.GetShape()[0] : A.GetShape().back();
0856 for (size_t i = 0; i < firstSize; ++i) {
0857 TMatrixT<AFloat> matIn = A.At(i).GetMatrix();
0858
0859 TCudaTensor<AFloat> tmpOut = B.At(i);
0860
0861 TCudaTensor<AFloat> tmpIn(matIn.GetMatrixArray(), tmpOut.GetShape(), tmpOut.GetLayout());
0862 Copy(tmpOut, tmpIn);
0863 }
0864 }
0865 else {
0866
0867 TMatrixT<AFloat> tmp = A;
0868 TCudaMatrix<AFloat> tmp2(tmp);
0869 TCudaTensor<AFloat> tA(tmp2);
0870 Copy(B, tA);
0871 }
0872 } else {
0873
0874 TMatrixT<AFloat> tmp = A;
0875 TCudaMatrix<AFloat> tmp2(tmp);
0876 TCudaTensor<AFloat> tA(tmp2);
0877 Copy(B, tA);
0878 }
0879 }
0880
0881
0882 template <typename AFloat>
0883 template <typename AMatrix>
0884 void TCudnn<AFloat>::CopyWeightsDiffArch(TCudaTensor<AFloat> &B, const AMatrix &A)
0885 {
0886
0887
0888 TMatrixT<AFloat> tmp = A;
0889
0890 if (B.GetLayout() == GetTensorLayout() ) {
0891
0892
0893 tmp.T();
0894 }
0895 TCudaMatrix<AFloat> tmp2(tmp);
0896 TCudaTensor<AFloat> tA(tmp2);
0897 Copy(B, tA);
0898 }
0899
0900
0901 template <typename AFloat>
0902 template <typename AMatrix_t>
0903 void TCudnn<AFloat>::CopyDiffArch(std::vector<Tensor_t> &B,
0904 const std::vector<AMatrix_t> &A)
0905 {
0906 for (size_t i = 0; i < B.size(); ++i) {
0907 CopyWeightsDiffArch(B[i], A[i]);
0908 }
0909 }
0910
0911 template <typename AFloat>
0912 void TCudnn<AFloat>::PrintTensor(const typename TCudnn<AFloat>::Tensor_t & A, const std::string name, bool truncate )
0913 {
0914 std::cout << name << " size = " << A.GetSize() << " shape = { ";
0915 auto shape = A.GetShape();
0916 for (size_t k = 0; k < shape.size()-1; ++k)
0917 std::cout << shape[k] << " , ";
0918 std::cout << shape.back() << " } ";
0919 std::cout << " strides = { ";
0920 auto strides = A.GetStrides();
0921 for (size_t k = 0; k < strides.size()-1; ++k)
0922 std::cout << strides[k] << " , ";
0923 std::cout << strides.back() << " }\n ";
0924 if (A.GetShape().size() == 1 ) {
0925 size_t n = A.GetShape()[0];
0926 if (truncate) n = std::min(n,size_t(10));
0927 for (size_t j = 0; j < n; ++j) {
0928 std::cout << A(0,j) << " ";
0929 }
0930 if (truncate && n < A.GetShape()[0]) std::cout << " ...... ";
0931 std::cout << " } " << std::endl;
0932 } else if (A.GetShape().size() == 2 ) {
0933 size_t n1 = A.GetShape()[0];
0934 size_t n2 = A.GetShape()[1];
0935 if (truncate) n1 = std::min(n1,size_t(10));
0936 for (size_t i = 0; i < n1; ++i) {
0937 std::cout << "{ ";
0938 if (truncate) n2 = std::min(n2,size_t(10));
0939 for (size_t j = 0; j < n2; ++j) {
0940 std::cout << A(i,j) << " ";
0941 }
0942 if (truncate && n2 < A.GetShape()[1]) std::cout << " ...... ";
0943 std::cout << " } " << std::endl;
0944 }
0945 if (truncate && n1 < A.GetShape()[0]) std::cout << " ...............\n";
0946 } else if (A.GetShape().size() == 3 ) {
0947 size_t n1 = A.GetFirstSize();
0948 size_t n2 = A.GetHSize();
0949 size_t n3 = A.GetWSize();
0950 if (truncate) n1 = std::min(n1,size_t(10));
0951 if (truncate) n2 = std::min(n2,size_t(10));
0952 if (truncate) n3 = std::min(n3,size_t(10));
0953 for (size_t i = 0; i < n1; ++i) {
0954 std::cout << "{ ";
0955 for (size_t j = 0; j < n2; ++j) {
0956 std::cout << "{ ";
0957 for (size_t k = 0; k < n3; ++k) {
0958 std::cout << A(i,j,k) << " ";
0959 }
0960 if (truncate && n3 < A.GetWSize()) std::cout << " ...... ";
0961 std::cout << " } " << std::endl;
0962 }
0963 if (truncate && n2 < A.GetHSize()) std::cout << ".................\n";
0964 std::cout << " } " << std::endl;
0965 }
0966 if (truncate && n1 < A.GetFirstSize()) std::cout << "...................\n";
0967 } else if (A.GetShape().size() == 4 ) {
0968 for (size_t i = 0; i < A.GetShape()[0]; ++i) {
0969 std::cout << "{ ";
0970 for (size_t j = 0; j < A.GetShape()[1]; ++j) {
0971 std::cout << "{ ";
0972 for (size_t k = 0; k < A.GetShape()[2]; ++k) {
0973 size_t n = A.GetShape()[3];
0974 if (truncate) n = std::min(n,size_t(10));
0975 for (size_t l = 0; l < n; ++l) {
0976 std::cout << A(i,j,k,l) << " ";
0977 }
0978 if (truncate && n < A.GetShape()[3]) std::cout << " ...... ";
0979 std::cout << " } " << std::endl;
0980 }
0981 std::cout << " } " << std::endl;
0982 }
0983 std::cout << " } " << std::endl;
0984 }
0985 }
0986 else {
0987 for (size_t l = 0; l < A.GetSize(); ++l) {
0988 std::cout << A.GetData()[l] << " ";
0989 }
0990 std::cout << "\n";
0991 }
0992 }
0993
0994 template <typename AFloat>
0995 void TCudnn<AFloat>::PrintTensor4dDescriptor(TensorDescriptor_t descriptor) {
0996 int n, c, h, w = 0;
0997 int s1, s2, s3, s4 = 0;
0998 cudnnDataType_t dataType;
0999 cudnnGetTensor4dDescriptor(descriptor, &dataType, &n, &c, &h, &w, &s1, &s2, &s3, &s4);
1000 std::cout << "Descriptor for 4d tensor of shape { " << n << " , " << c << " , " << h << " , " << w << " }"
1001 << " and strides { " << s1 << " , " << s2 << " , " << s3 << " , " << s4 << " }" << std::endl;
1002 }
1003 template <typename AFloat>
1004 void TCudnn<AFloat>::PrintTensorNdDescriptor(TensorDescriptor_t descriptor, int ndim)
1005 {
1006 int n = 0;
1007 std::vector<int> dims(ndim);
1008 std::vector<int> strides(ndim);
1009 cudnnDataType_t dataType;
1010 cudnnGetTensorNdDescriptor(descriptor, ndim, &dataType, &n, dims.data(), strides.data());
1011 dims.resize(n);
1012 strides.resize(n);
1013 std::cout << "Descriptor for Nd tensor of dim = " << n << " shape { ";
1014 for (auto d : dims)
1015 std::cout << d << " , ";
1016 std::cout << "} and strides { ";
1017 for (auto s : strides)
1018 std::cout << s << " , ";
1019 std::cout << " }" << std::endl;
1020 }
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034 template <typename AFloat>
1035 int TCudnn<AFloat>::CNNOptions::ConvFwdAlgorithm = -1;
1036 template <typename AFloat>
1037 int TCudnn<AFloat>::CNNOptions::ConvBwdDataAlgorithm = -1;
1038 template <typename AFloat>
1039 int TCudnn<AFloat>::CNNOptions::ConvBwdFilterAlgorithm = -1;
1040 template <typename AFloat>
1041 Long_t TCudnn<AFloat>::CNNOptions::ConvMaxWorkspaceSize = -1;
1042
1043 }
1044 }
1045
1046 #endif
1047 #endif