root/TMVA/ROperator_Conv.hxx

0001 #ifndef TMVA_SOFIE_ROPERATOR_CONV
0002 #define TMVA_SOFIE_ROPERATOR_CONV
0003
0004 #include "TMVA/SOFIE_common.hxx"
0005 #include "TMVA/ROperator.hxx"
0006 #include "TMVA/RModel.hxx"
0007
0008 #include <memory>
0009 #include <sstream>
0010 #include <algorithm>
0011 #include <stdexcept>
0012 #include <vector>
0013 #include <cassert>
0014
0015 namespace TMVA {
0016 namespace Experimental {
0017 namespace SOFIE {
0018
0019 template<typename T>
0020 class ROperator_Conv final : public ROperator
0021 {
0022 private:
0023    std::string fAttrAutopad;
0024    std::vector<size_t> fAttrDilations;
0025    size_t fAttrGroup;
0026    std::vector<size_t> fAttrKernelShape;
0027    std::vector<size_t> fAttrPads;
0028    std::vector<size_t> fAttrStrides;
0029
0030    std::string fNX;
0031    std::string fNW;
0032    std::string fNB;
0033    std::string fNB2; // bias tensor name after broadcasting
0034    std::string fNY;
0035
0036    std::string convK;
0037    std::string imcol;
0038
0039    std::vector<Dim> fShapeX;
0040    std::vector<size_t> fShapeW;
0041    std::vector<size_t> fShapeB;
0042    std::vector<Dim> fShapeY;
0043
0044    std::string fType;
0045
0046    size_t fDim;   // dimension of the convolution
0047
0048
0049 public:
0050
0051    ROperator_Conv() {}
0052
0053    ROperator_Conv(std::string autopad, std::vector<size_t> dilations,
0054       size_t group, std::vector<size_t> kernelShape, std::vector<size_t> pads,
0055       std::vector<size_t> strides, std::string nameX, std::string nameW,
0056       std::string nameB, std::string nameY):
0057       fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape),
0058       fAttrPads(pads), fAttrStrides(strides),
0059       fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)),
0060       fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY))
0061    {
0062       if(std::is_same<T, float>::value) {
0063          fType = "float";
0064       } else {
0065          throw
0066             std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator");
0067       }
0068       fInputTensorNames = { fNX, fNB };
0069       fOutputTensorNames = { fNY };
0070    }
0071
0072    ROperator_Conv(std::string autopad, std::vector<size_t> dilations,
0073       size_t group, std::vector<size_t> kernelShape, std::vector<size_t> pads,
0074       std::vector<size_t> strides, std::string nameX, std::string nameW,
0075       std::string nameY):
0076       fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape),
0077       fAttrPads(pads), fAttrStrides(strides),
0078       fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)), fNY(UTILITY::Clean_name(nameY))
0079    {
0080       if(std::is_same<T, float>::value) {
0081          fType = "float";
0082       } else {
0083          throw
0084             std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator");
0085       }
0086       fInputTensorNames = { fNX };
0087       fOutputTensorNames = { fNY };
0088    }
0089
0090    std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
0091       ETensorType out = input[0];
0092       return {out};
0093    }
0094
0095    // function returning output shape given input
0096    std::vector<Dim> DoShapeInference(const std::vector<Dim> & input, const std::vector<size_t> & weight) {
0097       // shape of convolution input has to be (according to ONNX): N x C x H x W
0098       // Where N : batch size, C : input  channels, H : input height, W : input width
0099
0100       if (input.size() -2 != fDim) {
0101          throw std::runtime_error("TMVA SOFIE Conv Op Shape inference - invalid input ");
0102       }
0103       if (weight.size() -2 != fDim) {
0104          throw std::runtime_error("TMVA SOFIE Conv Op Shape inference - invalid weights ");
0105       }
0106       if (fAttrGroup == 0 && input[1].isParam)
0107          throw std::runtime_error("TMVA SOFIE Conv - param shapes not supported without group attr");
0108       if (fAttrKernelShape.empty()) {
0109          if (input[2].isParam || (fDim > 1 && input[3].isParam) || (fDim > 2 && input[4].isParam))
0110             throw std::runtime_error("TMVA SOFIE Conv - param shapes not supported without kernel attr");
0111       }
0112
0113       if (fAttrGroup == 0) {
0114          fAttrGroup = input[1].dim / weight[1];
0115       }
0116
0117       // kernel shape
0118       size_t k1 = ((fAttrKernelShape.empty())? weight[2] : fAttrKernelShape[0]);
0119       size_t k2 = (fDim > 1) ? ((fAttrKernelShape.empty()) ? weight[3] : fAttrKernelShape[1]) : 1;
0120       size_t k3 = (fDim > 2) ? ((fAttrKernelShape.empty()) ? weight[4] : fAttrKernelShape[2]) : 1;
0121
0122
0123       size_t i1 = (fDim > 1) ? ((fDim > 2) ? 3 : 2) : 1;
0124       size_t i2 = (fDim > 2) ? 4 : 3;
0125       size_t i3 = 5;
0126
0127       if (fAttrDilations.empty()) {
0128          fAttrDilations = {1, 1, 1};
0129       }
0130       fAttrDilations.resize(3);
0131       if (fDim < 3) {
0132          fAttrDilations.resize(3, 1);
0133       }
0134       // Shape of the kernel
0135       fAttrKernelShape = {k1 + (fAttrDilations[0] - 1) * (k1 - 1),
0136                           k2 + (fAttrDilations[1] - 1) * (k2 - 1),
0137                           k3 + (fAttrDilations[2] - 1) * (k3 - 1)};
0138
0139       if (fAttrAutopad == "NOTSET") {
0140          if (fAttrPads.empty()) {
0141             fAttrPads = {1, 1, 1, 1, 1, 1};
0142          }
0143       } else if (fAttrAutopad == "SAME_UPPER" || fAttrAutopad == "SAME_LOWER") {
0144          if (fDim == 1)
0145             fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[0] / 2};
0146          else if (fDim == 2)
0147             fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2};
0148          else if (fDim == 3)
0149             fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[2] / 2,
0150                          fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[2] / 2};
0151          // add extra padding at beginning or end (depending if SAME_UPPER or SAME_LOWER)
0152          // need to check this!
0153          if (fAttrKernelShape[0] % 2 == 1) {
0154             (fAttrAutopad == "SAME_UPPER") ? fAttrPads[0]++ : fAttrPads[i1]++;
0155          }
0156          if (fDim > 1 && fAttrKernelShape[1] % 2 == 1) {
0157             (fAttrAutopad == "SAME_UPPER") ? fAttrPads[1]++ : fAttrPads[i2]++;
0158          }
0159          if (fDim > 2 && fAttrKernelShape[2] % 2 == 1) {
0160             (fAttrAutopad == "SAME_UPPER") ? fAttrPads[2]++ : fAttrPads[i3]++;
0161          }
0162       } else if (fAttrAutopad != "VALID") {
0163          throw
0164             std::runtime_error("TMVA SOFIE Conv Op invalid fAutopad");
0165       }
0166       // to be sure pad is vector of size 6
0167       if (fDim < 3) fAttrPads.resize(6, 0);
0168
0169       if (fAttrStrides.empty()) {
0170          fAttrStrides = {1, 1, 1};
0171       }
0172       if (fDim < 3)
0173          fAttrStrides.resize(3, 1);
0174
0175
0176       Dim input1 = input[2];
0177       Dim input2 = (fDim > 1) ? input[3] : Dim{1};
0178       Dim input3 = (fDim > 2) ? input[4] : Dim{1};
0179
0180       size_t pad1 = fAttrPads[0] + fAttrPads[i1];
0181
0182       // function to get output dimension of convolution given input
0183
0184       auto computeOutput = [&](Dim inputDim, size_t kernel, size_t pad, size_t stride) {
0185          if (!inputDim.isParam) {
0186             size_t outSize = (inputDim.dim + pad - kernel) / stride + 1;
0187             return  Dim{outSize};
0188          } else {
0189             if (stride == 1){
0190                if ((pad - kernel + 1) == 0 )
0191                   // output is same as input
0192                   return inputDim;
0193                else  {
0194                   int64_t v =  pad - kernel + 1;
0195                   std::string outStr = "(" + inputDim.param + "+" + std::to_string(v) + ")";
0196                   return Dim{ outStr, static_cast<size_t>(-1)};
0197                }
0198             } else { // general case (stride not 1)
0199                int64_t v =  pad - kernel;
0200                std::string outStr = "((" + inputDim.param + "+" + std::to_string(v) + ")/"
0201                                  + std::to_string(stride) + "1)";
0202                return Dim{ outStr, static_cast<size_t>(-1)};
0203             }
0204          }
0205          std::runtime_error("TMVA SOFIE Conv Op -  invalid values");
0206          return Dim{};
0207       };
0208
0209       Dim output1 = computeOutput(input1, fAttrKernelShape[0], pad1, fAttrStrides[0]);
0210
0211       Dim batch_size = input[0];        // first element in input tensor
0212       Dim output_channels = Dim{weight[0]};   // first element in weight tensor
0213
0214       std::vector<Dim> ret({ batch_size, output_channels, output1 });
0215
0216       if (fDim == 1)
0217          return ret;
0218
0219       size_t pad2 = fAttrPads[1] + fAttrPads[i2];
0220       Dim output2 = computeOutput(input2, fAttrKernelShape[1], pad2, fAttrStrides[1]);
0221
0222       // output is N x M x OH x OW
0223       ret.push_back(output2);
0224       if (fDim == 2)
0225          return ret;
0226
0227       size_t pad3 = fAttrPads[2] + fAttrPads[i3];
0228       Dim output3 = computeOutput(input3, fAttrKernelShape[2], pad3, fAttrStrides[2]);
0229
0230       // output is N x M x OH x OW x OD
0231       ret.push_back(output3);
0232       return ret;
0233    }
0234
0235    void Initialize(RModel& model) override {
0236       fUseSession = model.UseSession();
0237       if (!model.CheckIfTensorAlreadyExist(fNX)) {
0238          throw
0239             std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNX + " is not found in model");
0240       }
0241       fShapeX = model.GetDimTensorShape(fNX);
0242       if (fShapeX.size() < 3 || fShapeX.size()  > 5) {
0243          std::cout << fNX << " : " << ConvertShapeToString(fShapeX) << std::endl;
0244          throw
0245             std::runtime_error("TMVA SOFIE Conv Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions");
0246       }
0247       fDim = fShapeX.size() - 2;
0248       if (!model.CheckIfTensorAlreadyExist(fNW)) {
0249          throw
0250             std::runtime_error("TMVA SOFIE Conv op Input weight Tensor " + fNW + " is not found in model");
0251       }
0252       fShapeW = model.GetTensorShape(fNW);
0253       if (fShapeW.size() < 3 || fShapeW.size()  > 5) {
0254          std::cout << fNW << " : " << ConvertShapeToString(fShapeW) << std::endl;
0255          throw std::runtime_error("TMVA SOFIE Conv Op input weight tensor" + fNW + " is not of 3,4 or 5 dimensions");
0256       }
0257       fShapeY = DoShapeInference(fShapeX, fShapeW);
0258       model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
0259       if (fNB != "") {
0260          if (!model.CheckIfTensorAlreadyExist(fNB)) {
0261             throw
0262                std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNB + " is not found in model");
0263          }
0264          fShapeB = model.GetTensorShape(fNB);
0265          std::vector<Dim> targetShape(fShapeY.begin() + 1, fShapeY.end());
0266          auto shapeDimB = model.GetDimTensorShape(fNB);
0267          bool broadcast_needed = !UTILITY::AreSameShape(shapeDimB, targetShape);
0268          if (broadcast_needed) {
0269             auto original_data = model.GetInitializedTensorData(fNB);
0270             // make bias shape equal to Y shape by adding 1
0271             if (fShapeB.size() < 1)
0272                throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has empty shape");
0273             // we assume bias tensor dimension is equal to number of filters that is the second dimension in
0274             // the output tensor
0275             if (!(shapeDimB[0] == fShapeY[1]))
0276                throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has wrong shape: " +
0277                                            ConvertShapeToString(fShapeB));
0278             if (fType != "float")
0279                throw std::runtime_error("TMVA SOFIE Conv op: Broadcasting for non-float type tensors is not supported");
0280             // here is the actual broadcasting
0281             if (!fUseSession) {
0282                std::vector<size_t> shape(fDim + 1, 1);
0283                shape[0] = fShapeB[0];
0284                auto intTargetShape = ConvertShapeToInt(targetShape);
0285                std::shared_ptr<void> new_data_ptr(
0286                   UTILITY::UnidirectionalBroadcast<float>(static_cast<float *>(original_data.get()), shape, intTargetShape),
0287                   std::default_delete<float[]>());
0288                model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), intTargetShape, new_data_ptr);
0289                fShapeB = model.GetTensorShape(fNB);
0290                fNB2 = fNB;   // use same name
0291             }
0292             else {
0293                // In case of session add broadcasting code in Session constructor and in GenerateInitCode
0294                // we need to add a new intermediate tensor for broadcasted bias tensor
0295                fNB2 = fNB + "bcast";
0296                model.AddIntermediateTensor(fNB2, model.GetTensorType(fNB), targetShape);
0297             }
0298          }
0299       }
0300       // output channel size can be parametric
0301       std::vector<Dim> outputDims = std::vector<Dim>(fShapeY.begin()+2, fShapeY.end());
0302       auto outputChannelSize = ConvertDimShapeToLength(outputDims); // size/channel = D * H * W
0303       size_t kernelSize = fAttrKernelShape[0];
0304       for (size_t i = 1; i < fDim; i++) {
0305          kernelSize *= fAttrKernelShape[i];
0306       }
0307
0308       std::vector<size_t> shape1 = {fShapeW[0], fShapeW[1], kernelSize};
0309       std::vector<Dim> shape2 = {Dim{fShapeW[1]}, Dim{kernelSize}, Dim{outputChannelSize}};
0310       model.AddIntermediateTensor(fNX +"_f", ConvertStringToType(fType), shape1 );
0311       model.AddIntermediateTensor(fNX +"_xcol", ConvertStringToType(fType), shape2 );
0312       convK = fNX +"_f";
0313       imcol = fNX +"_xcol";
0314       fOutputTensorNames.emplace_back(convK);
0315       fOutputTensorNames.emplace_back(imcol);
0316       fInputTensorNames.emplace_back(convK);
0317       fInputTensorNames.emplace_back(imcol);
0318
0319       if (model.Verbose()) {
0320          std::cout << "Conv - " << fDim << "  " << fNX << " : " << ConvertShapeToString(fShapeX)
0321                   << " --> " << fNY << " : " << ConvertShapeToString(fShapeY) << std::endl;
0322       }
0323    }
0324
0325    std::string GenerateInitCode() override {
0326       std::stringstream out;
0327       // Generate initialization code for broadcasting of bias tensor
0328       if (!fNB2.empty()) {
0329          // include a separate scope to avoid defining unique operator temp variables
0330          std::vector<size_t> shape(fDim + 1, 1);
0331          shape[0] = fShapeB[0];
0332          std::vector<Dim> targetShape(fShapeY.begin() + 1, fShapeY.end());
0333          out << SP << "{\n";
0334          out << SP << SP << "float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<float>(tensor_"
0335              << fNB << ", " << ConvertShapeToString(shape) << ", " << ConvertShapeToString(fShapeY) << ");\n";
0336          out << SP << SP << "std::copy(data, data + " << ConvertDimShapeToLength(targetShape) << ", tensor_" << fNB2 << ");\n";
0337          out << SP << SP << "delete[] data;\n";
0338          out << SP << "}\n";
0339       }
0340       return out.str();
0341    }
0342
0343    std::string Generate(std::string OpName) override {
0344       OpName = "op_" + OpName;
0345
0346       if (fShapeX.empty() || fShapeW.empty() || (fNB != "" && fShapeB.empty()) || fShapeY.empty()) {
0347          throw
0348             std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first");
0349       }
0350
0351       std::stringstream out;
0352       auto bsize = fShapeX[0];
0353       size_t kDepth = (fDim > 2) ?  fShapeW[2] : 1;  // kernel depth
0354       size_t kHeight = (fDim > 1) ? fShapeW[fDim] : 1;  // kernel height
0355       size_t kWidth = fShapeW[fDim+1]; // kernel width
0356       auto iDepth = (fDim > 2) ?  fShapeX[2] : Dim{1};  // input depth
0357       auto iHeight = (fDim > 1) ? fShapeX[fDim] : Dim{1}; // input height
0358       auto iWidth = fShapeX[fDim+1]; // input width
0359       auto oDepth = (fDim > 2) ? fShapeY[2] : Dim{1}; // output depth
0360       auto oHeight = (fDim > 1) ? fShapeY[fDim] : Dim{1};  // ouput height
0361       auto oWidth = fShapeY[fDim+1]; // output width
0362       // total output size for a channel
0363       auto outputChannelStride = ConvertDimShapeToLength(std::vector<Dim>{oDepth, oHeight, oWidth}); // size of channel = D * H * W
0364       auto outputBatchStride =  ConvertDimShapeToLength(std::vector<Dim>{fShapeY[1] , oDepth, oHeight, oWidth}); // size of C * D * H * W
0365       // input size
0366       auto inputChannelStride = ConvertDimShapeToLength(std::vector<Dim>{iDepth, iHeight, iWidth});
0367       auto inputBatchStride =  ConvertDimShapeToLength(std::vector<Dim>{fShapeX[1] , iDepth, iHeight, iWidth}); // size of C * D * H * W
0368
0369       out << "\n//----  operator Conv " << OpName << "\n";
0370
0371       // vectorize the (dilated)convolution kernels into a matrix
0372       // no need to transpose the matrix
0373       // to fix for 1d and 3d
0374
0375       size_t id = (fDim > 2) ? fDim-3 : 2;
0376       size_t ih = (fDim > 1) ? fDim-2 : 1;
0377       size_t iw = fDim-1;
0378
0379       size_t wstrideDil = fAttrDilations[iw];
0380       size_t hstride = kWidth;
0381       size_t hstrideDil = fAttrDilations[ih] * fAttrKernelShape[iw];  // stride dilated in the height
0382       size_t dstride = kHeight * kWidth;
0383       size_t dstrideDil = fAttrDilations[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw];
0384       size_t icstride = kHeight * kWidth * kDepth;
0385       size_t icstrideDil = fAttrKernelShape[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw];
0386       size_t ocstride = fShapeW[1] * icstride;
0387       size_t ocstrideDil = fShapeW[1] * icstrideDil;
0388
0389       out << SP << "for (std::size_t oc = 0; oc < " << fShapeW[0] << "; oc++) {\n";
0390       out << SP << SP << "for (std::size_t ic = 0; ic < " << fShapeW[1] << "; ic++) {\n";
0391       if (fDim > 2)
0392          out << SP << SP << SP << "for (std::size_t kd = 0; kd < " << kDepth << "; kd++) {\n";
0393       if (fDim > 1)
0394          out << SP << SP << SP << "for (std::size_t kh = 0; kh < " << kHeight << "; kh++) {\n";
0395       out << SP << SP << SP << SP << "for (std::size_t kw = 0; kw < " << kWidth << "; kw++) {\n";
0396
0397       out << SP << SP << SP << SP << SP << "tensor_" <<fNX <<  "_f[oc * "
0398           << ocstrideDil << " + ic * " << icstrideDil;
0399       if (fDim > 2) out << " + kd * " << dstrideDil;
0400       if (fDim > 1) out << " + kh * " << hstrideDil;
0401       out << " + kw * " << wstrideDil  << "  ] = tensor_" << fNW << "[oc * " << ocstride << " + ic * " << icstride;
0402       if (fDim > 2) out << " + kd * " << dstride;
0403       if (fDim > 1) out << " + kh * " << hstride;
0404       out  << " + kw ];\n";
0405
0406       out << SP << SP << SP << SP << "}\n";
0407       if (fDim > 1) out << SP << SP << SP << "}\n";
0408       if (fDim > 2) out << SP << SP << SP << "}\n";
0409       out << SP << SP << "}\n";
0410       out << SP << "}\n";
0411
0412       //out << SP << "char " << OpName << "_transA = 'T';\n";
0413       out << SP << "char " << OpName << "_transA = 'N';\n";
0414       out << SP << "char " << OpName << "_transB = 'N';\n";
0415       out << SP << "int " << OpName << "_m = " << outputChannelStride << ";\n"; // output h*w
0416       assert(fShapeY[1] == fShapeW[0]);
0417       //assert(fShapeW[1] == fShapeX[1] / fAttrGroup);
0418       out << SP << "int " << OpName << "_n = " << fShapeW[0] << ";\n"; // output channels
0419       out << SP << "int " << OpName << "_k = " << fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] << ";\n";
0420       out << SP << "float " << OpName << "_alpha = 1.0;\n";
0421       out << SP << "float " << OpName << "_beta = 0.0;\n";
0422
0423
0424       // Loop on batch size
0425       out << SP << "for (size_t n = 0; n < " << bsize << "; n++) {\n";
0426
0427       // IM2COL: Unroll the input tensor
0428       // order input data as  (e.g. kernel 2x2)  and (xa,ya) is channel 1 and (xb,yb) is channel 2
0429       //   (xa1,..,xak,ya1,..yak)(xb1,...,xbk,yb1,..,ybk)
0430       //   (xa2,...xak+1,ya1,...yak)(......)
0431       // trick for speed is using caffe im2col and output a matrix which contains filtered values as rows.
0432       // By doing this one has consecutive memory reads and writes
0433       // Resulting matrix op_xcol is (input channels * filter_h * filter_w , output_h * output_w)
0434       if (fDim ==1) {
0435          if (fAttrPads[0] != fAttrPads[1] ) {
0436             std::cout << "TMVA SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding "
0437                       << std::endl;
0438             fAttrPads[0] = (fAttrPads[0] + fAttrPads[1]) / 2;
0439          }
0440          fAttrPads[1] = 0;
0441          fAttrStrides[1] = 1;
0442       }
0443       if (fDim == 2) {
0444          if (fAttrPads[0] != fAttrPads[2] || fAttrPads[1] != fAttrPads[3]) {
0445             std::cout << "TMVA SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding " << std::endl;
0446             fAttrPads[0] = (fAttrPads[0] + fAttrPads[2]) / 2;
0447             fAttrPads[1] = (fAttrPads[1] + fAttrPads[3]) / 2;
0448          }
0449       }
0450       if (fDim == 3) {
0451          if (fAttrPads[0] != fAttrPads[3] || fAttrPads[1] != fAttrPads[4] || fAttrPads[2] != fAttrPads[5]) {
0452             std::cout << "TMVA SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding " << std::endl;
0453             fAttrPads[0] = (fAttrPads[0] + fAttrPads[3]) / 2;
0454             fAttrPads[1] = (fAttrPads[1] + fAttrPads[4]) / 2;
0455             fAttrPads[2] = (fAttrPads[2] + fAttrPads[5]) / 2;
0456          }
0457       }
0458       out << SP << SP << "size_t out_offset = n * " << outputBatchStride  << ";\n";
0459
0460       if (fAttrGroup == 1) {
0461          out << SP << SP << "size_t x_offset = n * " << inputBatchStride << ";\n";
0462          // when using im2col - resulting matrix is transposed, the dimension is (input_c * filter_h * filter_y,  output_h *
0463          // output_w)
0464          if (fDim < 3) {
0465             out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::Im2col<float>(tensor_" << fNX
0466                 << " + x_offset,"
0467                 //  channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
0468                 //  dilation_w,
0469                 //
0470                 << fShapeW[1] << "," << iHeight << "," << iWidth << ",";
0471             if (fDim == 1)
0472                out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1,"
0473                    << fAttrDilations[0];
0474             else // dim ==2
0475                out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1]
0476                    << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << ","
0477                    << fAttrDilations[1];
0478             out << "," << "tensor_" <<fNX << "_xcol);\n\n ";
0479          } else {
0480             // 3d im2col
0481             out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
0482                 << " + x_offset,"
0483                 //  channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
0484                 //  dilation_d, dilation_h, dilation_w,
0485                 //
0486                 << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << ","
0487                 << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << ","
0488                 << fAttrPads[0] << "," << fAttrPads[1] << "," << fAttrPads[2] << ","
0489                 << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] << ","
0490                 << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << ","
0491                 << "tensor_" << fNX << "_xcol);\n\n ";
0492          }
0493          // BLAS
0494          out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
0495              << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, &" << OpName
0496              << "_m,\n"; // use m if op_xcol is not transpose , otherwise k
0497          out << SP << SP << SP << "tensor_" << fNX << "_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY
0498              << " + out_offset, &" << OpName << "_m);\n";
0499       } else {
0500          // case of group convolution
0501          // Unroll (IM2COL) the input tensor- make loop on groups and repeat operations (IM2COL + GEMM for each
0502          // group)
0503          // out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n";
0504          out << SP << SP << "for (size_t g = 0; g < " << fAttrGroup << "; g++) {\n";
0505          out << SP << SP << "size_t x_offset = n * " << inputBatchStride << " + g * "
0506              << fShapeW[1] << " * " << inputChannelStride << ";\n ";
0507          out << SP << SP << "size_t out_offset = n * " << outputBatchStride << " + g * "
0508              << fShapeW[0] << " * (" << outputChannelStride << ") / " << fAttrGroup << ";\n ";
0509
0510          if (fDim < 3) {
0511             out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::Im2col<float>(tensor_" << fNX
0512                 << " + x_offset,"
0513                 //  channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
0514                 //  dilation_w,
0515                 //
0516                 << fShapeW[1] << "," << iHeight << "," << iWidth << ",";
0517             if (fDim == 1)
0518                out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1,"
0519                    << fAttrDilations[0];
0520             else // dim ==2
0521                out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1]
0522                    << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << ","
0523                    << fAttrDilations[1];
0524             out << ", tensor_" << fNX << "_xcol);\n\n ";
0525          } else {
0526             // 3d im2col
0527             out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
0528                 << " + x_offset,"
0529                 //  channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
0530                 //  dilation_d, dilation_h, dilation_w,
0531                 //
0532                 << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << "," << fAttrKernelShape[0] << ","
0533                 << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," << fAttrPads[0] << "," << fAttrPads[1]
0534                 << "," << fAttrPads[2] << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2]
0535                 << "," << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << ",tensor_" << fNX
0536                 << "_xcol);\n\n ";
0537          }
0538
0539          // BLAS
0540          // n must be divided by the number of groups
0541          out << SP << SP << SP << OpName << "_n = " << fShapeW[0] / fAttrGroup << ";\n";
0542          // offset g must be  g * k * n
0543          out << SP << SP << SP << "size_t offset_f = g * "
0544              << fShapeW[0] * fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] / fAttrGroup
0545              << ";\n";
0546          out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
0547              << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, tensor_" << fNX << "_xcol, &" << OpName
0548              << "_m,\n"; // use m if op_xcol is not transpose , otherwise k
0549          out << SP << SP << SP << "tensor_" << fNX << "_f + offset_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY
0550              << " + out_offset"
0551              << ", &" << OpName << "_m);\n";
0552
0553          out << SP << SP << "}\n"; // end of group loop
0554       }
0555
0556       if (fNB2 != "") {
0557          out << SP << "int " << OpName << "_size = " << outputBatchStride << ";\n";
0558          out << SP << "float " << OpName << "_gamma = 1.0;\n";
0559          out << SP << "int " << OpName << "_incx = 1;\n";
0560          out << SP << "int " << OpName << "_incy = 1;\n";
0561
0562          out << SP << "BLAS::saxpy_(&" << OpName << "_size, &" << OpName << "_gamma, tensor_" << fNB2 << ", &"
0563              << OpName << "_incx, tensor_" << fNY << " + out_offset, &" << OpName << "_incy);\n";
0564
0565       }
0566       out << SP << "}\n"; // end of batch size loop
0567
0568       return out.str();
0569       }
0570
0571    /*! \brief Returns the blas routines needed to compile the generated code
0572     */
0573    std::vector<std::string> GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; }
0574 };
0575
0576 } // namespace SOFIE
0577 } // namespace Experimental
0578 } // namespace TMVA
0579
0580 #endif