Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-09-16 09:08:51

0001 #ifndef TMVA_SOFIE_ROPERATOR_CONV
0002 #define TMVA_SOFIE_ROPERATOR_CONV
0003 
0004 #include "TMVA/SOFIE_common.hxx"
0005 #include "TMVA/ROperator.hxx"
0006 #include "TMVA/RModel.hxx"
0007 
0008 #include <memory>
0009 #include <sstream>
0010 #include <algorithm>
0011 #include <stdexcept>
0012 #include <vector>
0013 #include <cassert>
0014 
0015 namespace TMVA {
0016 namespace Experimental {
0017 namespace SOFIE {
0018 
0019 template<typename T>
0020 class ROperator_Conv final : public ROperator
0021 {
0022 private:
0023    std::string fAttrAutopad;
0024    std::vector<size_t> fAttrDilations;
0025    size_t fAttrGroup;
0026    std::vector<size_t> fAttrKernelShape;
0027    std::vector<size_t> fAttrPads;
0028    std::vector<size_t> fAttrStrides;
0029 
0030    std::string fNX;
0031    std::string fNW;
0032    std::string fNB;
0033    std::string fNB2; // bias tensor name after broadcasting
0034    std::string fNY;
0035 
0036    std::string convK;
0037    std::string imcol;
0038 
0039    std::vector<size_t> fShapeX;
0040    std::vector<size_t> fShapeW;
0041    std::vector<size_t> fShapeB;
0042    std::vector<size_t> fShapeY;
0043 
0044    std::string fType;
0045 
0046    size_t fDim;   // dimension of the convolution
0047 
0048 
0049 public:
0050 
0051    ROperator_Conv() {}
0052 
0053    ROperator_Conv(std::string autopad, std::vector<size_t> dilations,
0054       size_t group, std::vector<size_t> kernelShape, std::vector<size_t> pads,
0055       std::vector<size_t> strides, std::string nameX, std::string nameW,
0056       std::string nameB, std::string nameY):
0057       fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape),
0058       fAttrPads(pads), fAttrStrides(strides),
0059       fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)),
0060       fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY))
0061    {
0062       if(std::is_same<T, float>::value) {
0063          fType = "float";
0064       } else {
0065          throw
0066             std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator");
0067       }
0068       fInputTensorNames = { fNX, fNB };
0069       fOutputTensorNames = { fNY };
0070    }
0071 
0072    ROperator_Conv(std::string autopad, std::vector<size_t> dilations,
0073       size_t group, std::vector<size_t> kernelShape, std::vector<size_t> pads,
0074       std::vector<size_t> strides, std::string nameX, std::string nameW,
0075       std::string nameY):
0076       fAttrAutopad(autopad), fAttrDilations(dilations), fAttrGroup(group), fAttrKernelShape(kernelShape),
0077       fAttrPads(pads), fAttrStrides(strides),
0078       fNX(UTILITY::Clean_name(nameX)), fNW(UTILITY::Clean_name(nameW)), fNY(UTILITY::Clean_name(nameY))
0079    {
0080       if(std::is_same<T, float>::value) {
0081          fType = "float";
0082       } else {
0083          throw
0084             std::runtime_error("TMVA SOFIE Encountered unsupported type parsing a Conv operator");
0085       }
0086       fInputTensorNames = { fNX };
0087       fOutputTensorNames = { fNY };
0088    }
0089 
0090    std::vector<ETensorType> TypeInference(std::vector<ETensorType> input) override {
0091       ETensorType out = input[0];
0092       return {out};
0093    }
0094 
0095    // function returning output shape given input
0096    std::vector<std::vector<size_t>> ShapeInference(std::vector<std::vector<size_t>> input) override {
0097       // shape of convolution input has to be (according to ONNX): N x C x H x W
0098       // Where N : batch size, C : input  channels, H : input height, W : input width
0099 
0100       if (input.size() > 3 ) {
0101          throw
0102             std::runtime_error("TMVA SOFIE Conv Op Shape inference need 2 or 3 input tensors");
0103       }
0104       for(size_t i = 0; i < input.size(); i++) {
0105          if (input[i].size() -2 != fDim) {
0106             throw
0107                std::runtime_error("TMVA SOFIE Conv Op Shape inference - invalid inputs ");
0108          }
0109       }
0110 
0111       if (fAttrGroup == 0) {
0112          fAttrGroup = input[0][1] / input[1][1];
0113       }
0114 
0115       // kernel shape
0116       size_t k1 = ((fAttrKernelShape.empty())? input[1][2] : fAttrKernelShape[0]);
0117       size_t k2 = (fDim > 1) ? ((fAttrKernelShape.empty()) ? input[1][3] : fAttrKernelShape[1]) : 1;
0118       size_t k3 = (fDim > 2) ? ((fAttrKernelShape.empty()) ? input[1][4] : fAttrKernelShape[2]) : 1;
0119 
0120 
0121       size_t i1 = (fDim > 1) ? ((fDim > 2) ? 3 : 2) : 1;
0122       size_t i2 = (fDim > 2) ? 4 : 3;
0123       size_t i3 = 5;
0124 
0125       if (fAttrDilations.empty()) {
0126          fAttrDilations = {1, 1, 1};
0127       }
0128       fAttrDilations.resize(3);
0129       if (fDim < 3) {
0130          fAttrDilations.resize(3, 1);
0131       }
0132       // Shape of the kernel
0133       fAttrKernelShape = {k1 + (fAttrDilations[0] - 1) * (k1 - 1),
0134                           k2 + (fAttrDilations[1] - 1) * (k2 - 1),
0135                           k3 + (fAttrDilations[2] - 1) * (k3 - 1)};
0136 
0137       if (fAttrAutopad == "NOTSET") {
0138          if (fAttrPads.empty()) {
0139             fAttrPads = {1, 1, 1, 1, 1, 1};
0140          }
0141       } else if (fAttrAutopad == "SAME_UPPER" || fAttrAutopad == "SAME_LOWER") {
0142          if (fDim == 1)
0143             fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[0] / 2};
0144          else if (fDim == 2)
0145             fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2};
0146          else if (fDim == 3)
0147             fAttrPads = {fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[2] / 2,
0148                          fAttrKernelShape[0] / 2, fAttrKernelShape[1] / 2, fAttrKernelShape[2] / 2};
0149          // add extra padding at beginning or end (depending if SAME_UPPER or SAME_LOWER)
0150          // need to check this!
0151          if (fAttrKernelShape[0] % 2 == 1) {
0152             (fAttrAutopad == "SAME_UPPER") ? fAttrPads[0]++ : fAttrPads[i1]++;
0153          }
0154          if (fDim > 1 && fAttrKernelShape[1] % 2 == 1) {
0155             (fAttrAutopad == "SAME_UPPER") ? fAttrPads[1]++ : fAttrPads[i2]++;
0156          }
0157          if (fDim > 2 && fAttrKernelShape[2] % 2 == 1) {
0158             (fAttrAutopad == "SAME_UPPER") ? fAttrPads[2]++ : fAttrPads[i3]++;
0159          }
0160       } else if (fAttrAutopad != "VALID") {
0161          throw
0162             std::runtime_error("TMVA SOFIE Conv Op invalid fAutopad");
0163       }
0164       // to be sure pad is vector of size 6
0165       if (fDim < 3) fAttrPads.resize(6, 0);
0166 
0167       if (fAttrStrides.empty()) {
0168          fAttrStrides = {1, 1, 1};
0169       }
0170       if (fDim < 3)
0171          fAttrStrides.resize(3, 1);
0172 
0173 
0174       size_t input1 = input[0][2];
0175       size_t input2 = (fDim > 1) ? input[0][3] : 1;
0176       size_t input3 = (fDim > 2) ? input[0][4] : 1;
0177 
0178       size_t pad1 = fAttrPads[0] + fAttrPads[i1];
0179       size_t output1 = (input1 + pad1 - fAttrKernelShape[0]) / fAttrStrides[0] + 1;
0180 
0181       size_t batch_size = input[0][0];        // first element in input tensor
0182       size_t output_channels = input[1][0];   // first element in weight tensor
0183 
0184       std::vector<std::vector<size_t>> ret({{ batch_size, output_channels, output1 }});
0185 
0186       if (fDim == 1)
0187          return ret;
0188 
0189       size_t pad2 = fAttrPads[1] + fAttrPads[i2];
0190       size_t output2 = (input2 + pad2 - fAttrKernelShape[1]) / fAttrStrides[1] + 1;
0191       // output is N x M x OH x OW
0192       ret[0].push_back(output2);
0193       if (fDim == 2)
0194          return ret;
0195 
0196       size_t pad3 = fAttrPads[2] + fAttrPads[i3];
0197       size_t output3 = (input3 + pad3 - fAttrKernelShape[2] ) / fAttrStrides[2] + 1;
0198 
0199       // output is N x M x OH x OW x OD
0200       ret[0].push_back(output3);
0201       return ret;
0202    }
0203 
0204    void Initialize(RModel& model) override {
0205       fUseSession = model.UseSession();
0206       if (!model.CheckIfTensorAlreadyExist(fNX)) {
0207          throw
0208             std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNX + " is not found in model");
0209       }
0210       fShapeX = model.GetTensorShape(fNX);
0211       if (fShapeX.size() < 3 || fShapeX.size()  > 5) {
0212          std::cout << fNX << " : " << ConvertShapeToString(fShapeX) << std::endl;
0213          throw
0214             std::runtime_error("TMVA SOFIE Conv Op input data tensor" + fNX + " is not of 3,4 or 5 dimensions");
0215       }
0216       fDim = fShapeX.size() - 2;
0217       if (!model.CheckIfTensorAlreadyExist(fNW)) {
0218          throw
0219             std::runtime_error("TMVA SOFIE Conv op Input weight Tensor " + fNW + " is not found in model");
0220       }
0221       fShapeW = model.GetTensorShape(fNW);
0222       if (fShapeW.size() < 3 || fShapeW.size()  > 5) {
0223          std::cout << fNW << " : " << ConvertShapeToString(fShapeW) << std::endl;
0224          throw std::runtime_error("TMVA SOFIE Conv Op input weight tensor" + fNW + " is not of 3,4 or 5 dimensions");
0225       }
0226       fShapeY = ShapeInference({fShapeX, fShapeW})[0];
0227       model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
0228       if (fNB != "") {
0229          if (!model.CheckIfTensorAlreadyExist(fNB)) {
0230             throw
0231                std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNB + " is not found in model");
0232          }
0233          fShapeB = model.GetTensorShape(fNB);
0234          std::vector<size_t> targetShape(fShapeY.begin() + 1, fShapeY.end());
0235          bool broadcast_needed = !UTILITY::AreSameShape(fShapeB, targetShape);
0236          if (broadcast_needed) {
0237             auto original_data = model.GetInitializedTensorData(fNB);
0238             // make bias shape equal to Y shape by adding 1
0239             if (fShapeB.size() < 1)
0240                throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has empty shape");
0241             // we assume bias tensor dimension is equal to number of filters that is the second dimension in
0242             // the output tensor
0243             if (fShapeB[0] != fShapeY[1])
0244                throw std::runtime_error("TMVA SOFIE Conv op: Bias Tensor has wrong shape: " +
0245                                            ConvertShapeToString(fShapeB));
0246             if (fType != "float")
0247                throw std::runtime_error("TMVA SOFIE Conv op: Broadcasting for non-float type tensors is not supported");
0248             // here is the actual broadcasting
0249             if (!fUseSession) {
0250                std::vector<size_t> shape(fDim + 1, 1);
0251                shape[0] = fShapeB[0];
0252                std::shared_ptr<void> new_data_ptr(
0253                   UTILITY::UnidirectionalBroadcast<float>(static_cast<float *>(original_data.get()), shape, targetShape),
0254                   std::default_delete<float[]>());
0255                model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), targetShape, new_data_ptr);
0256                fShapeB = model.GetTensorShape(fNB);
0257                fNB2 = fNB;   // use same name
0258             }
0259             else {
0260                // In case of session add broadcasting code in Session constructor and in GenerateInitCode
0261                // we need to add a new intermediate tensor for broadcasted bias tensor
0262                fNB2 = fNB + "bcast";
0263                model.AddIntermediateTensor(fNB2, model.GetTensorType(fNB), targetShape);
0264             }
0265          }
0266       }
0267 
0268       size_t outputChannelSize = fShapeY[2];  // size/channel = D * H * W
0269       size_t kernelSize = fAttrKernelShape[0];
0270       for (size_t i = 1; i < fDim; i++) {
0271          outputChannelSize *= fShapeY[2 + i];
0272          kernelSize *= fAttrKernelShape[i];
0273       }
0274 
0275       std::vector<size_t> shape1 = {fShapeW[0], fShapeW[1], kernelSize};
0276       std::vector<size_t> shape2 = {fShapeW[1], kernelSize, outputChannelSize};
0277       model.AddIntermediateTensor(fNX +"_f", ConvertStringToType(fType), shape1 );
0278       model.AddIntermediateTensor(fNX +"_xcol", ConvertStringToType(fType), shape2 );
0279       convK = fNX +"_f";
0280       imcol = fNX +"_xcol";
0281       fOutputTensorNames.emplace_back(convK);
0282       fOutputTensorNames.emplace_back(imcol);
0283    }
0284 
0285    std::string GenerateInitCode() override {
0286       std::stringstream out;
0287       // Generate initialization code for broadcasting of bias tensor
0288       if (!fNB2.empty()) {
0289          // include a separate scope to avoid defining unique operator temp variables
0290          std::vector<size_t> shape(fDim + 1, 1);
0291          shape[0] = fShapeB[0];
0292          std::vector<size_t> targetShape(fShapeY.begin() + 1, fShapeY.end());
0293          out << SP << "{\n";
0294          out << SP << SP << "float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<float>(tensor_"
0295              << fNB << ", " << ConvertShapeToString(shape) << ", " << ConvertShapeToString(fShapeY) << ");\n";
0296          out << SP << SP << "std::copy(data, data + " << ConvertShapeToLength(targetShape) << ", tensor_" << fNB2 << ");\n";
0297          out << SP << SP << "delete[] data;\n";
0298          out << SP << "}\n";
0299       }
0300       return out.str();
0301    }
0302 
0303    std::string Generate(std::string OpName) override {
0304       OpName = "op_" + OpName;
0305 
0306       if (fShapeX.empty() || fShapeW.empty() || (fNB != "" && fShapeB.empty()) || fShapeY.empty()) {
0307          throw
0308             std::runtime_error("TMVA SOFIE Conv Op called to Generate without being initialized first");
0309       }
0310 
0311       std::stringstream out;
0312       size_t bsize = fShapeX[0];
0313       size_t kDepth = (fDim > 2) ?  fShapeW[2] : 1;  // kernel depth
0314       size_t kHeight = (fDim > 1) ? fShapeW[fDim] : 1;  // kernel height
0315       size_t kWidth = fShapeW[fDim+1]; // kernel width
0316       size_t iDepth = (fDim > 2) ?  fShapeX[2] : 1;  // input depth
0317       size_t iHeight = (fDim > 1) ? fShapeX[fDim] : 1; // input height
0318       size_t iWidth = fShapeX[fDim+1]; // input width
0319       size_t oDepth = (fDim > 2) ? fShapeY[2] : 1; // output depth
0320       size_t oHeight = (fDim > 1) ? fShapeY[fDim] : 1;  // ouput height
0321       size_t oWidth = fShapeY[fDim+1]; // output width
0322 
0323       out << "\n//----  operator Conv " << OpName << "\n";
0324 
0325       // vectorize the (dilated)convolution kernels into a matrix
0326       // no need to transpose the matrix
0327       // to fix for 1d and 3d
0328 
0329       size_t id = (fDim > 2) ? fDim-3 : 2;
0330       size_t ih = (fDim > 1) ? fDim-2 : 1;
0331       size_t iw = fDim-1;
0332 
0333       size_t wstrideDil = fAttrDilations[iw];
0334       size_t hstride = kWidth;
0335       size_t hstrideDil = fAttrDilations[ih] * fAttrKernelShape[iw];  // stride dilated in the height
0336       size_t dstride = kHeight * kWidth;
0337       size_t dstrideDil = fAttrDilations[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw];
0338       size_t icstride = kHeight * kWidth * kDepth;
0339       size_t icstrideDil = fAttrKernelShape[id] * fAttrKernelShape[ih] * fAttrKernelShape[iw];
0340       size_t ocstride = fShapeW[1] * icstride;
0341       size_t ocstrideDil = fShapeW[1] * icstrideDil;
0342 
0343       out << SP << "for (std::size_t oc = 0; oc < " << fShapeW[0] << "; oc++) {\n";
0344       out << SP << SP << "for (std::size_t ic = 0; ic < " << fShapeW[1] << "; ic++) {\n";
0345       if (fDim > 2)
0346          out << SP << SP << SP << "for (std::size_t kd = 0; kd < " << kDepth << "; kd++) {\n";
0347       if (fDim > 1)
0348          out << SP << SP << SP << "for (std::size_t kh = 0; kh < " << kHeight << "; kh++) {\n";
0349       out << SP << SP << SP << SP << "for (std::size_t kw = 0; kw < " << kWidth << "; kw++) {\n";
0350 
0351       out << SP << SP << SP << SP << SP << "tensor_" <<fNX <<  "_f[oc * "
0352           << ocstrideDil << " + ic * " << icstrideDil;
0353       if (fDim > 2) out << " + kd * " << dstrideDil;
0354       if (fDim > 1) out << " + kh * " << hstrideDil;
0355       out << " + kw * " << wstrideDil  << "  ] = tensor_" << fNW << "[oc * " << ocstride << " + ic * " << icstride;
0356       if (fDim > 2) out << " + kd * " << dstride;
0357       if (fDim > 1) out << " + kh * " << hstride;
0358       out  << " + kw ];\n";
0359 
0360       out << SP << SP << SP << SP << "}\n";
0361       if (fDim > 1) out << SP << SP << SP << "}\n";
0362       if (fDim > 2) out << SP << SP << SP << "}\n";
0363       out << SP << SP << "}\n";
0364       out << SP << "}\n";
0365 
0366       //out << SP << "char " << OpName << "_transA = 'T';\n";
0367       out << SP << "char " << OpName << "_transA = 'N';\n";
0368       out << SP << "char " << OpName << "_transB = 'N';\n";
0369       out << SP << "int " << OpName << "_m = " << oHeight * oWidth * oDepth << ";\n"; // output h*w
0370       assert(fShapeY[1] == fShapeW[0]);
0371       assert(fShapeW[1] == fShapeX[1] / fAttrGroup);
0372       out << SP << "int " << OpName << "_n = " << fShapeW[0] << ";\n"; // output channels
0373       out << SP << "int " << OpName << "_k = " << fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] << ";\n";
0374       out << SP << "float " << OpName << "_alpha = 1.0;\n";
0375       out << SP << "float " << OpName << "_beta = 0.0;\n";
0376 
0377 
0378       // Loop on batch size
0379       out << SP << "for (size_t n = 0; n < " << bsize << "; n++) {\n";
0380 
0381       // IM2COL: Unroll the input tensor
0382       // order input data as  (e.g. kernel 2x2)  and (xa,ya) is channel 1 and (xb,yb) is channel 2
0383       //   (xa1,..,xak,ya1,..yak)(xb1,...,xbk,yb1,..,ybk)
0384       //   (xa2,...xak+1,ya1,...yak)(......)
0385       // trick for speed is using caffe im2col and output a matrix which contains filtered values as rows.
0386       // By doing this one has consecutive memory reads and writes
0387       // Resulting matrix op_xcol is (input channels * filter_h * filter_w , output_h * output_w)
0388       if (fDim ==1) {
0389          if (fAttrPads[0] != fAttrPads[1] ) {
0390             std::cout << "TMVA SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding "
0391                       << std::endl;
0392             fAttrPads[0] = (fAttrPads[0] + fAttrPads[1]) / 2;
0393          }
0394          fAttrPads[1] = 0;
0395          fAttrStrides[1] = 1;
0396       }
0397       if (fDim == 2) {
0398          if (fAttrPads[0] != fAttrPads[2] || fAttrPads[1] != fAttrPads[3]) {
0399             std::cout << "TMVA SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding " << std::endl;
0400             fAttrPads[0] = (fAttrPads[0] + fAttrPads[2]) / 2;
0401             fAttrPads[1] = (fAttrPads[1] + fAttrPads[3]) / 2;
0402          }
0403       }
0404       if (fDim == 3) {
0405          if (fAttrPads[0] != fAttrPads[3] || fAttrPads[1] != fAttrPads[4] || fAttrPads[2] != fAttrPads[5]) {
0406             std::cout << "TMVA SOFIE Operator Conv:  asymmetric padding not supported. Assume an average padding " << std::endl;
0407             fAttrPads[0] = (fAttrPads[0] + fAttrPads[3]) / 2;
0408             fAttrPads[1] = (fAttrPads[1] + fAttrPads[4]) / 2;
0409             fAttrPads[2] = (fAttrPads[2] + fAttrPads[5]) / 2;
0410          }
0411       }
0412       out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n";
0413 
0414       if (fAttrGroup == 1) {
0415          out << SP << SP << "size_t x_offset = n * " << fShapeX[1] * iHeight * iWidth << ";\n";
0416          // when using im2col - resulting matrix is transposed, the dimension is (input_c * filter_h * filter_y,  output_h *
0417          // output_w)
0418          if (fDim < 3) {
0419             out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::Im2col<float>(tensor_" << fNX
0420                 << " + x_offset,"
0421                 //  channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
0422                 //  dilation_w,
0423                 //
0424                 << fShapeW[1] << "," << iHeight << "," << iWidth << ",";
0425             if (fDim == 1)
0426                out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1,"
0427                    << fAttrDilations[0];
0428             else // dim ==2
0429                out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1]
0430                    << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << ","
0431                    << fAttrDilations[1];
0432             out << "," << "tensor_" <<fNX << "_xcol);\n\n ";
0433          } else {
0434             // 3d im2col
0435             out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
0436                 << " + x_offset,"
0437                 //  channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
0438                 //  dilation_d, dilation_h, dilation_w,
0439                 //
0440                 << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << ","
0441                 << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << ","
0442                 << fAttrPads[0] << "," << fAttrPads[1] << "," << fAttrPads[2] << ","
0443                 << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] << ","
0444                 << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << ","
0445                 << "tensor_" << fNX << "_xcol);\n\n ";
0446          }
0447          // BLAS
0448          out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
0449              << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, " << "tensor_" << fNX << "_xcol, &" << OpName
0450              << "_m,\n"; // use m if op_xcol is not transpose , otherwise k
0451          out << SP << SP << SP << "tensor_" << fNX << "_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY
0452              << " + out_offset, &" << OpName << "_m);\n";
0453       } else {
0454          // case of group convolution
0455          // Unroll (IM2COL) the input tensor- make loop on groups and repeat operations (IM2COL + GEMM for each
0456          // group)
0457          // out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n";
0458          out << SP << SP << "for (size_t g = 0; g < " << fAttrGroup << "; g++) {\n";
0459          out << SP << SP << "size_t x_offset = n * " << fShapeX[1] * iDepth * iHeight * iWidth << " + g * "
0460              << fShapeW[1] * iDepth * iHeight * iWidth << ";\n ";
0461          out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << " + g * "
0462              << fShapeW[0] * oDepth * oHeight * oWidth / fAttrGroup << ";\n ";
0463 
0464          if (fDim < 3) {
0465             out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::Im2col<float>(tensor_" << fNX
0466                 << " + x_offset,"
0467                 //  channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
0468                 //  dilation_w,
0469                 //
0470                 << fShapeW[1] << "," << iHeight << "," << iWidth << ",";
0471             if (fDim == 1)
0472                out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1,"
0473                    << fAttrDilations[0];
0474             else // dim ==2
0475                out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1]
0476                    << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << ","
0477                    << fAttrDilations[1];
0478             out << ", tensor_" << fNX << "_xcol);\n\n ";
0479          } else {
0480             // 3d im2col
0481             out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
0482                 << " + x_offset,"
0483                 //  channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
0484                 //  dilation_d, dilation_h, dilation_w,
0485                 //
0486                 << fShapeW[1] << "," << iDepth << "," << iHeight << "," << iWidth << "," << fAttrKernelShape[0] << ","
0487                 << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," << fAttrPads[0] << "," << fAttrPads[1]
0488                 << "," << fAttrPads[2] << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2]
0489                 << "," << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << ",tensor_" << fNX
0490                 << "_xcol);\n\n ";
0491          }
0492 
0493          // BLAS
0494          // n must be divided by the number of groups
0495          out << SP << SP << SP << OpName << "_n = " << fShapeW[0] / fAttrGroup << ";\n";
0496          // offset g must be  g * k * n
0497          out << SP << SP << SP << "size_t offset_f = g * "
0498              << fShapeW[0] * fShapeW[1] * fAttrKernelShape[0] * fAttrKernelShape[1] * fAttrKernelShape[2] / fAttrGroup
0499              << ";\n";
0500          out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
0501              << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, tensor_" << fNX << "_xcol, &" << OpName
0502              << "_m,\n"; // use m if op_xcol is not transpose , otherwise k
0503          out << SP << SP << SP << "tensor_" << fNX << "_f + offset_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY
0504              << " + out_offset"
0505              << ", &" << OpName << "_m);\n";
0506 
0507          out << SP << SP << "}\n"; // end of group loop
0508       }
0509 
0510       if (fNB2 != "") {
0511          out << SP << "int " << OpName << "_size = " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n";
0512          out << SP << "float " << OpName << "_gamma = 1.0;\n";
0513          out << SP << "int " << OpName << "_incx = 1;\n";
0514          out << SP << "int " << OpName << "_incy = 1;\n";
0515 
0516          out << SP << "BLAS::saxpy_(&" << OpName << "_size, &" << OpName << "_gamma, tensor_" << fNB2 << ", &"
0517              << OpName << "_incx, tensor_" << fNY << " + out_offset, &" << OpName << "_incy);\n";
0518 
0519       }
0520       out << SP << "}\n"; // end of batch size loop
0521 
0522       return out.str();
0523       }
0524 
0525    /*! \brief Returns the blas routines needed to compile the generated code
0526     */
0527    std::vector<std::string> GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; }
0528 };
0529 
0530 } // namespace SOFIE
0531 } // namespace Experimental
0532 } // namespace TMVA
0533 
0534 #endif