#ifndef TMVA_SOFIE_ROPERATOR_CONVTRANSPOSE_I
#define TMVA_SOFIE_ROPERATOR_CONVTRANSPOSE_I

#include <memory>
#include <sstream>
#include <algorithm>
#include <stdexcept>
#include <vector>
#include <cassert>

#include <TMVA/SOFIE_common.hxx>

namespace TMVA {
namespace Experimental {
namespace SOFIE {

template <typename T>
auto ROperator_ConvTranspose<T>::ShapeInference(std::vector<std::vector<size_t>> input)
   -> std::vector<std::vector<size_t>>
{
   const std::vector<size_t> &inputShape = input[0];
   const std::vector<size_t> &weightShape = input[1];
   size_t size = inputShape.size();
   // Dimension of the conv transpose op
   fDim = size - 2;
   // Number of groups
   if (fAttrGroup == 0)
      fAttrGroup = 1;
   if (fAttrStrides.empty()) {
      fAttrStrides = std::vector<size_t>(fDim, 1);
   }
   if (fAttrDilations.empty()) {
      fAttrDilations = std::vector<size_t>(fDim, 1);
   }
   // The shape of the kernel is kw for a 1d image, kh x kw for a 2d image and kd x kh x kw for a 3d image
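   // If not given, infer the kernel shape from the weight tensor, folding in the
   // dilation: the effective (dilated) kernel size is k + (d - 1) * (k - 1), i.e. d * (k - 1) + 1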
   if (fAttrKernelShape.empty()) {
      fAttrKernelShape.resize(fDim);
      for (size_t i = 0; i < fDim; i++)
         fAttrKernelShape[i] = fShapeW[i + 2] + (fAttrDilations[i] - 1) * (fShapeW[i + 2] - 1);
   }
   if (fAttrOutputPadding.empty())
      fAttrOutputPadding = std::vector<size_t>(fDim, 0);

   // The shape of the output is batch_size x out_channel x out_w for a 1d image,
   // batch_size x out_channel x out_h x out_w for a 2d image and
   // batch_size x out_channel x out_d x out_h x out_w for a 3d image,
   // where out_channel = weight_shape[1] * group
   std::vector<size_t> outShape(size);
   outShape[0] = inputShape[0];
   outShape[1] = weightShape[1] * fAttrGroup;

   // Generate the padding
   if (fAttrPads.empty()) {
      fAttrPads = std::vector<size_t>(2 * fDim, 0);
      if (fAttrOutputShape.size() == fDim) {
         // LM: to be checked... not supported for the time being
         throw
            std::runtime_error("ConvTranspose with output_shape explicitly set not yet supported.");
      /*
      std::vector<size_t> totalPadding(fDim, 1);
      for (size_t i = 0; i < fDim; i++) {
         size_t j = i + 2;
         totalPadding[i] =
            fAttrStrides[i] * (fAttrOutputShape[i] - 1) + fAttrOutputPadding[i] + fAttrKernelShape[i] - fShapeX[j];
      }

      for (size_t i = 0; i < fDim; i++) {
         size_t end_i = i + fDim;
         if (fAttrAutopad == "SAME_UPPER") {
            fAttrPads[i] = totalPadding[i] / 2;
            fAttrPads[end_i] = totalPadding[i] - fAttrPads[i];
         } else {
            fAttrPads[end_i] = totalPadding[i] / 2;
            fAttrPads[i] = totalPadding[i] - fAttrPads[end_i];
         }
      }
      */
      }
      if (fAttrAutopad != "NOTSET") {
         throw
            std::runtime_error("ConvTranspose with padding SAME_UPPER or SAME_LOWER not supported");
      }
   }
   if (fAttrOutputShape.empty()) {
      fAttrOutputShape.resize(fDim);
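      // Standard ONNX ConvTranspose output size (the kernel shape here already
      // includes the dilation):
      //   out = stride * (in - 1) + kernel + output_padding - pad_begin - pad_end
      // e.g. (illustrative values) in = 3, stride = 2, kernel = 3, no padding -> out = 2 * 2 + 3 = 7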
      for (size_t i = 0; i < fDim; i++) {
         size_t j = i + 2;
         fAttrOutputShape[i] = fAttrStrides[i] * (inputShape[j] - 1) + fAttrKernelShape[i] + fAttrOutputPadding[i] - fAttrPads[i] - fAttrPads[fDim + i];
      }
   } else {
      // The shape of the output is explicitly set
      // TODO Generate the padding from the output shape and the input shape
      throw
         std::runtime_error("ConvTranspose with output_shape explicitly set not yet supported.");
   }

   for (size_t i = 0; i < fDim; i++)
      outShape[i + 2] = fAttrOutputShape[i];
   std::vector<std::vector<size_t>> ret({outShape});
   return ret;
}

template <typename T>
void ROperator_ConvTranspose<T>::Initialize(RModel &model)
{
   fUseSession = model.UseSession();
   if (!model.CheckIfTensorAlreadyExist(fNX)) {
      throw std::runtime_error("TMVA SOFIE Conv Transpose op Input Tensor " + fNX + " is not found in model");
   }
   fShapeX = model.GetTensorShape(fNX);
   if (fShapeX.size() < 3 || fShapeX.size() > 5) {
      std::cout << fNX << " : " << ConvertShapeToString(fShapeX) << std::endl;
      throw std::runtime_error("TMVA SOFIE Conv Transpose Op input data tensor " + fNX +
                               " is not of 3, 4 or 5 dimensions");
   }
   fDim = fShapeX.size() - 2;
   if (!model.CheckIfTensorAlreadyExist(fNW)) {
      throw std::runtime_error("TMVA SOFIE Conv Transpose op Input weight Tensor " + fNW + " is not found in model");
   }
   fShapeW = model.GetTensorShape(fNW);
   if (fShapeW.size() < 3 || fShapeW.size() > 5) {
      std::cout << fNW << " : " << ConvertShapeToString(fShapeW) << std::endl;
      throw std::runtime_error("TMVA SOFIE Conv Transpose Op input weight tensor " + fNW +
                               " is not of 3, 4 or 5 dimensions");
   }
   fShapeY = ShapeInference({fShapeX, fShapeW})[0];

   model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
   if (fNB != "") {
      if (!model.CheckIfTensorAlreadyExist(fNB)) {
         throw std::runtime_error("TMVA SOFIE ConvTrans op Input Tensor " + fNB + " is not found in model");
      }
      fShapeB = model.GetTensorShape(fNB);
      if (fShapeB.size() < 1)
         throw std::runtime_error("TMVA SOFIE ConvTrans op: Bias Tensor has empty shape");

      size_t bsize = ConvertShapeToLength(fShapeB);
      size_t ysize = ConvertShapeToLength(fShapeY);
      // broadcasting is needed if the total size of B differs from that of Y
      bool broadcast_needed = (bsize != ysize);
      // Broadcast the bias B
      if (broadcast_needed) {
         // we assume the bias tensor size is equal to the number of filters, i.e. the second dimension of
         // the output tensor
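         // e.g. (illustrative shapes) a bias of shape [C] is expanded to the full output
         // shape [N, C, Oh, Ow], so that later it can be added with a single BLAS saxpy call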
         if (bsize != fShapeY[1])
            throw std::runtime_error("TMVA SOFIE ConvTrans op: Bias Tensor has wrong shape: " +
                                     ConvertShapeToString(fShapeB));

         auto original_data = model.GetInitializedTensorData(fNB);

         if (fType != "float")
            throw std::runtime_error("TMVA SOFIE ConvTrans op: Broadcasting for non-float type tensors is not supported");
         // here the actual broadcasting
         if (!fUseSession) {
            // Broadcast B from M to N x M x Od x Oh x Ow
            std::shared_ptr<void> new_data_ptr(
               UTILITY::BroadcastConvBias<float>(static_cast<float *>(original_data.get()), bsize, fShapeY),
               std::default_delete<float[]>());

            model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), fShapeY, new_data_ptr);
            fShapeB = model.GetTensorShape(fNB);
            fNBroadcastedB = fNB; // use same name
         } else {
            // In case of a session, the broadcasting code is added to the Session constructor by GenerateInitCode;
            // we need to add a new intermediate tensor for the broadcasted bias tensor
            fNBroadcastedB = "Broadcasted" + fNB;
            model.AddIntermediateTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY);
         }
      } else {
         // bias tensor already has the correct size, no need to broadcast
         if (fShapeY != fShapeB)
            throw std::runtime_error("TMVA SOFIE ConvTrans op: Broadcasting is not needed but bias has wrong shape " +
               ConvertShapeToString(fShapeB));
         fNBroadcastedB = fNB;
      }
   }
}

template <typename T>
std::string ROperator_ConvTranspose<T>::GenerateInitCode()
{
   std::stringstream out;
   // generate initialization code for broadcasting of the bias tensor
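   // the emitted code runs in the Session constructor, so the broadcast is done
   // once at initialization time rather than on every inference call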
   size_t bsize = ConvertShapeToLength(fShapeB);
   size_t ysize = ConvertShapeToLength(fShapeY);
   if (bsize != ysize && !fNBroadcastedB.empty()) {
      // use a separate scope to avoid name clashes with other operators' temp variables
      out << SP << "{\n";
      out << SP << SP << "float * data = TMVA::Experimental::SOFIE::UTILITY::BroadcastConvBias<float>(tensor_"
          << fNB << ", " << bsize << ", " << ConvertShapeToString(fShapeY) << ");\n";
      out << SP << SP << "std::copy(data, data + " << ConvertShapeToLength(fShapeY) << ", tensor_" << fNBroadcastedB << ");\n";
      out << SP << SP << "delete[] data;\n";
      out << SP << "}\n";
   }
   return out.str();
}

// generate code for Session data members (e.g. internal vectors)
template <typename T>
std::string ROperator_ConvTranspose<T>::GenerateSessionMembersCode(std::string opName)
{
   // size_t outputChannelSize = fShapeY[1];
   size_t kernelSize = 1;
   size_t inputSize = 1;
   for (size_t i = 0; i < fDim; i++) {
      inputSize *= fShapeX[2 + i];
      kernelSize *= fAttrKernelShape[i];
   }
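   // the buffers below are Session data members (instead of stack arrays in the
   // code emitted by Generate), so they are allocated once and reused across inference calls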

   opName = "op_" + opName;
   std::stringstream out;
   // matrix with the convolution kernels
   out << "std::vector<" << fType << "> fVec_" << opName << "_f = std::vector<" << fType << ">("
       << fShapeW[0] * fShapeW[1] * kernelSize << ");\n";
   // output matrix of im2col
   out << "std::vector<" << fType << "> fVec_" << opName << "_xcol = std::vector<" << fType << ">("
       << kernelSize * fShapeW[1] * inputSize << ");\n"; // kernel size * output channel size * input size
   out << "\n";

   return out.str();
}

template <typename T>
std::string ROperator_ConvTranspose<T>::Generate(std::string OpName)
{
   OpName = "op_" + OpName;

   if (fShapeX.empty() || fShapeW.empty() || (fNB != "" && fShapeB.empty()) || fShapeY.empty()) {
      throw std::runtime_error("TMVA SOFIE Conv Transpose op called to Generate without being initialized first");
   }

   std::stringstream out;

   size_t bsize = fShapeX[0];
   size_t kDepth = (fDim > 2) ? fShapeW[2] : 1;     // kernel depth
   size_t kHeight = (fDim > 1) ? fShapeW[fDim] : 1; // kernel height
   size_t kWidth = fShapeW[fDim + 1];               // kernel width

   size_t iDepth = (fDim > 2) ? fShapeX[2] : 1;     // input depth
   size_t iHeight = (fDim > 1) ? fShapeX[fDim] : 1; // input height
   size_t iWidth = fShapeX[fDim + 1];               // input width

   size_t oDepth = (fDim > 2) ? fShapeY[2] : 1;     // output depth
   size_t oHeight = (fDim > 1) ? fShapeY[fDim] : 1; // output height
   size_t oWidth = fShapeY[fDim + 1];               // output width

   out << "\n//----  operator ConvTranspose " << OpName << "\n";

   // create first matrix with convolution kernels
   if (fUseSession)
      out << SP << fType << " * " << OpName << "_f = fVec_" << OpName << "_f.data();\n";
   else {
      size_t kernelSize = fAttrKernelShape[0];
      if (fDim > 1)
         kernelSize *= fAttrKernelShape[1];
      out << SP << fType << " " << OpName << "_f[" << fShapeW[0] * fShapeW[1] * kernelSize << "] = {0};\n";
   }

   // vectorize the (dilated) convolution kernels into a matrix
   // The shape of the kernel is W for a 1d image, H x W for a 2d image and D x H x W
   // for a 3d image
   size_t id = (fDim > 2) ? fDim - 3 : 2;
   size_t ih = (fDim > 1) ? fDim - 2 : 1;
   size_t iw = fDim - 1;
   size_t wstrideDil = fAttrDilations[iw];
   size_t hstride = kWidth;
   // dilated stride in the height
   size_t hstrideDil = fAttrKernelShape[iw];
   if (fDim > 1)
      hstrideDil *= fAttrDilations[ih];
   size_t dstride = kHeight * kWidth;
   size_t dstrideDil = fAttrKernelShape[iw];
   if (fDim > 1)
      dstrideDil *= fAttrKernelShape[ih];
   if (fDim > 2)
      dstrideDil *= fAttrDilations[id];
   size_t icstride = kHeight * kWidth * kDepth;
   size_t icstrideDil = 1;
   for (size_t i = 0; i < fDim; i++)
      icstrideDil *= fAttrKernelShape[i];
   size_t ocstride = fShapeW[1] * icstride;
   size_t ocstrideDil = fShapeW[1] * icstrideDil;
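   // icstride/ocstride index the original weight tensor, while the *strideDil
   // variants index the dilated kernel matrix OpName_f filled below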

   // The shape of f is [M/group, kHeight x kWidth]
   out << SP << "for (std::size_t ic = 0; ic < " << fShapeW[0] << "; ic++) {\n";
   out << SP << SP << "for (std::size_t oc = 0; oc < " << fShapeW[1] << "; oc++) {\n";
   //out << SP << SP << SP << "size_t kIndex = 0;\n";  // filter index
   if (fDim > 2)
      out << SP << SP << SP << "for (std::size_t kd = 0; kd < " << kDepth << "; kd++) {\n";
   if (fDim > 1)
      out << SP << SP << SP << "for (std::size_t kh = 0; kh < " << kHeight << "; kh++) {\n";
   out << SP << SP << SP << SP << "for (std::size_t kw = 0; kw < " << kWidth << "; kw++) {\n";

   out << SP << SP << SP << SP << SP << OpName << "_f[ic * " << ocstrideDil << " + oc * " << icstrideDil;
   if (fDim > 2)
      out << " + kd * " << dstrideDil;
   if (fDim > 1)
      out << " + kh * " << hstrideDil;
   out << " + kw * " << wstrideDil << "  ] = tensor_" << fNW << "[ic * " << ocstride << " + oc * " << icstride;

   if (fDim > 2)
      out << " + kd * " << dstride;
   if (fDim > 1)
      out << " + kh * " << hstride;
   out << " + kw ];\n";

   // here we would rotate the input kernel, transforming 0,1,2,...,N-1 into N-1,N-2,...,2,1,0
   // out << " + " << icstride - 1 << " - kIndex ];\n"; // transform 1,2,3,4 into 4,3,2,1
   // out << SP << SP << SP << SP << SP << "kIndex++;\n";  // update input filter index

   out << SP << SP << SP << SP << "}\n";
   if (fDim > 1)
      out << SP << SP << SP << "}\n";
   if (fDim > 2)
      out << SP << SP << SP << "}\n";

   out << SP << SP << "}\n";
   out << SP << "}\n";
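   // at this point OpName_f holds the weights laid out as a matrix of shape
   // [fShapeW[0], fShapeW[1] * dilated kernel volume]; the holes introduced by the
   // dilation stay zero, since the buffer was zero-initialized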

   out << SP << "char " << OpName << "_transA = 'N';\n";
   out << SP << "char " << OpName << "_transB = 'T';\n";
   out << SP << "int " << OpName << "_m = " << iHeight * iWidth * iDepth << ";\n";
   out << SP << "int " << OpName << "_n = " << icstrideDil * fShapeW[1] << ";\n"; // dilated kernel size * output channels
   out << SP << "int " << OpName << "_k = " << fShapeW[0] << ";\n"; // input channels
   out << SP << "float " << OpName << "_alpha = 1.0;\n";
   out << SP << "float " << OpName << "_beta = 0.0;\n";
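   // ConvTranspose is implemented as the reverse of Conv: instead of im2col followed
   // by a GEMM, a GEMM is done first (roughly xcol = W^T * X, with the m, n, k set
   // above), and col2im then folds the columns of xcol back into the output image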

   if (fUseSession) {
      out << SP << fType << " * " << OpName << "_xcol = fVec_" << OpName << "_xcol.data();\n";
   } else {
      out << SP << fType << " " << OpName << "_xcol[" << fShapeW[0] * icstrideDil * oDepth * oHeight * oWidth << "] = {0};\n";
   }

   // Loop on batch size
   out << SP << "for (size_t n = 0; n < " << bsize << "; n++) {\n";

   // IM2COL: Unroll the input tensor
   // order the input data as (e.g. for a 2x2 kernel, where (xa,ya) is channel 1 and (xb,yb) is channel 2):
   //   (xa1,..,xak,ya1,..yak)(xb1,...,xbk,yb1,..,ybk)
   //   (xa2,...xak+1,ya1,...yak)(......)
   // the trick for speed is using the caffe im2col, which outputs a matrix containing the filtered values as rows.
   // By doing this one has consecutive memory reads and writes
   // The resulting matrix op_xcol is (output channels * filter_h * filter_w, output_h * output_w)
   if (fDim == 1) {
      if (fAttrPads[0] != fAttrPads[1]) {
         std::cout << "TMVA SOFIE Operator ConvTranspose: asymmetric padding not supported. Assuming an average padding "
                   << std::endl;
         fAttrPads[0] = (fAttrPads[0] + fAttrPads[1]) / 2;
      }
      fAttrPads[1] = 0;
   }
   if (fDim == 2) {
      if (fAttrPads[0] != fAttrPads[2] || fAttrPads[1] != fAttrPads[3]) {
         std::cout << "TMVA SOFIE Operator ConvTranspose: asymmetric padding not supported. Assuming an average padding "
                   << std::endl;
         fAttrPads[0] = (fAttrPads[0] + fAttrPads[2]) / 2;
         fAttrPads[1] = (fAttrPads[1] + fAttrPads[3]) / 2;
      }
   }
   if (fDim == 3) {
      if (fAttrPads[0] != fAttrPads[3] || fAttrPads[1] != fAttrPads[4] || fAttrPads[2] != fAttrPads[5]) {
         std::cout << "TMVA SOFIE Operator ConvTranspose: asymmetric padding not supported. Assuming an average padding "
                   << std::endl;
         fAttrPads[0] = (fAttrPads[0] + fAttrPads[3]) / 2;
         fAttrPads[1] = (fAttrPads[1] + fAttrPads[4]) / 2;
         fAttrPads[2] = (fAttrPads[2] + fAttrPads[5]) / 2;
      }
   }

   if (fAttrGroup == 1) {
      out << SP << SP << "size_t x_offset = n * " << fShapeX[1] * iDepth * iHeight * iWidth << ";\n";
      out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n";

      // BLAS GEMM (done before the col2im, see above)
      out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
          << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, "
          << "tensor_" << fNX << " + x_offset, &" << OpName << "_m,\n"; // use m if op_xcol is not transposed, otherwise k
      out << SP << SP << SP << OpName << "_f, &" << OpName << "_n, &" << OpName << "_beta, "
          << OpName << "_xcol, &" << OpName << "_m);\n";

      // when using im2col the resulting matrix is transposed: it is (input_c * filter_h * filter_w, output_h *
      // output_w),
      // so the matrix needs to be transposed before using col2im
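      // for fDim == 1 the 2d col2im routine is reused by passing a kernel height of 1,
      // zero vertical padding and unit vertical stride/dilation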
      if (fDim < 3) {
         out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::col2im<float>(" << OpName << "_xcol,"
             //  channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
             //  dilation_w,
             << fShapeY[1] << "," << oHeight << "," << oWidth << ",";
         if (fDim == 1)
            out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1,"
                << fAttrDilations[0];
         else // dim == 2
            out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1]
                << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << ","
                << fAttrDilations[1];
         out << ", tensor_" << fNY << " + out_offset);\n\n ";
      } else {
         // 3d: a col2im for 3d is needed; the code below is unreachable until it is supported
         throw std::runtime_error("TMVA SOFIE 3D Conv Transpose not yet supported");
         out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
             << " + x_offset,"
             //  channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
             //  dilation_d, dilation_h, dilation_w,
             //
             << fShapeX[1] << "," << oDepth << "," << oHeight << "," << oWidth << "," << fAttrKernelShape[0] << ","
             << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," << fAttrPads[0] << "," << fAttrPads[1] << ","
             << fAttrPads[2] << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] << ","
             << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << "," << OpName
             << "_xcol);\n\n ";
      }
      // // BLAS (the original Conv-style GEMM after im2col, kept for reference)
      // out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
      //     << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, " << OpName << "_xcol, &" << OpName
      //     << "_m,\n"; // use m if op_xcol is not transposed, otherwise k
      // out << SP << SP << SP << OpName << "_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY
      //     << " + out_offset, &" << OpName << "_m);\n";
   } else {
      // case of grouped transposed convolution:
      // loop on the groups and repeat the operations (GEMM + col2im) for each
      // group
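      // note that, unlike the fAttrGroup == 1 branch above, these offsets do not include
      // the depth factor; this is fine while only fDim < 3 is supported (iDepth == 1)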
      out << SP << SP << "for (size_t g = 0; g < " << fAttrGroup << "; g++) {\n";
      out << SP << SP << "size_t x_offset = n * " << fShapeX[1] * iHeight * iWidth << " + g * "
          << fShapeX[1] * iHeight * iWidth / fAttrGroup << ";\n ";
      out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oHeight * oWidth << " + g * "
          << fShapeY[1] * oHeight * oWidth / fAttrGroup << ";\n ";

      // do BLAS here (LM: probably need an offset for op_f, the kernels)
      out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
          << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, "
          << "tensor_" << fNX << " + x_offset, &" << OpName
          << "_m,\n"; // use m if op_xcol is not transposed, otherwise k
      out << SP << SP << SP << OpName << "_f, &" << OpName << "_n, &" << OpName
          << "_beta, " << OpName << "_xcol, &" << OpName << "_m);\n";

      if (fDim < 3) {
         out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::col2im<float>(" << OpName << "_xcol,"
             //  channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
             //  dilation_w,
             << fShapeY[1] << "," << oHeight << "," << oWidth << ",";
         if (fDim == 1)
            out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1,"
                << fAttrDilations[0];
         else // dim == 2
            out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1]
                << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << ","
                << fAttrDilations[1];
         out << ", tensor_" << fNY << " + out_offset);\n\n ";
      } else {
         // 3d: a col2im for 3d is needed; the code below is unreachable until it is supported
         throw std::runtime_error("TMVA SOFIE 3D Conv Transpose not yet supported");

         out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
             << " + x_offset,"
             //  channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
             //  dilation_d, dilation_h, dilation_w,
             //
             << fShapeX[1] << "," << oDepth << "," << oHeight << "," << oWidth << "," << fAttrKernelShape[0] << ","
             << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," << fAttrPads[0] << "," << fAttrPads[1] << ","
             << fAttrPads[2] << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] << ","
             << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << "," << OpName
             << "_xcol);\n\n ";
      }

      // // BLAS (the original Conv-style GEMM after im2col, kept for reference)
      // // the offset for group g must be g * k * n
      // out << SP << SP << SP << "size_t offset_f = g * " << fShapeW[0] * fShapeW[1] * icstrideDil / fAttrGroup << ";\n";
      // out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
      //     << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, " << OpName << "_xcol, &" << OpName
      //     << "_m,\n"; // use m if op_xcol is not transposed, otherwise k
      // out << SP << SP << SP << OpName << "_f + offset_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY
      //     << " + out_offset"
      //     << ", &" << OpName << "_m);\n";

      out << SP << SP << "}\n"; // end of group loop
   }

   out << SP << "}\n"; // end of batch size loop

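   // add the bias: fNBroadcastedB was already broadcast to the full output shape
   // (in Initialize or in the Session constructor), so a single saxpy (y += 1.0 * b) suffices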
   if (fNBroadcastedB != "") {
      out << SP << "int " << OpName << "_size = " << fShapeY[0] * fShapeY[1] * oDepth * oHeight * oWidth << ";\n";
      out << SP << "float " << OpName << "_gamma = 1.0;\n";
      out << SP << "int " << OpName << "_incx = 1;\n";
      out << SP << "int " << OpName << "_incy = 1;\n";

      out << SP << "BLAS::saxpy_(&" << OpName << "_size, &" << OpName << "_gamma, tensor_" << fNBroadcastedB << ", &"
          << OpName << "_incx, tensor_" << fNY << ", &" << OpName << "_incy);\n";
   }

   return out.str();
}

} // namespace SOFIE
} // namespace Experimental
} // namespace TMVA

#endif