#ifndef TMVA_SOFIE_ROPERATOR_CONVTRANSPOSE_I
#define TMVA_SOFIE_ROPERATOR_CONVTRANSPOSE_I

#include <memory>
#include <sstream>
#include <algorithm>
#include <stdexcept>
#include <vector>
#include <cassert>

#include <TMVA/SOFIE_common.hxx>

namespace TMVA {
namespace Experimental {
namespace SOFIE {

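// Shape inference for the transposed convolution. Fills in default attribute
// values (group, strides, dilations, kernel shape, output padding) and, per
// the ONNX ConvTranspose specification, computes for each spatial axis i
//   out[i] = stride[i] * (in[i] - 1) + output_padding[i]
//            + ((kernel[i] - 1) * dilation[i] + 1) - pad_begin[i] - pad_end[i]
// (fAttrKernelShape below already stores the dilated kernel extent).
// Example: in = 4, stride = 2, kernel = 3, no dilation and no padding gives
// out = 2 * 3 + 3 = 9.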
template <typename T>
auto ROperator_ConvTranspose<T>::ShapeInference(std::vector<std::vector<size_t>> input)
   -> std::vector<std::vector<size_t>>
{
   const std::vector<size_t> &inputShape = input[0];
   const std::vector<size_t> &weightShape = input[1];
   size_t size = inputShape.size();
   // Dimension of the conv transpose op
   fDim = size - 2;
   // Number of groups
   if (fAttrGroup == 0)
      fAttrGroup = 1;
   if (fAttrStrides.empty()) {
      fAttrStrides = std::vector<size_t>(fDim, 1);
   }
   if (fAttrDilations.empty()) {
      fAttrDilations = std::vector<size_t>(fDim, 1);
   }
   // The kernel shape is kw for a 1d image, kh x kw for a 2d image and kd x kh x kw for a 3d image
   if (fAttrKernelShape.empty()) {
      fAttrKernelShape.resize(fDim);
      for (size_t i = 0; i < fDim; i++)
         fAttrKernelShape[i] = fShapeW[i + 2] + (fAttrDilations[i] - 1) * (fShapeW[i + 2] - 1);
   }
   if (fAttrOutputPadding.empty())
      fAttrOutputPadding = std::vector<size_t>(fDim, 0);

   // The shape of the output is batch_size x out_channel x out_w for a 1d image,
   // batch_size x out_channel x out_h x out_w for a 2d image and
   // batch_size x out_channel x out_d x out_h x out_w for a 3d image
   // where out_channel = weight_shape[1] * group
   std::vector<size_t> outShape(size);
   outShape[0] = inputShape[0];
   outShape[1] = weightShape[1] * fAttrGroup;

   // Generate the padding
   if (fAttrPads.empty()) {
      fAttrPads = std::vector<size_t>(2 * fDim, 0);
      if (fAttrOutputShape.size() == fDim) {
         // LM: to be checked...
         // not supported for the time being
         throw
            std::runtime_error("ConvTranspose with output_shape explicitly set not yet supported.");
      /*
      std::vector<size_t> totalPadding(fDim, 1);
      for (size_t i = 0; i < fDim; i++) {
         size_t j = i + 2;
         totalPadding[i] =
            fAttrStrides[i] * (fAttrOutputShape[i] - 1) + fAttrOutputPadding[i] + fAttrKernelShape[i] - fShapeX[j];
      }

      for (size_t i = 0; i < fDim; i++) {
         size_t end_i = i + fDim;
         if (fAttrAutopad == "SAME_UPPER") {
            fAttrPads[i] = totalPadding[i] / 2;
            fAttrPads[end_i] = totalPadding[i] - fAttrPads[i];
         } else {
            fAttrPads[end_i] = totalPadding[i] / 2;
            fAttrPads[i] = totalPadding[i] - fAttrPads[end_i];
         }
      }
      */
      }
      if (fAttrAutopad != "NOTSET") {
         throw
            std::runtime_error("ConvTranspose with padding SAME_UPPER or SAME_LOWER not supported");
      }
   }
   if (fAttrOutputShape.empty()) {
      fAttrOutputShape.resize(fDim);
      for (size_t i = 0; i < fDim; i++) {
         size_t j = i + 2;
         fAttrOutputShape[i] = fAttrStrides[i] * (inputShape[j] - 1) + fAttrKernelShape[i] + fAttrOutputPadding[i] - fAttrPads[i] - fAttrPads[fDim + i];
      }
   } else {
      // The shape of the output is explicitly set
      // TODO Generate the padding from the output shape and the input shape
      throw
         std::runtime_error("ConvTranspose with output_shape explicitly set not yet supported.");
   }

   for (size_t i = 0; i < fDim; i++)
      outShape[i + 2] = fAttrOutputShape[i];
   std::vector<std::vector<size_t>> ret({outShape});
   return ret;
}

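// Validate the input, weight and (optional) bias tensors against the model,
// run shape inference, and register the intermediate tensors used by the
// generated code. A bias whose size differs from the output is broadcast:
// immediately when no session is used, otherwise through code emitted by
// GenerateInitCode into the Session constructor.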
template <typename T>
void ROperator_ConvTranspose<T>::Initialize(RModel& model){

   fUseSession = model.UseSession();
   if (!model.CheckIfTensorAlreadyExist(fNX)) {
      throw std::runtime_error("TMVA SOFIE Conv Transpose op Input Tensor " + fNX + " is not found in model");
   }
   fShapeX = model.GetTensorShape(fNX);
   if (fShapeX.size() < 3 || fShapeX.size() > 5) {
      std::cout << fNX << " : " << ConvertShapeToString(fShapeX) << std::endl;
      throw std::runtime_error("TMVA SOFIE Conv Transpose Op input data tensor " + fNX +
                               " is not of 3, 4 or 5 dimensions");
   }
   fDim = fShapeX.size() - 2;
   if (!model.CheckIfTensorAlreadyExist(fNW)) {
      throw std::runtime_error("TMVA SOFIE Conv Transpose op Input weight Tensor " + fNW + " is not found in model");
   }
   fShapeW = model.GetTensorShape(fNW);
   if (fShapeW.size() < 3 || fShapeW.size() > 5) {
      std::cout << fNW << " : " << ConvertShapeToString(fShapeW) << std::endl;
      throw std::runtime_error("TMVA SOFIE Conv Transpose Op input weight tensor " + fNW +
                               " is not of 3, 4 or 5 dimensions");
   }
   fShapeY = ShapeInference({fShapeX, fShapeW})[0];

   model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY);
   if (fNB != "") {
      if (!model.CheckIfTensorAlreadyExist(fNB)) {
         throw std::runtime_error("TMVA SOFIE ConvTrans op Input Tensor " + fNB + " is not found in model");
      }
      fShapeB = model.GetTensorShape(fNB);
      if (fShapeB.size() < 1)
         throw std::runtime_error("TMVA SOFIE ConvTrans op: Bias Tensor has empty shape");

      size_t bsize = ConvertShapeToLength(fShapeB);
      size_t ysize = ConvertShapeToLength(fShapeY);
      // broadcasting is needed if the total size of B differs from that of Y
      bool broadcast_needed = (bsize != ysize);
      // Broadcast the bias B
      if (broadcast_needed) {
         // we assume the bias tensor size equals the number of filters, i.e. the second dimension
         // of the output tensor
         if (bsize != fShapeY[1])
            throw std::runtime_error("TMVA SOFIE ConvTrans op: Bias Tensor has wrong shape: " +
                                     ConvertShapeToString(fShapeB));

         auto original_data = model.GetInitializedTensorData(fNB);

         if (fType != "float")
            throw std::runtime_error("TMVA SOFIE ConvTrans op: Broadcasting for non-float type tensors is not supported");
         // here the actual broadcasting
         if (!fUseSession) {
            // Broadcast B from M to N x M x Od x Oh x Ow
            std::shared_ptr<void> new_data_ptr(
               UTILITY::BroadcastConvBias<float>(static_cast<float *>(original_data.get()), bsize, fShapeY),
               std::default_delete<float[]>());

            model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), fShapeY, new_data_ptr);
            fShapeB = model.GetTensorShape(fNB);
            fNBroadcastedB = fNB; // use same name
         } else {
            // In case of session add broadcasting code in Session constructor and in GenerateInitCode
            // we need to add a new intermediate tensor for the broadcasted bias tensor
            fNBroadcastedB = "Broadcasted" + fNB;
            model.AddIntermediateTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY);
         }
      } else {
         // bias tensor has already the correct shape, no need to broadcast
         if (fShapeY != fShapeB)
            throw std::runtime_error("TMVA SOFIE ConvTrans op: Broadcasting is not needed but bias has wrong shape: " +
               ConvertShapeToString(fShapeB));
         fNBroadcastedB = fNB;
      }
   }

   size_t kernelSize = 1;
   size_t inputSize = 1;
   for (size_t i = 0; i < fDim; i++) {
      inputSize *= fShapeX[2 + i];
      kernelSize *= fAttrKernelShape[i];
   }

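   // Register the helper tensors needed by the generated code:
   //   <fNX>_f    : the vectorized (dilated) kernels, shape (C_in, C_out/group, kernel size)
   //   <fNX>_xcol : the column-matrix workspace written by the GEMM and read by col2im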
   std::vector<size_t> shape1 = {fShapeW[0], fShapeW[1], kernelSize};
   std::vector<size_t> shape2 = {fShapeW[1], kernelSize, inputSize};
   model.AddIntermediateTensor(fNX + "_f", ConvertStringToType(fType), shape1);
   model.AddIntermediateTensor(fNX + "_xcol", ConvertStringToType(fType), shape2);
   fConvK = fNX + "_f";
   fImcol = fNX + "_xcol";
   fOutputTensorNames.emplace_back(fConvK);
   fOutputTensorNames.emplace_back(fImcol);
}
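// Generate the code executed once at initialization: when the bias has to be
// broadcast, emit a block that expands tensor_<fNB> to the full output shape
// and copies it into the broadcast tensor registered in Initialize.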
template <typename T>
std::string ROperator_ConvTranspose<T>::GenerateInitCode()
{
   std::stringstream out;
   // generate initialization code for broadcasting of bias tensor
   size_t bsize = ConvertShapeToLength(fShapeB);
   size_t ysize = ConvertShapeToLength(fShapeY);
   if (bsize != ysize && !fNBroadcastedB.empty()) {
      // use a separate scope to avoid name clashes with temporaries of other operators
      out << SP << "{\n";
      out << SP << SP << "float * data = TMVA::Experimental::SOFIE::UTILITY::BroadcastConvBias<float>(tensor_"
          << fNB << ", " << bsize << ", " << ConvertShapeToString(fShapeY) << ");\n";
      out << SP << SP << "std::copy(data, data + " << ConvertShapeToLength(fShapeY) << ", tensor_" << fNBroadcastedB << ");\n";
      out << SP << SP << "delete[] data;\n";
      out << SP << "}\n";
   }
   return out.str();
}

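// Generate the inference code. The approach is the reverse of the im2col-based
// convolution: for every batch entry a single GEMM multiplies the input by the
// vectorized kernel matrix, producing the column matrix tensor_<fNX>_xcol of
// shape (C_out/group * kernel size, input spatial size); col2im then scatters
// (accumulating) its overlapping patches into the output tensor. An optional
// bias is added at the end with a BLAS axpy.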
template <typename T>
std::string ROperator_ConvTranspose<T>::Generate(std::string OpName)
{
   OpName = "op_" + OpName;

   if (fShapeX.empty() || fShapeW.empty() || (fNB != "" && fShapeB.empty()) || fShapeY.empty()) {
      throw std::runtime_error("TMVA SOFIE Conv Transpose Op called to Generate without being initialized first");
   }

   std::stringstream out;

   size_t bsize = fShapeX[0];
   size_t kDepth = (fDim > 2) ? fShapeW[2] : 1;     // kernel depth
   size_t kHeight = (fDim > 1) ? fShapeW[fDim] : 1; // kernel height
   size_t kWidth = fShapeW[fDim + 1];               // kernel width

   size_t iDepth = (fDim > 2) ? fShapeX[2] : 1;     // input depth
   size_t iHeight = (fDim > 1) ? fShapeX[fDim] : 1; // input height
   size_t iWidth = fShapeX[fDim + 1];               // input width

   size_t oDepth = (fDim > 2) ? fShapeY[2] : 1;     // output depth
   size_t oHeight = (fDim > 1) ? fShapeY[fDim] : 1; // output height
   size_t oWidth = fShapeY[fDim + 1];               // output width

   out << "\n//----  operator ConvTranspose " << OpName << "\n";

   // create first matrix with convolution kernels
   if (!fUseSession) {
      size_t kernelSize = fAttrKernelShape[0];
      if (fDim > 1)
         kernelSize *= fAttrKernelShape[1];
      out << SP << fType << " tensor_" << fNX << "_f[" << fShapeW[0] * fShapeW[1] * kernelSize << "] = {0};\n";
   }

   // vectorize the (dilated) convolution kernels into a matrix
   // The kernel shape is W for a 1d image, H x W for a 2d image and D x H x W
   // for a 3d image
   size_t id = (fDim > 2) ? fDim - 3 : 2;
   size_t ih = (fDim > 1) ? fDim - 2 : 1;
   size_t iw = fDim - 1;
   size_t wstrideDil = fAttrDilations[iw];
   size_t hstride = kWidth;
   // stride dilated in the height
   size_t hstrideDil = fAttrKernelShape[iw];
   if (fDim > 1)
      hstrideDil *= fAttrDilations[ih];
   size_t dstride = kHeight * kWidth;
   size_t dstrideDil = fAttrKernelShape[iw];
   if (fDim > 1)
      dstrideDil *= fAttrKernelShape[ih];
   if (fDim > 2)
      dstrideDil *= fAttrDilations[id];
   size_t icstride = kHeight * kWidth * kDepth;
   size_t icstrideDil = 1;
   for (size_t i = 0; i < fDim; i++)
      icstrideDil *= fAttrKernelShape[i];
   size_t ocstride = fShapeW[1] * icstride;
   size_t ocstrideDil = fShapeW[1] * icstrideDil;

   // The shape of f is [M/group, kHeight x kWidth]
   out << SP << "for (std::size_t ic = 0; ic < " << fShapeW[0] << "; ic++) {\n";
   out << SP << SP << "for (std::size_t oc = 0; oc < " << fShapeW[1] << "; oc++) {\n";
   //out << SP << SP << SP << "size_t kIndex = 0;\n";  // filter index
   if (fDim > 2)
      out << SP << SP << SP << "for (std::size_t kd = 0; kd < " << kDepth << "; kd++) {\n";
   if (fDim > 1)
      out << SP << SP << SP << "for (std::size_t kh = 0; kh < " << kHeight << "; kh++) {\n";
   out << SP << SP << SP << SP << "for (std::size_t kw = 0; kw < " << kWidth << "; kw++) {\n";

   out << SP << SP << SP << SP << SP << "tensor_" << fNX << "_f[ic * " << ocstrideDil << " + oc * " << icstrideDil;
   if (fDim > 2)
      out << " + kd * " << dstrideDil;
   if (fDim > 1)
      out << " + kh * " << hstrideDil;
   out << " + kw * " << wstrideDil << "  ] = tensor_" << fNW << "[ic * " << ocstride << " + oc * " << icstride;

   if (fDim > 2)
      out << " + kd * " << dstride;
   if (fDim > 1)
      out << " + kh * " << hstride;
   out << " + kw ];\n";

   // here we would rotate the input kernel, transforming 0,1,2,...,N-1 into N-1,N-2,...,1,0
   // out << " + " << icstride - 1 << " - kIndex ];\n"; // transform 1,2,3,4 into 4,3,2,1
   // out << SP << SP << SP << SP << SP << "kIndex++;\n";  // update input filter index

   out << SP << SP << SP << SP << "}\n";
   if (fDim > 1)
      out << SP << SP << SP << "}\n";
   if (fDim > 2)
      out << SP << SP << SP << "}\n";

   out << SP << SP << "}\n";
   out << SP << "}\n";

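   // BLAS GEMM parameters: in column-major terms the call emitted below computes
   //   xcol(m x n) = X(m x k) * F^T(k x n)
   // with m = input spatial size, n = C_out/group * dilated kernel size and
   // k = C_in; viewed row-major this is xcol = F^T * X.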
   out << SP << "char " << OpName << "_transA = 'N';\n";
   out << SP << "char " << OpName << "_transB = 'T';\n";
   out << SP << "int " << OpName << "_m = " << iHeight * iWidth * iDepth << ";\n";
   out << SP << "int " << OpName << "_n = " << icstrideDil * fShapeW[1] << ";\n"; // output channels * filters
   out << SP << "int " << OpName << "_k = " << fShapeW[0] << ";\n";               // input channels
   out << SP << "float " << OpName << "_alpha = 1.0;\n";
   out << SP << "float " << OpName << "_beta = 0.0;\n";

   if (!fUseSession) {
      out << SP << fType << " tensor_" << fNX << "_xcol[" << fShapeW[0] * icstrideDil * oDepth * oHeight * oWidth << "] = {0};\n";
   }

   // Loop on batch size
   out << SP << "for (size_t n = 0; n < " << bsize << "; n++) {\n";

   // IM2COL: Unroll the input tensor
   // order the input data as (e.g. for a 2x2 kernel, with (xa,ya) channel 1 and (xb,yb) channel 2)
   //   (xa1,..,xak,ya1,..yak)(xb1,...,xbk,yb1,..,ybk)
   //   (xa2,...xak+1,ya1,...yak)(......)
   // the trick for speed is to use the caffe im2col and output a matrix which contains the filtered values as rows.
   // By doing this one has consecutive memory reads and writes
   // The resulting matrix op_xcol is (output channels * filter_h * filter_w, output_h * output_w)
   if (fDim == 1) {
      if (fAttrPads[0] != fAttrPads[1]) {
         std::cout << "TMVA SOFIE Operator ConvTranspose:  asymmetric padding not supported. Assuming an average padding "
                   << std::endl;
         fAttrPads[0] = (fAttrPads[0] + fAttrPads[1]) / 2;
      }
      fAttrPads[1] = 0;
   }
   if (fDim == 2) {
      if (fAttrPads[0] != fAttrPads[2] || fAttrPads[1] != fAttrPads[3]) {
         std::cout << "TMVA SOFIE Operator ConvTranspose:  asymmetric padding not supported. Assuming an average padding "
                   << std::endl;
         fAttrPads[0] = (fAttrPads[0] + fAttrPads[2]) / 2;
         fAttrPads[1] = (fAttrPads[1] + fAttrPads[3]) / 2;
      }
   }
   if (fDim == 3) {
      if (fAttrPads[0] != fAttrPads[3] || fAttrPads[1] != fAttrPads[4] || fAttrPads[2] != fAttrPads[5]) {
         std::cout << "TMVA SOFIE Operator ConvTranspose:  asymmetric padding not supported. Assuming an average padding "
                   << std::endl;
         fAttrPads[0] = (fAttrPads[0] + fAttrPads[3]) / 2;
         fAttrPads[1] = (fAttrPads[1] + fAttrPads[4]) / 2;
         fAttrPads[2] = (fAttrPads[2] + fAttrPads[5]) / 2;
      }
   }

0359 
0360    if (fAttrGroup == 1) {
0361       out << SP << SP << "size_t x_offset = n * " << fShapeX[1] * iDepth * iHeight * iWidth << ";\n";
0362       out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oDepth * oHeight * oWidth << ";\n";
0363 
0364       // DO BLAS before:
0365        // BLAS
0366       out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
0367           << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, "
0368           <<  "tensor_" << fNX << " + x_offset, &" << OpName << "_m,\n"; // use m if op_xcol is not transpose , otherwise k
0369       out << SP << SP << SP << "tensor_" << fNX <<"_f, &" << OpName << "_n, &" << OpName << "_beta, tensor_" 
0370       << fNX <<"_xcol, &" << OpName << "_m);\n";
0371 
0372       // when using im2col - resulting matrix is transposed, is (input_c * filter_h * filter_w,  output_h *
0373       // output_w)
0374       // before using col2im I need to transpose matrix
0375       if (fDim < 3) {
0376          out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::col2im<float>(tensor_" << fNX << "_xcol,"
0377              //  channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
0378              //  dilation_w,
0379              << fShapeY[1] << "," << oHeight << "," << oWidth << ",";
0380          if (fDim == 1)
0381             out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1,"
0382                 << fAttrDilations[0];
0383          else // dim ==2
0384             out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1]
0385                 << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << ","
0386                 << fAttrDilations[1];
0387          out << ", tensor_" << fNY << " + out_offset);\n\n ";
      } else {
         // 3d : needs a col2im for 3d
         throw std::runtime_error("TMVA SOFIE 3D Conv Transpose not yet supported");
         out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
             << " + x_offset,"
             //  channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
             //  dilation_d, dilation_h, dilation_w,
             //
             << fShapeX[1] << "," << oDepth << "," << oHeight << "," << oWidth << "," << fAttrKernelShape[0] << ","
             << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," << fAttrPads[0] << "," << fAttrPads[1] << ","
             << fAttrPads[2] << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] << ","
             << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] <<
             ",tensor_" << fNX << "_xcol);\n\n ";
      }
      // // BLAS
      // out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
      //     << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, tensor_" << fNX << "_xcol, &" << OpName
      //     << "_m,\n"; // use m if op_xcol is not transposed, otherwise k
      // out << SP << SP << SP << "tensor_" << fNX << "_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY
      //     << " + out_offset, &" << OpName << "_m);\n";
   } else {
      // case of grouped transposed convolution:
      // loop on the groups and repeat the operations (GEMM + col2im) for each group
      out << SP << SP << "for (size_t g = 0; g < " << fAttrGroup << "; g++) {\n";
      out << SP << SP << "size_t x_offset = n * " << fShapeX[1] * iHeight * iWidth << " + g * "
          << fShapeX[1] * iHeight * iWidth / fAttrGroup << ";\n ";
      out << SP << SP << "size_t out_offset = n * " << fShapeY[1] * oHeight * oWidth << " + g * "
          << fShapeY[1] * oHeight * oWidth / fAttrGroup << ";\n ";

      // do BLAS here (LM: probably need an offset for op_f, the kernels)
      out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
          << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, "
          << "tensor_" << fNX << " + x_offset, &" << OpName
          << "_m,\n"; // use m if op_xcol is not transposed, otherwise k
      out << SP << SP << SP << "tensor_" << fNX << "_f, &" << OpName << "_n, &" << OpName
          << "_beta, tensor_" << fNX << "_xcol, &" << OpName << "_m);\n";

      if (fDim < 3) {
         out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::col2im<float>(tensor_" << fNX << "_xcol,"
             //  channels, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
             //  dilation_w,
             << fShapeY[1] << "," << oHeight << "," << oWidth << ",";
         if (fDim == 1)
            out << "1, " << fAttrKernelShape[0] << ",0," << fAttrPads[0] << ",1," << fAttrStrides[0] << ",1,"
                << fAttrDilations[0];
         else // fDim == 2
            out << fAttrKernelShape[0] << "," << fAttrKernelShape[1] << "," << fAttrPads[0] << "," << fAttrPads[1]
                << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrDilations[0] << ","
                << fAttrDilations[1];
         out << ", tensor_" << fNY << " + out_offset);\n\n ";
      } else {
         // 3d im2col
         throw std::runtime_error("TMVA SOFIE 3D Conv Transpose not yet supported");

         out << SP << SP << "TMVA::Experimental::SOFIE::UTILITY::Im2col_3d<float>(tensor_" << fNX
             << " + x_offset,"
             //  channels, d, h, w, k_d, k_h, k_w, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w,
             //  dilation_d, dilation_h, dilation_w,
             //
             << fShapeX[1] << "," << oDepth << "," << oHeight << "," << oWidth << "," << fAttrKernelShape[0] << ","
             << fAttrKernelShape[1] << "," << fAttrKernelShape[2] << "," << fAttrPads[0] << "," << fAttrPads[1] << ","
             << fAttrPads[2] << "," << fAttrStrides[0] << "," << fAttrStrides[1] << "," << fAttrStrides[2] << ","
             << fAttrDilations[0] << "," << fAttrDilations[1] << "," << fAttrDilations[2] << "," << "tensor_" << fNX
             << "_xcol);\n\n ";
      }

      // // BLAS
      // // offset g must be  g * k * n
      // out << SP << SP << SP << "size_t offset_f = g * " << fShapeW[0] * fShapeW[1] * icstrideDil / fAttrGroup << ";\n";
      // out << SP << SP << "BLAS::sgemm_(&" << OpName << "_transA, &" << OpName << "_transB, &" << OpName << "_m, &"
      //     << OpName << "_n, &" << OpName << "_k, &" << OpName << "_alpha, tensor_" << fNX << "_xcol, &" << OpName
      //     << "_m,\n"; // use m if op_xcol is not transposed, otherwise k
      // out << SP << SP << SP << "tensor_" << fNX << "_f + offset_f, &" << OpName << "_k, &" << OpName << "_beta, tensor_" << fNY
      //     << " + out_offset"
      //     << ", &" << OpName << "_m);\n";

      out << SP << SP << "}\n"; // end of group loop
   }

   out << SP << "}\n"; // end of batch size loop

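   // add the (broadcast) bias in one call: Y += gamma * B via BLAS saxpy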
   if (fNBroadcastedB != "") {
      out << SP << "int " << OpName << "_size = " << fShapeY[0] * fShapeY[1] * oDepth * oHeight * oWidth << ";\n";
      out << SP << "float " << OpName << "_gamma = 1.0;\n";
      out << SP << "int " << OpName << "_incx = 1;\n";
      out << SP << "int " << OpName << "_incy = 1;\n";

      out << SP << "BLAS::saxpy_(&" << OpName << "_size, &" << OpName << "_gamma, tensor_" << fNBroadcastedB << ", &"
          << OpName << "_incx, tensor_" << fNY << ", &" << OpName << "_incy);\n";
   }

   return out.str();
}

} // namespace SOFIE
} // namespace Experimental
} // namespace TMVA

#endif