Back to home page

EIC code displayed by LXR

 
 

    


Warning, file /include/root/TMVA/SOFIE_common.hxx was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).

0001 #ifndef TMVA_SOFIE_SOFIE_COMMON
0002 #define TMVA_SOFIE_SOFIE_COMMON
0003 
0004 #include "TMVA/RTensor.hxx"
0005 
0006 #include "ROOT/RSpan.hxx"
0007 
0008 #include <stdexcept>
0009 #include <type_traits>
0010 #include <cstdint>
0011 #include <cstring>
0012 #include <complex>
0013 #include <string>
0014 #include <vector>
0015 #include <map>
0016 #include <memory>
0017 #include <regex>
0018 #include <sstream>
0019 #include <iostream>
0020 #include <iomanip>
0021 #include <cassert>
0022 #include <limits>
0023 
0024 namespace TMVA {
0025 namespace Experimental {
0026 namespace SOFIE {
0027 
// Element type of a SOFIE tensor. The numeric values are order sensitive:
// they appear to mirror the ONNX TensorProto::DataType enumeration, so do not
// renumber or reorder entries.
// NOTE(review): COMPLEX28 is presumably a typo for COMPLEX128 (the ONNX name);
// renaming would break existing users of this header — confirm before changing.
enum class ETensorType{
   UNDEFINED = 0, FLOAT = 1, UINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive
    FLOAT16 = 10, DOUBLE = 11, UINT32 = 12, UINT64 = 13, COMPLEX64 = 14, COMPLEX28 = 15, BFLOAT16 = 16
};
0032 
// Activation function kinds supported when fusing activations into operators.
enum class EActivationType{
   UNDEFINED = 0, RELU = 1, SOFTMAX = 2, SIGMOID = 3, LEAKYRELU = 4, TANH = 5, ELU = 6
};
0036 
0037 constexpr size_t GetTypeSize(ETensorType type) {
0038     switch (type) {
0039         case ETensorType::FLOAT:     return sizeof(float);
0040         case ETensorType::DOUBLE:    return sizeof(double);
0041         case ETensorType::UINT8:     return sizeof(uint8_t);
0042         case ETensorType::INT8:      return sizeof(int8_t);
0043         case ETensorType::UINT16:    return sizeof(uint16_t);
0044         case ETensorType::INT16:     return sizeof(int16_t);
0045         case ETensorType::INT32:     return sizeof(int32_t);
0046         case ETensorType::INT64:     return sizeof(int64_t);
0047         case ETensorType::UINT32:    return sizeof(uint32_t);
0048         case ETensorType::UINT64:    return sizeof(uint64_t);
0049         case ETensorType::BOOL:      return sizeof(bool);
0050         case ETensorType::STRING:    return sizeof(std::string);
0051         default: return 0;
0052     }
0053 }
0054 
0055 typedef std::int64_t int_t;
0056 
0057 std::string ConvertTypeToString(ETensorType type);
0058 ETensorType ConvertStringToType(std::string type);
0059 
0060 // find if a string represents a number
0061 bool IsInteger(const std::string & s);
0062 
// A single tensor dimension that is either a concrete number (`dim`) or a
// named/symbolic parameter (`param`, e.g. a dynamic batch size or an
// expression such as "d1+d2").
struct Dim{
   bool isParam = false;      // true when the dimension is symbolic (use `param`)
   size_t dim = 0;            // concrete value (or default/fallback for a symbolic dim)
   std::string param;         // symbolic name or expression; valid when isParam

    // default constructor (for I/O)
   Dim() {}

   // constructor for a parametric dimension with the option to pass a default dim value
   // We use -1 for dim to indicate that the param dimension is an expression (e.g. "d1+d2")
   // in case the string represents a number make Dim not parametric
   // NOTE(review): std::stoi returns int and may throw for out-of-range strings;
   // very large numeric dims would narrow/throw here — confirm inputs are small.
   Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p)
   {
      if (IsInteger(p)) {
            isParam = false;
            dim = std::stoi(p);
      }
   }

   // constructor for a non-parametric dimension
   Dim(size_t d) : dim(d) {}

   // String form: the parameter name/expression, or the numeric value.
   std::string GetVal() const {
      // cast to int64_t for negative shape values
      return (isParam) ? param : std::to_string(static_cast<int64_t>(dim));
   }

   // NOTE(review): member operator<< means usage is `d << os`, not `os << d`;
   // the free operator<< below provides the conventional form.
   std::ostream& operator<< (std::ostream& os) const {
      os << GetVal();
      return os;
   }

   // Equality: compares `param` only when BOTH sides are symbolic; a symbolic
   // and a concrete Dim are compared via their `dim` fields (fallback values).
   bool operator==(const Dim& rhs) const {
       return (isParam && rhs.isParam) ? param == rhs.param : dim == rhs.dim;
   }
   bool operator!=(const Dim& rhs) const {
       return !(*this == rhs);
   }
};
0102 
0103 //bool operator==(const Dim& lhs, const Dim& rhs);
0104 inline std::ostream & operator<< (std::ostream &os, const Dim &d) {
0105    os << d.GetVal();
0106    return os;
0107 }
0108 
// Type and (possibly symbolic) shape of a model input tensor.
struct InputTensorInfo{
   ETensorType type;
   std::vector<Dim> shape;
};
0113 
// Type and fully-known integer shape of a tensor.
struct TensorInfo{
   ETensorType type;
   std::vector<size_t> shape;
};
0118 
// Type and symbolic shape of a tensor with dynamic (runtime-determined) dimensions.
struct DynamicTensorInfo{
   ETensorType type;
   std::vector<Dim> shape;
};
0123 
// template traits for Tensor Shape:
// distinguishes at the type level whether a shape is symbolic (Dim) or
// concrete (size_t). Only these two specializations are provided; the
// primary template is intentionally empty.
template <typename T>
struct TensorShape {};
template<>
struct TensorShape<Dim> {
   static bool IsDim() { return true; }
};
template<>
struct TensorShape<size_t> {
   static bool IsDim() { return false; }
};
0135 
// template traits for Tensor type:
// maps a C++ element type to its name as used in generated code.
// Only the listed specializations are supported; the primary template is
// intentionally empty so unsupported types fail at compile time.
template <typename T>
struct TensorType {};
template<>
struct TensorType<float> {
   static const std::string Name() { return "float"; }
};
template<>
struct TensorType<double> {
   static const std::string Name() { return "double"; }
};
template<>
struct TensorType<int64_t> {
   static const std::string Name() { return "int64_t"; }
};
template<>
struct TensorType<int32_t> {
   static const std::string Name() { return "int32_t"; }
};
template<>
struct TensorType<uint32_t> {
   static const std::string Name() { return "uint32_t"; }
};
template<>
struct TensorType<uint64_t> {
   static const std::string Name() { return "uint64_t"; }
};
template<>
struct TensorType<bool> {
   static const std::string Name() { return "bool"; }
};
0167 
// Bookkeeping record for a named tensor chunk inside a memory pool.
// NOTE: tensor_name is a non-owning view; the referenced string must outlive
// this record.
struct TensorMemoryInfo {
   std::string_view tensor_name;
   size_t tensor_size;

   // Carve `new_size` bytes out of this chunk for a new tensor, shrinking
   // this record accordingly. Throws std::invalid_argument if more bytes are
   // requested than are available.
   TensorMemoryInfo split(const std::string_view new_name, size_t new_size) {
      if (new_size > tensor_size)
         throw std::invalid_argument("New size exceeds available tensor size.");
      tensor_size -= new_size;
      return {new_name, new_size};
   }

   // Absorb another record's byte count into this one (name is unchanged).
   void merge(const TensorMemoryInfo& other) { tensor_size += other.tensor_size; }
};
0185 
// Book-keeping for a pool of tensor memory chunks, keyed by chunk index.
struct MemoryPoolInfo {

   // ordered map with chunk_idx as key and TensorMemoryInfo as value
   std::map<size_t, TensorMemoryInfo> total_stack;

   // ordered map with chunk_idx as key and chunk_size as value
   std::map<size_t, size_t> available_stack;
};
0194 
// Shape conversion helpers (implemented in the .cxx file).

// Wrap each integer dimension in a (non-parametric) Dim.
std::vector<Dim> ConvertShapeToDim(const std::vector<size_t> & shape);

// Convert a symbolic shape to integers; only valid when no dimension is parametric.
std::vector<size_t> ConvertShapeToInt(const std::vector<Dim> & shape);

// Total number of elements (product of the dimensions).
std::size_t ConvertShapeToLength(const std::vector<size_t> & shape);

// Human-readable "{ d1 , d2 , ... }"-style forms of a shape.
std::string ConvertShapeToString(const std::vector<size_t> & shape);
std::string ConvertDimShapeToString(const std::vector<Dim> & shape);
std::string ConvertShapeToString(const std::vector<Dim> & shape);



// Length of a possibly-symbolic shape as a C++ expression string (e.g. "4*batch").
std::string ConvertDimShapeToLength(const std::vector<Dim> & shape);
std::string ConvertDynamicShapeToLength(const std::vector<Dim> & shape);
0209 
0210 
// Convert a single value to its string representation.
// Floating-point values are printed with max_digits10 precision so the text
// round-trips back to the exact same value; other types use default formatting.
template<class T>
std::string ConvertValToString(T value) {
   std::stringstream ret;
   // if constexpr: the trait is a compile-time constant, so drop the dead
   // branch entirely for non-floating-point instantiations (C++17 idiom)
   if constexpr (std::is_floating_point_v<T>)
      ret << std::setprecision(std::numeric_limits<T>::max_digits10);
   ret << value;
   return ret.str();
}
0219 
0220 
// convert list of values in a string taking into account the precision:
// produces "{ v1, v2, ... }"; floating-point values are printed with
// max_digits10 precision so they round-trip exactly.
template<class T>
std::string ConvertValuesToString(size_t n, const T * data) {
   std::stringstream ret;
   // set the stream precision once, before the loop (it is sticky),
   // and use if constexpr since the trait is a compile-time constant
   if constexpr (std::is_floating_point_v<T>)
      ret << std::setprecision(std::numeric_limits<T>::max_digits10);
   ret << "{ ";
   for (size_t i = 0; i < n; i++) {
      ret << data[i];
      if (i < n-1) ret << ", ";
   }
   ret << "}";
   return ret.str();
}
// Convenience overload taking a vector.
template<class T>
std::string ConvertValuesToString(const std::vector<T> & data) {
  return ConvertValuesToString(data.size(), data.data());
}
0239 
// Holds the data of an initialized (weight or constant) tensor, together with
// the machinery to move it between a transient shared_ptr buffer (used at
// runtime) and a flat char array (used for ROOT I/O persistence).
class InitializedTensor {
public:
   InitializedTensor() = default;
   // type/shape/data constructor; typeConstant marks tensors produced by a
   // Constant operator (these are not written to the weight file)
   InitializedTensor(ETensorType type, std::span<std::size_t> shape, std::shared_ptr<void> data, bool typeConstant = false)
      : fConstant(typeConstant), fType{type}, fShape{shape.begin(), shape.end()}, fData{data}
   {
   }

   ETensorType const &type() const { return fType; }
   std::vector<std::size_t> const &shape() const { return fShape; }
   std::shared_ptr<void> const &sharedptr() const { return fData; }
   // query if tensor comes from a Constant operator
   bool IsConstantTensor() const { return fConstant;}
   // query if tensor needs to be written in a weight file. Constant tensors are not written in a file
   bool IsWeightTensor() const { return !fConstant && !fIsNotWritable;}
   // set not writable initialized tensors - i.e. tensor that must not be written in a file
   void SetNotWritable() { fIsNotWritable = true;}

   // Typed view of the underlying buffer; caller must request the T matching fType.
   template <class T = void>
   T const *data() const
   {
      return static_cast<T const *>(fData.get());
   }

   // Prepare for writing to disk: compute the byte size and alias the
   // persistent pointer to the shared buffer (no copy is made here).
   void CastSharedToPersistent()
   {
      // We only calculate fSize here, because it is only used for IO to know
      // the size of the persistent data.
      fSize = 1;
      for (std::size_t item : fShape) {
         fSize *= static_cast<int>(item);
      }
      switch (fType) {
      case ETensorType::FLOAT: fSize *= sizeof(float); break;
      case ETensorType::DOUBLE: fSize *= sizeof(double); break;
      case ETensorType::INT32: fSize *= sizeof(int32_t); break;
      case ETensorType::INT64: fSize *= sizeof(int64_t); break;
      case ETensorType::BOOL: fSize *= sizeof(bool); break;
      default:
         throw std::runtime_error("TMVA::SOFIE doesn't yet supports serialising data-type " +
                                  ConvertTypeToString(fType));
      }
      fPersistentData = static_cast<char *>(fData.get());
   }
   // After reading from disk: copy the persistent buffer into a freshly
   // allocated shared buffer and release the persistent one.
   void CastPersistentToShared()
   {
      // If there is no persistent data, do nothing
      if (fSize == 0 || fPersistentData == nullptr) {
         return;
      }

      // Nothing to be done if the pointed-to data is the same
      if (fPersistentData == static_cast<char *>(fData.get())) {
         return;
      }

      // Initialize the shared_ptr
      fData = std::shared_ptr<void>{malloc(fSize), free};
      std::memcpy(fData.get(), fPersistentData, fSize);

      // Make sure the data read from disk doesn't leak and delete the
      // persistent data
      // NOTE(review): delete[] assumes the I/O framework allocated
      // fPersistentData with new[] (the shared buffer itself is malloc/free
      // owned) — verify this matches the ROOT I/O allocation.
      delete[] fPersistentData;
      fPersistentData = nullptr;
      fSize = 0;
   }

private:
   bool  fConstant = false;      ///< Flag specifying if tensor is a Constant one (coming from a Constant operator)
   bool  fIsNotWritable = false; ///< Flag to indicate that tensor values do not need to be written as weight or generated code
   ETensorType fType;               ///< Encodes the type of the data
   std::vector<std::size_t> fShape; ///< The shape of the data in terms of elements in each dimension
   std::shared_ptr<void> fData;     ///<! Transient shared data
   int fSize = 0;                   ///< The size of the persistent data in bytes (not number of elements!)
   char *fPersistentData = nullptr; ///<[fSize] Persistent version of the data
};
0316 
0317 template <typename T>
0318 ETensorType GetTemplatedType(T /*obj*/ ){
0319    if (std::is_same<T, float>::value) return ETensorType::FLOAT;
0320    if (std::is_same<T, uint8_t>::value) return ETensorType::UINT8;
0321    if (std::is_same<T, int8_t>::value) return ETensorType::INT8;
0322    if (std::is_same<T, uint16_t>::value) return ETensorType::UINT16;
0323    if (std::is_same<T, int16_t>::value) return ETensorType::INT16;
0324    if (std::is_same<T, int32_t>::value) return ETensorType::INT32;
0325    if (std::is_same<T, int64_t>::value) return ETensorType::INT64;
0326    if (std::is_same<T, std::string>::value) return ETensorType::STRING;
0327    if (std::is_same<T, bool>::value) return ETensorType::BOOL;
0328    //float16 unimplemented
0329    if (std::is_same<T, double>::value) return ETensorType::DOUBLE;
0330    if (std::is_same<T, uint32_t>::value) return ETensorType::UINT32;
0331    if (std::is_same<T, uint64_t>::value) return ETensorType::UINT64;
0332    //complex 64, 28, bfloat 16 unimplemented
0333 }
0334 
0335 namespace UTILITY{
0336 
0337 
0338 
// clean operator and tensor names so they are valid C++ identifiers in generated code
std::string Clean_name(std::string input_tensor_name);

// Check if two shapes are equal
bool AreSameShape(const std::vector<size_t>&, const std::vector<size_t>&);
bool AreSameShape(const std::vector<size_t>&, const std::vector<Dim>&);
bool AreSameShape(const std::vector<Dim>&, const std::vector<Dim>&);


// Multidirectional broadcast a list of tensors to the same shape
std::vector<size_t> MultidirectionalBroadcastShape(std::vector<std::vector<size_t>>);

// Multidirectional broadcast two shapes to the same shape
// (the pair overloads also return an int status code)

std::pair<int, std::vector<size_t>> MultidirectionalBroadcastShape(std::vector<size_t> &, std::vector<size_t> &);
std::vector<size_t> UnidirectionalBroadcastShape(std::vector<size_t> &, std::vector<size_t> &);

std::pair<int, std::vector<Dim>> MultidirectionalBroadcastShape(std::vector<Dim> &, std::vector<Dim> &);
0357 
0358 
0359 
0360 template<typename T>
0361 T* BroadcastConvBias(const T* data, const size_t channel, const std::vector<size_t>& targetShape) {
0362    size_t size = targetShape.size();
0363    if (targetShape[1] != channel) {
0364       std::stringstream ss;
0365       ss << "TMVA::SOFIE - Error broadcasting Conv Bias of shape {";
0366       ss << std::to_string(channel);
0367       ss << "} to ";
0368       ss << ConvertShapeToString(targetShape);
0369       throw
0370          std::runtime_error(ss.str());
0371    }
0372 
0373    size_t targetLength = ConvertShapeToLength(targetShape);
0374    T* newData = new T[targetLength];
0375 
0376    if (targetLength == channel) {
0377       std::copy(data, data + channel, newData);
0378       return newData;
0379    }
0380 
0381    // cStride = OutDepth * outHeight * outWidth
0382    size_t cStride = 1;
0383    for (size_t i = 2; i < size; i++)
0384       cStride *= targetShape[i];
0385    // Broadcast each element of the bias to a vector of size cStride and concatenate them
0386    // into a vector of size channel * cStride
0387    for (size_t i = 0; i < channel; i++) {
0388       std::fill(newData + i * cStride, newData + (i + 1) * cStride, data[i]);
0389    }
0390    // Broadcast newData[0...channel * cStride) to newData[0...batch * channel * cStride)
0391    size_t batch = targetShape[0];
0392    size_t bStride = channel * cStride;
0393    for (size_t i = 1; i < batch; i++) {
0394       std::copy(newData, newData + bStride, newData + i * bStride);
0395    }
0396    return newData;
0397 }
0398 
// Broadcast a tensor from shape to targetShape according to numpy broadcasting rules
// See more at https://numpy.org/doc/stable/user/basics.broadcasting.html
// and https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md .
// Preconditions: shape and targetShape have the same rank (callers pad with 1s
// first) and broadcastedData already has ConvertShapeToLength(targetShape)
// elements. The result is written into broadcastedData in place.
template<typename T, class ConstContT = std::span<const T>, class ContT = std::span<T> >
void BroadcastTensor(ConstContT data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, ContT broadcastedData) {
   // Size of the shapes (tensor input here have shapes with same sizes, we have already added the needed ones )
   size_t size = shape.size();
   // Current length of the broadcasted tensor
   size_t curLength = data.size();
   size_t targetLength = broadcastedData.size();
   assert(ConvertShapeToLength(targetShape) == targetLength);
   // special case when broadcasting last dimensions (initial shapes must be the same)
   // e.g. {N, C, 1, 1} -> {N, C, H, W}: each input element is repeated into a
   // contiguous run of bsize elements
   if (size > 1 && shape.front() == targetShape.front() && shape.back() == 1) {
      size_t bsize = targetShape.back();
      // compute the size of the data to broadcast
      for (int k = int(size)-2; k >=0; k--) {
         if (shape[k] != 1) break;
         bsize *= targetShape[k];
      }
      for (size_t i = 0; i < curLength; i++) {
         std::fill(broadcastedData.begin() + i*bsize, broadcastedData.begin() + (i+1)*bsize , data[i]);
      }
      return;
   }

   // general case: start from the input data and expand one axis at a time
   std::copy(data.begin(), data.end(), broadcastedData.begin());
   // Product of the previous dimensions of targetShape
   size_t arrayNum = 1;
   // Scratch buffer for the expansion of each axis
   // NOTE(review): allocated even when no axis needs broadcasting — could be
   // deferred, but kept as-is to preserve behaviour.
   std::vector<T> newData(targetLength);

   for (size_t idx = 0; idx < size; idx++) {
      size_t dim = shape[idx];
      size_t targetDim = targetShape[idx];
      if (dim == 1 && targetDim > 1) {
         // Set the new length of the data
         size_t newLength = curLength * targetDim;
         // View the data as a list of arrayNum arrays of size arrayLength
         size_t arrayLength = curLength / arrayNum;
         // Broadcast each array dim times
         if (arrayLength > 1) {
            // If each array has at least two elements
            for (size_t arrayIdx = 0; arrayIdx < arrayNum; arrayIdx++) {
               for (size_t targetIdx = 0; targetIdx < targetDim; targetIdx++) {
                  size_t offset = arrayIdx * arrayLength * targetDim + targetIdx * arrayLength;
                  std::copy(broadcastedData.begin() + arrayIdx * arrayLength,
                     broadcastedData.begin() + (arrayIdx + 1) * arrayLength,
                     newData.begin() + offset);
               }
            }
         } else {
            // If each array has one element
            for (size_t arrayIdx = 0; arrayIdx < arrayNum; arrayIdx++) {
               std::fill(newData.begin() + arrayIdx * targetDim,
                  newData.begin() + (arrayIdx + 1) * targetDim, broadcastedData[arrayIdx]);
            }
         }
         // Update current length
         curLength = newLength;
         // Update broadcasted data
         std::copy(newData.begin(), newData.begin() + newLength, broadcastedData.begin());
      }
      // Update the number of arrays
      arrayNum *= targetDim;
   }
   //return broadcastedData;
}
0466 
0467 // interface where we allocate a new array for broadcasted data
0468 template<typename T>
0469 T* CreateBroadcastTensor(const T* data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, size_t targetLength) {
0470    // newShape is an array of size equal to dimension along which we are broadcasting the tensor
0471    T* broadcastedData = new T[targetLength];
0472    std::span<T> bData(broadcastedData, broadcastedData+targetLength);
0473    size_t curLength = ConvertShapeToLength(shape);
0474    std::span<const T> inData(data, curLength);
0475    BroadcastTensor<T, std::span<const T>, std::span<T>>(inData, shape, targetShape, bData);
0476    return broadcastedData;
0477 }
0478 // Unidirectional broadcasting shape to targetShape// In unidirectional broadcast - only tensor B can have the shape changed not
0479 // tensor A - otherwise is a multidirectional broadcast
0480 template<typename T>
0481 T* UnidirectionalBroadcast(const T* data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape) {
0482    // Prepend shape with ones
0483    if (shape.size() < targetShape.size()) {
0484       size_t targetSize = targetShape.size();
0485       std::vector<size_t> newShape(targetSize, 1);
0486       size_t offset = targetSize - shape.size();
0487       std::copy(shape.begin(), shape.end(), newShape.begin() + offset);
0488       return CreateBroadcastTensor<T>(data, newShape, targetShape, ConvertShapeToLength(targetShape));
0489    }
0490    return CreateBroadcastTensor<T>(data, shape, targetShape, ConvertShapeToLength(targetShape));
0491 }
0492 
0493 // Unidirectional broadcasting shape to targetShape using a passed vector to avoid allocations
0494 template<typename T>
0495 void UnidirectionalBroadcast(const T* data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, std::span<T> broadcastedData) {
0496    size_t curLength = ConvertShapeToLength(shape);
0497    std::span<T> inData(const_cast<T*>(data), curLength);
0498    // Prepend shape with ones
0499    if (shape.size() < targetShape.size()) {
0500       size_t targetSize = targetShape.size();
0501       std::vector<size_t> newShape(targetSize, 1);
0502       size_t offset = targetSize - shape.size();
0503       std::copy(shape.begin(), shape.end(), newShape.begin() + offset);
0504       BroadcastTensor<T>(inData, newShape, targetShape, broadcastedData);
0505    }
0506    BroadcastTensor<T, std::span<T>>(inData, shape, targetShape, broadcastedData);
0507 }
0508 
/// compute stride of a tensor given its shape (assume layout is row-major)
std::vector<size_t> ComputeStrideFromShape(const std::vector<size_t> & shape);
std::vector<Dim> ComputeStrideFromShape(const std::vector<Dim> & shape);
0512 
/// Check 0 <= a && a < b using a single comparison: casting to unsigned maps
/// any negative `a` onto a huge value, so one `<` test covers both bounds.
inline bool is_a_ge_zero_and_a_lt_b(int a, int b) {
   const auto ua = static_cast<unsigned>(a);
   const auto ub = static_cast<unsigned>(b);
   return ua < ub;
}
0518 
0519 
/// im2col : efficient function to re-arrange input data of convolution to a matrix
/// that can be used by BLAS
/// Use trick to loop on each element of filtered region first and follow input data layout
/// By doing this reads and writes are of consecutive data in memory and one gains in efficiency
/// The resulting matrix will be already transposed and can be used directly in BLAS
/// since output will be a matrix : (channels*kernel_h*kernel_w , output_h*output_w)
/// Example: with an input matrix
///    a1 a2 a3
///    b1 b2 b3    and a 2x2 kernel    (k1,k2,k3,k4) and padding 1 :
///    c1 c2 c3
///     output will be a matrix (4 x 16)
///  the routine will follow output order :
//     first all elements which will be operated by k1 then k2 then k3
///  -> ( 0  0  0  0  0  a1 a2 a3 0  b1 b2 b3  0 c1 c2 c3  )    all elements for k1
///     ( 0  0  0  0  a1 a2 a3  0 b1 b2 b3  0 c1 c2 c3  0  )     for k2
///     ( 0  a1 a2 a3 0  b1 b2 b3 0  c1 c2 c3  0  0  0  0  )     for k3
///     ( a1 a2 a3 0  b1 b2 b3  0 c1 c2 c3  0  0  0  0  0  )     for k4
///
/// data_im:  input image data, channels x height x width (row-major)
/// data_col: output buffer; must hold
///           (channels*kernel_h*kernel_w) * (output_h*output_w) elements

template <typename T>
void Im2col(const T *data_im, const int channels, const int height, const int width, const int kernel_h,
                const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w,
                const int dilation_h, const int dilation_w, T *data_col)
{
   // standard convolution output size formula
   const int output_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
   const int output_w = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
   const int channel_size = height * width;
   for (int channel = channels; channel--; data_im += channel_size) {
      for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
         for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
            int input_row = -pad_h + kernel_row * dilation_h;
            for (int output_rows = output_h; output_rows; output_rows--) {
               if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
                  // entire row falls in the padding region -> emit zeros
                  for (int output_cols = output_w; output_cols; output_cols--) {
                     *(data_col++) = 0;
                  }
               } else {
                  int input_col = -pad_w + kernel_col * dilation_w;
                  for (int output_col = output_w; output_col; output_col--) {
                     if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
                        *(data_col++) = data_im[input_row * width + input_col];
                     } else {
                        // column in padding region -> zero
                        *(data_col++) = 0;
                     }
                     input_col += stride_w;
                  }
               }
               input_row += stride_h;
            }
         }
      }
   }
}
0573 
/// 3d implementation of im2col: same re-arrangement as Im2col but for
/// volumetric (3D) convolutions. Input layout is assumed to be
/// channels x depth x height x width (row-major); output buffer must hold
/// (channels*kernel_d*kernel_h*kernel_w) * (output_d*output_h*output_w) elements.
template <typename T>
void Im2col_3d(const T *data_im, const int channels,
            const int depth, const int height, const int width,
            const int kernel_d, const int kernel_h, const int kernel_w,
            const int pad_d, const int pad_h, const int pad_w,
            const int stride_d, const int stride_h, const int stride_w,
            const int dilation_d, const int dilation_h,  const int dilation_w, T *data_col)
{
   // standard convolution output size formula, per spatial axis
   const int output_h = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
   const int output_w = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
   const int output_d = (depth + 2 * pad_d - (dilation_d * (kernel_d - 1) + 1)) / stride_d + 1;
   const int channel_size = height * width * depth;
   // assume data are c x d x h x w
   for (int channel = channels; channel--; data_im += channel_size) {
      for (int kernel_depth = 0; kernel_depth < kernel_d; kernel_depth++) {
         for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
            for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
               int input_dep = -pad_d + kernel_depth * dilation_d;
               for (int output_dep = output_d; output_dep; output_dep--) {
                  if (!is_a_ge_zero_and_a_lt_b(input_dep, depth)) {
                     // entire depth slice in padding region -> zeros
                     for (int output_rows = output_h; output_rows; output_rows--) {
                        for (int output_cols = output_w; output_cols; output_cols--) {
                           *(data_col++) = 0;
                        }
                     }
                  } else {
                     int input_row = -pad_h + kernel_row * dilation_h;
                     for (int output_rows = output_h; output_rows; output_rows--) {
                        if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
                           // row in padding region -> zeros
                           for (int output_cols = output_w; output_cols; output_cols--) {
                              *(data_col++) = 0;
                           }
                        } else {
                           int input_col = -pad_w + kernel_col * dilation_w;
                           for (int output_col = output_w; output_col; output_col--) {
                              if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
                                 *(data_col++) = data_im[input_dep * width * height + input_row * width + input_col];
                              } else {
                                 *(data_col++) = 0;
                              }
                              input_col += stride_w;
                           }
                        }
                        input_row += stride_h;
                     }
                  }
                  input_dep += stride_d;
               }
            }
         }
      }
   }
}
0628 
// col2im: inverse of Im2col — scatter-add the column matrix back into an
// image of shape channels x height x width. Overlapping kernel positions are
// accumulated (+=), which is what convolution backward/transposed conv needs.
template <typename Dtype>
void col2im(const Dtype* data_col, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w,
    const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    Dtype* data_im) {
   // note that output data_im needs to be set to zero value!!!!
   std::fill(data_im, data_im + height * width * channels, 0.);
  //caffe_set(height * width * channels, Dtype(0), data_im);
  // data_im must be a zero vector
  //const Dtype * data_col_0 = data_col;
  const int output_h = (height + 2 * pad_h -
    (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
  const int output_w = (width + 2 * pad_w -
    (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
  const int channel_size = height * width;
  for (int channel = channels; channel--; data_im += channel_size) {
    for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
      for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
        int input_row = -pad_h + kernel_row * dilation_h;
        for (int output_rows = output_h; output_rows; output_rows--) {
          if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
            // whole row is padding: skip the corresponding column entries
            data_col += output_w;
          } else {
            int input_col = -pad_w + kernel_col * dilation_w;
            for (int output_col = output_w; output_col; output_col--) {
              if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
                //assert(input_row*width+input_col < height * width * channels);
                //assert(data_col - data_col_0 < output_h*output_w*channels);
               //  std::cout << "COL2IM: input_row" << "  " << input_row << "  " << input_col
               //       << " <---- " << data_col - data_col_0 << " values:  "
               //       << data_im[input_row * width + input_col] << " <--- " << *data_col << std::endl;
                data_im[input_row * width + input_col] += *data_col;
              }
              data_col++;
              input_col += stride_w;
            }
          }
          input_row += stride_h;
        }
      }
    }
  }
  //std::cout << "finishing col2imp" << std::endl;
}
0675 
// Used at the end of infer() to fill the return object.
// Copies n elements from arr into out, resizing out to exactly n elements.
template <class T>
void FillOutput(T const *arr, std::vector<T> &out, std::size_t n)
{
   // vector::assign replaces the manual resize + element-by-element loop
   out.assign(arr, arr + n);
}
0685 
0686 }  // end namespace UTILITY
0687 
// Declaration of the external (Fortran-linkage) BLAS single-precision GEMM
// routine used by generated inference code; resolved at link time.
namespace BLAS{
extern "C" void sgemm_(const char * transa, const char * transb, const int * m, const int * n, const int * k,
                       const float * alpha, const float * A, const int * lda, const float * B, const int * ldb,
                       const float * beta, float * C, const int * ldc);
}//BLAS
0693 
0694 
// Container for the tensors describing one graph in a Graph Neural Network.
struct GNN_Data {
      RTensor<float> node_data;      // the node feature data, tensor with shape (num_nodes, num_node_features)
      RTensor<float> edge_data;      // the edge feature data, tensor with shape (num_edges, num_edge_features)
      RTensor<float> global_data;    // the global features, tensor with shape (1, num_global_features)
      RTensor<int> edge_index;       // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges)
                                     // edge_index[0,:] are the receivers and edge_index[1,:] are the senders


      // need to have a default constructor since RTensor does not have one
      GNN_Data(): node_data(RTensor<float>({})), edge_data(RTensor<float>({})), global_data(RTensor<float>({})), edge_index(RTensor<int>({})) {}

};
0707 
// Concatenate two RTensors along `axis` (default 0). All dimensions except
// `axis` must agree; throws std::runtime_error on layout mismatch,
// incompatible shapes, or (currently unsupported) column-major layout.
// Row-major only: data is copied block-wise using the strides at axis-1.
template<typename T>
TMVA::Experimental::RTensor<T> Concatenate( TMVA::Experimental::RTensor<T> & t1,  TMVA::Experimental::RTensor<T> & t2, int axis = 0)
{
   // concatenate tensor along axis. Shape must be the same except in the dimension of the concatenated axis
   if (t1.GetMemoryLayout() != t2.GetMemoryLayout())
      throw std::runtime_error("TMVA RTensor Concatenate - tensors have different memory layout");
   auto & shape1 = t1.GetShape();
   auto & shape2 = t2.GetShape();
   // sizes divided by the concat dimension must match (i.e. all other dims agree)
   if (t1.GetSize()/shape1[axis] != t2.GetSize()/shape2[axis]) {
      std::cout << "axis " << axis << " sizes " << t1.GetSize() << " " << t2.GetSize() << "  ";
      std::cout << "shape 1 : " << ConvertShapeToString(t1.GetShape());
      std::cout << " shape 2 : " << ConvertShapeToString(t2.GetShape()) << std::endl;
      throw std::runtime_error("TMVA RTensor Concatenate - tensors have incompatible shapes");
   }
   std::vector<size_t> outShape = shape1;
   outShape[axis] = shape1[axis] + shape2[axis];
   TMVA::Experimental::RTensor<T> tout(outShape, t1.GetMemoryLayout());
   if (t1.GetMemoryLayout() == TMVA::Experimental::MemoryLayout::ColumnMajor) {
      throw std::runtime_error("TMVA RTensor Concatenate is not yet supported for column major tensors");
   }

   auto & stride1 = t1.GetStrides();
   auto & stride2 = t2.GetStrides();
   auto & outStride = tout.GetStrides();

   size_t s1 = (axis > 0) ? stride1[axis-1] : t1.GetSize();  // block size to copy from first tensor
   size_t s2 = (axis > 0) ? stride2[axis-1] : t2.GetSize();  // block size to copy from second tensor
   size_t sout = (axis > 0) ? outStride[axis-1] : tout.GetSize();
   // number of blocks (product of the dimensions before `axis`); the shape
   // check above guarantees t2 has the same block count
   size_t nb = t1.GetSize()/s1;
   for (size_t i = 0; i < nb; i++) {
      // each output block is the t1 block followed by the t2 block
      std::copy(t1.GetData() + i*s1, t1.GetData() + (i+1)*s1, tout.GetData() + i * sout );
      std::copy(t2.GetData() + i*s2, t2.GetData() + (i+1)*s2, tout.GetData() + i * sout + s1 );
   }

   return tout;
}
0744 
0745 
// Concatenate two GNN_Data objects feature-wise: node and edge tensors are
// concatenated along `axis`, while the global features use `axis-1` since
// they carry an extra leading dimension of size 1.
// NOTE(review): for axis == 0 this passes -1 to Concatenate, which would
// index the shape vector out of range — presumably callers always use
// axis == 1 (the feature axis) here; confirm.
inline GNN_Data Concatenate(GNN_Data & data1, GNN_Data & data2, int axis = 0) {
   GNN_Data out;
   out.node_data = Concatenate(data1.node_data,data2.node_data, axis);
   out.edge_data = Concatenate(data1.edge_data,data2.edge_data, axis);
   out.global_data = Concatenate<float>(data1.global_data,data2.global_data, axis-1);
   // assume sender/receivers of data1 and data2 are the same
   out.edge_index = data1.edge_index.Copy();
   return out;
}
0755 
0756 inline GNN_Data Copy(const GNN_Data & data) {
0757    GNN_Data out;
0758    out.node_data = RTensor<float>(data.node_data.GetShape());
0759    out.edge_data = RTensor<float>(data.edge_data.GetShape());
0760    out.global_data = RTensor<float>(data.global_data.GetShape());
0761    out.edge_index = RTensor<int>(data.edge_index.GetShape());
0762    std::copy(data.node_data.GetData(), data.node_data.GetData()+ data.node_data.GetSize(), out.node_data.GetData());
0763    std::copy(data.edge_data.GetData(), data.edge_data.GetData()+ data.edge_data.GetSize(), out.edge_data.GetData());
0764    std::copy(data.global_data.GetData(), data.global_data.GetData()+ data.global_data.GetSize(), out.global_data.GetData());
0765    std::copy(data.edge_index.GetData(), data.edge_index.GetData()+ data.edge_index.GetSize(), out.edge_index.GetData());
0766    return out;
0767 }
0768 
// Convenience wrapper around BLAS sgemm_ computing
//    output = alpha * op(A) * op(B) + beta * output
// with op(X) = X or X^T selected by transa/transb, where op(A) is m x k,
// op(B) is k x n and output is m x n in the column-major convention
// required by the Fortran BLAS interface.
// If C is non-null, its m*n values are first copied into output so they
// become the additive term scaled by beta; output must have room for
// m*n floats in all cases.
inline void Gemm_Call(float *output, bool transa, bool transb, int m, int n, int k, float alpha, const float *A,
                      const float *B, float beta, const float *C)
{
   char ct = 't';
   char cn = 'n';
   // leading dimensions = number of rows of each matrix as stored
   // (k x m when transposed, m x k otherwise; similarly for B)
   const int *lda = transa ? &k : &m;
   const int *ldb = transb ? &n : &k;
   const int *ldc = &m;
   if (C != nullptr) {
      // seed output with C so sgemm_'s "beta * C" term uses it
      std::copy(C, C + m * n, output);
   }
   TMVA::Experimental::SOFIE::BLAS::sgemm_(transa ? &ct : &cn, transb ? &ct : &cn, &m, &n, &k, &alpha, A, lda, B, ldb,
                                           &beta, output, ldc);
}
0783 
// Read a named tensor from a text stream (used to load tensor data saved
// by SOFIE-generated code). Expected format: "<name> <length> <v0> <v1> ...".
// `target` must support operator[] for indices [0, expectedLength).
// Throws std::runtime_error if the header cannot be parsed, if the name or
// length does not match the expected values, or if any value fails to parse.
template <class T>
void ReadTensorFromStream(std::istream &is, T &target, std::string const &expectedName, std::size_t expectedLength)
{
   std::string name;
   std::size_t length = 0;
   is >> name >> length;
   // detect a truncated/corrupt header up front, instead of reporting a
   // misleading name mismatch when extraction itself failed
   if (is.fail()) {
      throw std::runtime_error("TMVA-SOFIE failed to read the header for tensor " + expectedName);
   }
   if (name != expectedName) {
      std::string err_msg =
         "TMVA-SOFIE failed to read the correct tensor name; expected name is " + expectedName + " , read " + name;
      throw std::runtime_error(err_msg);
   }
   if (length != expectedLength) {
      std::string err_msg = "TMVA-SOFIE failed to read the correct tensor size; expected size is " +
                            std::to_string(expectedLength) + " , read " + std::to_string(length);
      throw std::runtime_error(err_msg);
   }
   for (std::size_t i = 0; i < length; ++i) {
      is >> target[i];
   }
   if (is.fail()) {
      throw std::runtime_error("TMVA-SOFIE failed to read the values for tensor " + expectedName);
   }
}
0807 
0808 } // namespace SOFIE
0809 } // namespace Experimental
0810 } // namespace TMVA
0811 
0812 #endif //TMVA_SOFIE_SOFIE_COMMON