#include "ActsPlugins/Gnn/TensorRTEdgeClassifier.hpp"

#include "ActsPlugins/Gnn/detail/CudaUtils.hpp"

#include <cassert>
#include <chrono>
#include <filesystem>
#include <fstream>
#include <stdexcept>
#include <vector>

#include <NvInfer.h>
#include <NvInferPlugin.h>
#include <NvInferRuntimeBase.h>
#include <cuda_runtime.h>

#include "printCudaMemInfo.hpp"

using namespace Acts;

namespace {
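// Adapter that forwards TensorRT's internal log messages to the ACTS
// logging framework, mapping TensorRT severities onto ACTS log levels.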
class TensorRTLogger : public nvinfer1::ILogger {
  std::unique_ptr<const Logger> m_logger;

 public:
  TensorRTLogger(Logging::Level lvl)
      : m_logger(getDefaultLogger("TensorRT", lvl)) {}

  void log(Severity severity, const char *msg) noexcept override {
    const auto &logger = *m_logger;
    switch (severity) {
      case Severity::kVERBOSE:
        ACTS_DEBUG(msg);
        break;
      case Severity::kINFO:
        ACTS_INFO(msg);
        break;
      case Severity::kWARNING:
        ACTS_WARNING(msg);
        break;
      case Severity::kERROR:
        ACTS_ERROR(msg);
        break;
      case Severity::kINTERNAL_ERROR:
        ACTS_FATAL(msg);
        break;
    }
  }
};

}  // namespace

namespace ActsPlugins {

TensorRTEdgeClassifier::TensorRTEdgeClassifier(
    const Config &cfg, std::unique_ptr<const Logger> _logger)
    : m_logger(std::move(_logger)),
      m_cfg(cfg),
      m_trtLogger(std::make_unique<TensorRTLogger>(m_logger->level())) {
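  // Register the standard TensorRT plugins; an engine that contains plugin
  // layers cannot be deserialized without this.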
  auto status = initLibNvInferPlugins(m_trtLogger.get(), "");
  if (!status) {
    throw std::runtime_error("Failed to initialize TensorRT plugins");
  }

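  // Read the serialized engine file from disk into a host buffer.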
  std::size_t fsize =
      std::filesystem::file_size(std::filesystem::path(m_cfg.modelPath));
  std::vector<char> engineData(fsize);

  ACTS_DEBUG("Load '" << m_cfg.modelPath << "' with size " << fsize);

  std::ifstream engineFile(m_cfg.modelPath, std::ios::binary);
  if (!engineFile) {
    throw std::runtime_error("Failed to open engine file");
  } else if (!engineFile.read(engineData.data(), fsize)) {
    throw std::runtime_error("Failed to read engine file");
  }

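  // Deserialize the engine with a runtime that reports through the ACTS
  // logger.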
  m_runtime.reset(nvinfer1::createInferRuntime(*m_trtLogger));
  if (!m_runtime) {
    throw std::runtime_error("Failed to create TensorRT runtime");
  }

  m_engine.reset(m_runtime->deserializeCudaEngine(engineData.data(), fsize));
  if (!m_engine) {
    throw std::runtime_error("Failed to deserialize CUDA engine");
  }

  ACTS_INFO("Device memory required by TRT context: "
            << m_engine->getDeviceMemorySizeV2() * 1e-9 << " GB");

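  // Create a pool of execution contexts so several events can run inference
  // on the same engine concurrently.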
  for (auto i = 0ul; i < m_cfg.numExecutionContexts; ++i) {
    ACTS_DEBUG("Create execution context " << i);
    m_contexts.emplace_back(m_engine->createExecutionContext());
    if (!m_contexts.back()) {
      throw std::runtime_error("Failed to create execution context");
    }
  }

  std::size_t freeMem{}, totalMem{};
  ACTS_CUDA_CHECK(cudaMemGetInfo(&freeMem, &totalMem));
  ACTS_DEBUG("Used CUDA memory after TensorRT initialization: "
             << (totalMem - freeMem) * 1e-9 << " / " << totalMem * 1e-9
             << " GB");

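  // m_count acts as a counting semaphore: callers block once all execution
  // contexts are in use.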
  m_count.emplace(m_contexts.size());

  if (m_engine->getNbOptimizationProfiles() > 1) {
    ACTS_WARNING("Cannot handle more than one optimization profile for now");
  }

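  // Query optimization profile 0 for the maximum input extents the engine
  // was built with; these bound the per-event graph size.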
  m_maxNodes =
      m_engine->getProfileShape("x", 0, nvinfer1::OptProfileSelector::kMAX)
          .d[0];
  ACTS_INFO("Maximum number of nodes: " << m_maxNodes);

  auto maxEdgesA =
      m_engine
          ->getProfileShape("edge_index", 0,
                            nvinfer1::OptProfileSelector::kMAX)
          .d[1];
  auto maxEdgesB =
      m_engine
          ->getProfileShape("edge_attr", 0, nvinfer1::OptProfileSelector::kMAX)
          .d[0];

  if (maxEdgesA != maxEdgesB) {
    throw std::invalid_argument(
        "Inconsistent max edges definition in engine for 'edge_index' and "
        "'edge_attr'");
  }

  m_maxEdges = maxEdgesA;
  ACTS_INFO("Maximum number of edges: " << m_maxEdges);
}

TensorRTEdgeClassifier::~TensorRTEdgeClassifier() {}

PipelineTensors TensorRTEdgeClassifier::operator()(
    PipelineTensors tensors, const ExecutionContext &execContext) {
  assert(execContext.device.type == Device::Type::eCUDA);

  decltype(std::chrono::high_resolution_clock::now()) t0, t1, t2, t3, t4;
  t0 = std::chrono::high_resolution_clock::now();

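  // The engine cannot process more nodes than it was built for, so give up
  // on this event and report no edges.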
  if (auto nNodes = tensors.nodeFeatures.shape().at(0); nNodes > m_maxNodes) {
    ACTS_WARNING("Number of nodes ("
                 << nNodes << ") exceeds configured maximum, return 0 edges");
    throw NoEdgesError{};
  }

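  // Edges, in contrast, can be truncated to the engine's maximum without
  // aborting the event.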
  if (auto nEdges = tensors.edgeIndex.shape().at(1); nEdges > m_maxEdges) {
    ACTS_WARNING("Number of edges ("
                 << nEdges << ") exceeds maximum, shrink edge tensor to "
                 << m_maxEdges);

    auto [newEdgeIndex, newEdgeFeatures] =
        applyEdgeLimit(tensors.edgeIndex, tensors.edgeFeatures, m_maxEdges,
                       execContext.stream);
    tensors.edgeIndex = std::move(newEdgeIndex);
    tensors.edgeFeatures = std::move(newEdgeFeatures);
  }

  t1 = std::chrono::high_resolution_clock::now();

  std::unique_ptr<nvinfer1::IExecutionContext> context;

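  // Block until an execution context is available, then take exclusive
  // ownership of one from the pool.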
  m_count->acquire();
  assert(!m_contexts.empty());

  {
    std::lock_guard<std::mutex> lock(m_contextMutex);
    context = std::move(m_contexts.back());
    m_contexts.pop_back();
  }

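  // Input shapes are dynamic, so the actual extents of the node and edge
  // tensors must be set for every event before binding the device pointers.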
  context->setInputShape(
      "x", nvinfer1::Dims2{static_cast<long>(tensors.nodeFeatures.shape()[0]),
                           static_cast<long>(tensors.nodeFeatures.shape()[1])});
  context->setTensorAddress("x", tensors.nodeFeatures.data());

  context->setInputShape(
      "edge_index",
      nvinfer1::Dims2{static_cast<long>(tensors.edgeIndex.shape()[0]),
                      static_cast<long>(tensors.edgeIndex.shape()[1])});
  context->setTensorAddress("edge_index", tensors.edgeIndex.data());

  if (tensors.edgeFeatures.has_value()) {
    context->setInputShape(
        "edge_attr",
        nvinfer1::Dims2{static_cast<long>(tensors.edgeFeatures->shape()[0]),
                        static_cast<long>(tensors.edgeFeatures->shape()[1])});
    context->setTensorAddress("edge_attr", tensors.edgeFeatures->data());
  }

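  // Allocate the output tensor (one raw logit per edge) on the device and
  // bind it to the engine's "output" tensor.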
  auto scores =
      Tensor<float>::Create({tensors.edgeIndex.shape()[1], 1ul}, execContext);
  context->setTensorAddress("output", scores.data());

  t2 = std::chrono::high_resolution_clock::now();

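  // Enqueue inference on the event's CUDA stream and wait for it to finish.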
  auto stream = execContext.stream.value();
  auto status = context->enqueueV3(stream);
  if (!status) {
    throw std::runtime_error("Failed to execute TensorRT model");
  }
  ACTS_CUDA_CHECK(cudaStreamSynchronize(stream));

  t3 = std::chrono::high_resolution_clock::now();

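  // Return the execution context to the pool and signal its availability.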
  {
    std::lock_guard<std::mutex> lock(m_contextMutex);
    m_contexts.push_back(std::move(context));
  }

  m_count->release();

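  // The network outputs raw logits; apply a sigmoid to map them to edge
  // scores in [0, 1].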
  sigmoid(scores, execContext.stream);

  ACTS_VERBOSE("Size after classifier: " << scores.shape()[0]);
  printCudaMemInfo(logger());

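  // Keep only the edges whose score passes the configured cut.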
  auto [newScores, newEdgeIndex] =
      applyScoreCut(scores, tensors.edgeIndex, m_cfg.cut, execContext.stream);
  ACTS_VERBOSE("Size after score cut: " << newEdgeIndex.shape()[1]);
  printCudaMemInfo(logger());

  t4 = std::chrono::high_resolution_clock::now();

  auto milliseconds = [](const auto &a, const auto &b) {
    return std::chrono::duration<double, std::milli>(b - a).count();
  };
  ACTS_DEBUG("Time preparation: " << milliseconds(t0, t1));
  ACTS_DEBUG("Time alloc, set shape: " << milliseconds(t1, t2));
  ACTS_DEBUG("Time inference: " << milliseconds(t2, t3));
  ACTS_DEBUG("Time sigmoid and cut: " << milliseconds(t3, t4));

  return {std::move(tensors.nodeFeatures), std::move(newEdgeIndex),
          std::nullopt, std::move(newScores)};
}

}  // namespace ActsPlugins