Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-01-30 09:15:13

0001 // This file is part of the ACTS project.
0002 //
0003 // Copyright (C) 2016 CERN for the benefit of the ACTS project
0004 //
0005 // This Source Code Form is subject to the terms of the Mozilla Public
0006 // License, v. 2.0. If a copy of the MPL was not distributed with this
0007 // file, You can obtain one at https://mozilla.org/MPL/2.0/.
0008 
#include "Acts/Plugins/ExaTrkX/TorchMetricLearning.hpp"

#include "Acts/Plugins/ExaTrkX/detail/TensorVectorConversion.hpp"
#include "Acts/Plugins/ExaTrkX/detail/buildEdges.hpp"

#ifndef ACTS_EXATRKX_CPUONLY
#include <c10/cuda/CUDAGuard.h>
#endif

#include <numbers>
#include <sstream>
#include <stdexcept>

#include <torch/script.h>
#include <torch/torch.h>

#include "printCudaMemInfo.hpp"
0024 
0025 using namespace torch::indexing;
0026 
0027 namespace Acts {
0028 
0029 TorchMetricLearning::TorchMetricLearning(const Config &cfg,
0030                                          std::unique_ptr<const Logger> _logger)
0031     : m_logger(std::move(_logger)),
0032       m_cfg(cfg),
0033       m_device(torch::Device(torch::kCPU)) {
0034   c10::InferenceMode guard(true);
0035   m_deviceType = torch::cuda::is_available() ? torch::kCUDA : torch::kCPU;
0036 
0037   if (m_deviceType == torch::kCPU) {
0038     ACTS_DEBUG("Running on CPU...");
0039   } else {
0040     if (cfg.deviceID >= 0 &&
0041         static_cast<std::size_t>(cfg.deviceID) < torch::cuda::device_count()) {
0042       ACTS_DEBUG("GPU device " << cfg.deviceID << " is being used.");
0043       m_device = torch::Device(torch::kCUDA, cfg.deviceID);
0044     } else {
0045       ACTS_WARNING("GPU device " << cfg.deviceID
0046                                  << " not available, falling back to CPU.");
0047     }
0048   }
0049 
0050   ACTS_DEBUG("Using torch version " << TORCH_VERSION_MAJOR << "."
0051                                     << TORCH_VERSION_MINOR << "."
0052                                     << TORCH_VERSION_PATCH);
0053 #ifndef ACTS_EXATRKX_CPUONLY
0054   if (not torch::cuda::is_available()) {
0055     ACTS_INFO("CUDA not available, falling back to CPU");
0056   }
0057 #endif
0058 
0059   try {
0060     m_model = std::make_unique<torch::jit::Module>();
0061     *m_model = torch::jit::load(m_cfg.modelPath, m_device);
0062     m_model->eval();
0063   } catch (const c10::Error &e) {
0064     throw std::invalid_argument("Failed to load models: " + e.msg());
0065   }
0066 }
0067 
0068 TorchMetricLearning::~TorchMetricLearning() {}
0069 
0070 std::tuple<std::any, std::any, std::any> TorchMetricLearning::operator()(
0071     std::vector<float> &inputValues, std::size_t numNodes,
0072     const std::vector<std::uint64_t> & /*moduleIds*/,
0073     const ExecutionContext &execContext) {
0074   const auto &device = execContext.device;
0075   ACTS_DEBUG("Start graph construction");
0076   c10::InferenceMode guard(true);
0077 
0078   // add a protection to avoid calling for kCPU
0079 #ifdef ACTS_EXATRKX_CPUONLY
0080   assert(device == torch::Device(torch::kCPU));
0081 #else
0082   std::optional<c10::cuda::CUDAGuard> device_guard;
0083   std::optional<c10::cuda::CUDAStreamGuard> streamGuard;
0084   if (device.is_cuda()) {
0085     device_guard.emplace(device.index());
0086     streamGuard.emplace(execContext.stream.value());
0087   }
0088 #endif
0089 
0090   const std::int64_t numAllFeatures = inputValues.size() / numNodes;
0091 
0092   // printout the r,phi,z of the first spacepoint
0093   ACTS_VERBOSE("First spacepoint information: " << [&]() {
0094     std::stringstream ss;
0095     for (int i = 0; i < numAllFeatures; ++i) {
0096       ss << inputValues[i] << "  ";
0097     }
0098     return ss.str();
0099   }());
0100   printCudaMemInfo(logger());
0101 
0102   auto inputTensor = detail::vectorToTensor2D(inputValues, numAllFeatures);
0103 
0104   // If we are on CPU, clone to get ownership (is this necessary?), else bring
0105   // to device.
0106   if (inputTensor.options().device() == device) {
0107     inputTensor = inputTensor.clone();
0108   } else {
0109     inputTensor = inputTensor.to(device);
0110   }
0111 
0112   // **********
0113   // Embedding
0114   // **********
0115 
0116   // Clone models (solve memory leak? members can be const...)
0117   auto model = m_model->clone();
0118   model.to(device);
0119 
0120   std::vector<torch::jit::IValue> inputTensors;
0121   auto selectedFeaturesTensor =
0122       at::tensor(at::ArrayRef<int>(m_cfg.selectedFeatures));
0123   inputTensors.push_back(
0124       !m_cfg.selectedFeatures.empty()
0125           ? inputTensor.index({Slice{}, selectedFeaturesTensor})
0126           : std::move(inputTensor));
0127 
0128   ACTS_DEBUG("embedding input tensor shape "
0129              << inputTensors[0].toTensor().size(0) << ", "
0130              << inputTensors[0].toTensor().size(1));
0131 
0132   auto output = model.forward(inputTensors).toTensor();
0133 
0134   ACTS_VERBOSE("Embedding space of the first SP:\n"
0135                << output.slice(/*dim=*/0, /*start=*/0, /*end=*/1));
0136   printCudaMemInfo(logger());
0137 
0138   // ****************
0139   // Building Edges
0140   // ****************
0141 
0142   auto edgeList = detail::buildEdges(output, m_cfg.rVal, m_cfg.knnVal,
0143                                      m_cfg.shuffleDirections);
0144 
0145   ACTS_VERBOSE("Shape of built edges: (" << edgeList.size(0) << ", "
0146                                          << edgeList.size(1));
0147   ACTS_VERBOSE("Slice of edgelist:\n" << edgeList.slice(1, 0, 5));
0148   printCudaMemInfo(logger());
0149 
0150   // TODO add real edge features for this workflow later
0151   std::any edgeFeatures;
0152   return {std::move(inputTensors[0]).toTensor(), std::move(edgeList),
0153           std::move(edgeFeatures)};
0154 }
0155 }  // namespace Acts