Back to home page

EIC code displayed by LXR

 
 

    


Warning, file /include/hwloc/rsmi.h was not indexed or was modified since last indexation (in which case cross-reference links may be missing, inaccurate or erroneous).

0001 /*
0002  * SPDX-License-Identifier: BSD-3-Clause
0003  * Copyright © 2012-2023 Inria.  All rights reserved.
0004  * Copyright (c) 2020, Advanced Micro Devices, Inc. All rights reserved.
0005  * Written by Advanced Micro Devices,
0006  * See COPYING in top-level directory.
0007  */
0008 
0009 /** \file
0010  * \brief Macros to help interaction between hwloc and the ROCm SMI Management Library.
0011  *
0012  * Applications that use both hwloc and the ROCm SMI Management Library may want to
0013  * include this file so as to get topology information for AMD GPU devices.
0014  */
0015 
0016 #ifndef HWLOC_RSMI_H
0017 #define HWLOC_RSMI_H
0018 
0019 #include "hwloc.h"
0020 #include "hwloc/autogen/config.h"
0021 #include "hwloc/helper.h"
0022 #ifdef HWLOC_LINUX_SYS
0023 #include "hwloc/linux.h"
0024 #endif
0025 
0026 #include <rocm_smi/rocm_smi.h>
0027 
0028 
0029 #ifdef __cplusplus
0030 extern "C" {
0031 #endif
0032 
0033 
0034 /** \defgroup hwlocality_rsmi Interoperability with the ROCm SMI Management Library
0035  *
0036  * This interface offers ways to retrieve topology information about
0037  * devices managed by the ROCm SMI Management Library.
0038  *
0039  * @{
0040  */
0041 
0042 /** \brief Get the CPU set of logical processors that are physically
0043  * close to AMD GPU device whose index is \p dv_ind.
0044  *
0045  * Store in \p set the CPU-set describing the locality of the AMD GPU device
0046  * whose index is \p dv_ind.
0047  *
0048  * Topology \p topology and device \p dv_ind must match the local machine.
0049  * I/O devices detection and the ROCm SMI component are not needed in the
0050  * topology.
0051  *
0052  * The function only returns the locality of the device.
0053  * If more information about the device is needed, OS objects should
0054  * be used instead, see hwloc_rsmi_get_device_osdev()
0055  * and hwloc_rsmi_get_device_osdev_by_index().
0056  *
0057  * This function is currently only implemented in a meaningful way for
0058  * Linux; other systems will simply get a full cpuset.
0059  *
0060  * \return 0 on success.
0061  * \return -1 on error, for instance if device information could not be found.
0062  */
0063 static __hwloc_inline int
0064 hwloc_rsmi_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
0065                              uint32_t dv_ind, hwloc_cpuset_t set)
0066 {
0067 #ifdef HWLOC_LINUX_SYS
0068   /* If we're on Linux, use the sysfs mechanism to get the local cpus */
0069 #define HWLOC_RSMI_DEVICE_SYSFS_PATH_MAX 128
0070   char path[HWLOC_RSMI_DEVICE_SYSFS_PATH_MAX];
0071   rsmi_status_t ret;
0072   uint64_t bdfid = 0;
0073   unsigned domain, device, bus;
0074 
0075   if (!hwloc_topology_is_thissystem(topology)) {
0076     errno = EINVAL;
0077     return -1;
0078   }
0079 
0080   ret = rsmi_dev_pci_id_get(dv_ind, &bdfid);
0081   if (RSMI_STATUS_SUCCESS != ret) {
0082     errno = EINVAL;
0083     return -1;
0084   }
0085   domain = (bdfid>>32) & 0xffffffff;
0086   bus = ((bdfid & 0xffff)>>8) & 0xff;
0087   device = ((bdfid & 0xff)>>3) & 0x1f;
0088 
0089   sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", domain, bus, device);
0090   if (hwloc_linux_read_path_as_cpumask(path, set) < 0
0091       || hwloc_bitmap_iszero(set))
0092     hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
0093 #else
0094   /* Non-Linux systems simply get a full cpuset */
0095   hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
0096 #endif
0097   return 0;
0098 }
0099 
0100 /** \brief Get the hwloc OS device object corresponding to the
0101  * AMD GPU device whose index is \p dv_ind.
0102  *
0103  * \return The hwloc OS device object describing the AMD GPU device whose
0104  * index is \p dv_ind.
0105  * \return \c NULL if none could be found.
0106  *
0107  * The topology \p topology does not necessarily have to match the current
0108  * machine. For instance the topology may be an XML import of a remote host.
0109  * I/O devices detection and the ROCm SMI component must be enabled in the
0110  * topology.
0111  *
0112  * \note The corresponding PCI device object can be obtained by looking
0113  * at the OS device parent object (unless PCI devices are filtered out).
0114  */
0115 static __hwloc_inline hwloc_obj_t
0116 hwloc_rsmi_get_device_osdev_by_index(hwloc_topology_t topology, uint32_t dv_ind)
0117 {
0118   hwloc_obj_t osdev = NULL;
0119   while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
0120     if (HWLOC_OBJ_OSDEV_GPU == osdev->attr->osdev.type
0121       && osdev->name
0122       && !strncmp("rsmi", osdev->name, 4)
0123       && atoi(osdev->name + 4) == (int) dv_ind)
0124       return osdev;
0125   }
0126   return NULL;
0127 }
0128 
0129 /** \brief Get the hwloc OS device object corresponding to AMD GPU device,
0130  * whose index is \p dv_ind.
0131  *
0132  * \return The hwloc OS device object that describes the given
0133  * AMD GPU, whose index is \p dv_ind.
0134  * \return \c NULL if none could be found.
0135  *
0136  * Topology \p topology and device \p dv_ind must match the local machine.
0137  * I/O devices detection and the ROCm SMI component must be enabled in the
0138  * topology. If not, the locality of the object may still be found using
0139  * hwloc_rsmi_get_device_cpuset().
0140  *
0141  * \note The corresponding hwloc PCI device may be found by looking
0142  * at the result parent pointer (unless PCI devices are filtered out).
0143  */
0144 static __hwloc_inline hwloc_obj_t
0145 hwloc_rsmi_get_device_osdev(hwloc_topology_t topology, uint32_t dv_ind)
0146 {
0147   hwloc_obj_t osdev;
0148   rsmi_status_t ret;
0149   uint64_t bdfid = 0;
0150   unsigned domain, device, bus, func;
0151   uint64_t id;
0152   char uuid[64];
0153 
0154   if (!hwloc_topology_is_thissystem(topology)) {
0155     errno = EINVAL;
0156     return NULL;
0157   }
0158 
0159   ret = rsmi_dev_pci_id_get(dv_ind, &bdfid);
0160   if (RSMI_STATUS_SUCCESS != ret) {
0161     errno = EINVAL;
0162     return NULL;
0163   }
0164   domain = (bdfid>>32) & 0xffffffff;
0165   bus = ((bdfid & 0xffff)>>8) & 0xff;
0166   device = ((bdfid & 0xff)>>3) & 0x1f;
0167   func = bdfid & 0x7;
0168 
0169   ret = rsmi_dev_unique_id_get(dv_ind, &id);
0170   if (RSMI_STATUS_SUCCESS != ret)
0171     uuid[0] = '\0';
0172   else
0173     sprintf(uuid, "%lx", id);
0174 
0175   osdev = NULL;
0176   while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
0177     hwloc_obj_t pcidev = osdev->parent;
0178     const char *info;
0179 
0180     if (strncmp(osdev->name, "rsmi", 4))
0181       continue;
0182 
0183     if (pcidev
0184       && pcidev->type == HWLOC_OBJ_PCI_DEVICE
0185       && pcidev->attr->pcidev.domain == domain
0186       && pcidev->attr->pcidev.bus == bus
0187       && pcidev->attr->pcidev.dev == device
0188       && pcidev->attr->pcidev.func == func)
0189       return osdev;
0190 
0191     info = hwloc_obj_get_info_by_name(osdev, "AMDUUID");
0192     if (info && !strcmp(info, uuid))
0193       return osdev;
0194   }
0195 
0196   return NULL;
0197 }
0198 
0199 /** @} */
0200 
0201 
0202 #ifdef __cplusplus
0203 } /* extern "C" */
0204 #endif
0205 
0206 
0207 #endif /* HWLOC_RSMI_H */