include/nvtx3/nvToolsExtMemCudaRt.h

0001 /*
0002  * SPDX-FileCopyrightText: Copyright (c) 2009-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
0003  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
0004  *
0005  * Licensed under the Apache License, Version 2.0 (the "License");
0006  * you may not use this file except in compliance with the License.
0007  * You may obtain a copy of the License at
0008  *
0009  *     http://www.apache.org/licenses/LICENSE-2.0
0010  *
0011  * Unless required by applicable law or agreed to in writing, software
0012  * distributed under the License is distributed on an "AS IS" BASIS,
0013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014  * See the License for the specific language governing permissions and
0015  * limitations under the License.
0016  *
0017  * Licensed under the Apache License v2.0 with LLVM Exceptions.
0018  * See https://nvidia.github.io/NVTX/LICENSE.txt for license information.
0019  */
0020
0021 #if defined(NVTX_AS_SYSTEM_HEADER) /* Opt-in: mark this header as a system header using the compiler-specific pragma selected below */
0022 #if defined(__clang__)
0023 #pragma clang system_header
0024 #elif defined(__GNUC__) || defined(__NVCOMPILER)
0025 #pragma GCC system_header
0026 #elif defined(_MSC_VER)
0027 #pragma system_header
0028 #endif
0029 #endif /* NVTX_AS_SYSTEM_HEADER */
0030
0031 #include "nvToolsExtMem.h"
0032
0033 #include "cuda.h"
0034 #include "cuda_runtime.h"
0035
0036 #ifdef __cplusplus
0037 extern "C" { /* C linkage for C++ consumers; closed at the end of this file */
0038 #endif /* __cplusplus */
0039
0040 #ifndef NVTX_MEM_CUDART_CONTENTS_V1 /* Declare the V1 contents only once; closed by the matching #endif after the declarations */
0041 #define NVTX_MEM_CUDART_CONTENTS_V1
0042
0043 /** \defgroup MEMORY_CUDART Memory CUDA Runtime
0044  * See page \ref PAGE_MEMORY_CUDART.
0045  * @{
0046  */
0047
0048 /** \brief The memory is from a CUDA runtime array.
0049  *
0050  * Relevant functions: cudaMallocArray, cudaMalloc3DArray
0051  * Also applies to a cudaArray_t obtained from other types such as cudaMipmappedArray_t
0052  *
0053  * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported
0054  *
0055  * nvtxMemHeapRegister receives a heapDesc of type cudaArray_t because the description can be retrieved by tools through cudaArrayGetInfo()
0056  * nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCudaArrayRangeDesc_t
0057  */
0058 #define NVTX_MEM_TYPE_CUDA_ARRAY 0x11
0059
0060 /** \brief Structure describing a range of memory inside a CUDA runtime array (cudaArray_t); this is the regionDesc type for NVTX_MEM_TYPE_CUDA_ARRAY
0061  */
0062 typedef struct nvtxMemCudaArrayRangeDesc_v1
0063 {
0064     uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
0065     uint16_t structSize; /* Size of the structure. */
0066     uint32_t reserved0; /* Reserved; presumably must be zero - TODO confirm */
0067     cudaArray_t  src; /* The CUDA runtime array that the described range lies in */
0068     size_t offset[3]; /* Start of the range, presumably one entry per array dimension; units (elements vs. bytes) are not stated here - TODO confirm */
0069     size_t extent[3]; /* Size of the range, presumably one entry per array dimension; same caveat on units - TODO confirm */
0070 } nvtxMemCudaArrayRangeDesc_v1;
0071 typedef nvtxMemCudaArrayRangeDesc_v1 nvtxMemCudaArrayRangeDesc_t;
0072
0073
0074 /** \brief The memory is from a CUDA driver API array (CUarray).
0075  *
0076  * Relevant functions: cuArrayCreate, cuArray3DCreate
0077  * Also applies to a CUarray obtained from other types such as CUmipmappedArray
0078  *
0079  * NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE is not supported
0080  *
0081  * nvtxMemHeapRegister receives a heapDesc of type CUarray because the description can be retrieved by tools through cuArrayGetDescriptor() / cuArray3DGetDescriptor() (NOTE(review): this sentence previously named the runtime-API cudaArray_t / cudaArrayGetInfo(), apparently copied from NVTX_MEM_TYPE_CUDA_ARRAY - confirm the intended handle type)
0082  * nvtxMemRegionRegisterEx receives a regionDesc of type nvtxMemCuArrayRangeDesc_t
0083  */
0084 #define NVTX_MEM_TYPE_CU_ARRAY 0x12
0085
0086 /** \brief Structure describing a range of memory inside a CUDA driver API array (CUarray); this is the regionDesc type for NVTX_MEM_TYPE_CU_ARRAY
0087  */
0088 typedef struct nvtxMemCuArrayRangeDesc_v1
0089 {
0090     uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
0091     uint16_t structSize; /* Size of the structure. */
0092     uint32_t reserved0; /* Reserved; presumably must be zero - TODO confirm */
0093     CUarray  src; /* The CUDA driver API array that the described range lies in */
0094     size_t offset[3]; /* Start of the range, presumably one entry per array dimension; units (elements vs. bytes) are not stated here - TODO confirm */
0095     size_t extent[3]; /* Size of the range, presumably one entry per array dimension; same caveat on units - TODO confirm */
0096 } nvtxMemCuArrayRangeDesc_v1;
0097 typedef nvtxMemCuArrayRangeDesc_v1 nvtxMemCuArrayRangeDesc_t;
0098
0099 /* Reserving 0x2-0xF for more common types */
0100
0101 #define NVTX_MEM_CUDA_PEER_ALL_DEVICES -1 /* Value accepted as devicePeer by nvtxMemCudaSetPeerAccess in place of a device number */
0102
0103 /** \brief Get the permissions object that represents the CUDA runtime device
0104  * or CUDA driver context
0105  *
0106  * This object will allow developers to adjust permissions applied to work executed
0107  * on the GPU.  It may be inherited or overridden by a permissions object bound
0108  * with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags.
0109  *
0110  * Ex. change the peer-to-peer access permissions between devices in their entirety
0111  * or punch through special holes
0112  *
0113  * By default, all memory that would naturally be accessible to a CUDA kernel is accessible until
0114  * modified otherwise by nvtxMemCudaSetPeerAccess or changing regions.
0115  *
0116  * This object should also represent the CUDA driver API level context. NOTE(review): this description is word-for-word the same as nvtxMemCudaGetDeviceWidePermissions; this variant takes only a domain, so it presumably covers every device in the process - confirm.
0117 */
0118 NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetProcessWidePermissions(
0119     nvtxDomainHandle_t domain);
0120
0121 /** \brief Get the permissions object that represents the CUDA runtime device
0122  * or CUDA driver context
0123  *
0124  * This object will allow developers to adjust permissions applied to work executed
0125  * on the GPU.  It may be inherited or overridden by a permissions object bound
0126  * with NVTX_MEM_PERMISSIONS_BIND_SCOPE_CUDA_STREAM, depending on the binding flags.
0127  *
0128  * Ex. change the peer-to-peer access permissions between devices in their entirety
0129  * or punch through special holes
0130  *
0131  * By default, all memory that would naturally be accessible to a CUDA kernel is accessible until
0132  * modified otherwise by nvtxMemCudaSetPeerAccess or changing regions.
0133  *
0134  * This object should also represent the CUDA driver API level context.
0135 */
0136 NVTX_DECLSPEC nvtxMemPermissionsHandle_t NVTX_API nvtxMemCudaGetDeviceWidePermissions(
0137     nvtxDomainHandle_t domain,
0138     int device); /* presumably a device number such as from cudaGetDevice() - TODO confirm */
0139
0140 /** \brief Change the default behavior for all memory mapped in from a particular device.
0141  *
0142  * While typically all memory defaults to readable and writable, users may desire to limit
0143  * access to reduced default permissions, such as read-only, on a per-device basis.
0144  *
0145  * Regions can be used to further override smaller windows of memory.
0146  *
0147  * devicePeer can be NVTX_MEM_CUDA_PEER_ALL_DEVICES
0148  *
0149 */
0150 NVTX_DECLSPEC void NVTX_API nvtxMemCudaSetPeerAccess(
0151     nvtxDomainHandle_t domain,
0152     nvtxMemPermissionsHandle_t permissions,
0153     int devicePeer, /* device number such as from cudaGetDevice() or NVTX_MEM_CUDA_PEER_ALL_DEVICES */
0154     uint32_t flags); /* NVTX_MEM_PERMISSIONS_REGION_FLAGS_* */
0155
0156 /** \brief Mark memory ranges as initialized.
0157 *
0158 * The heap refers to the heap within which the region resides.
0159 * This can be from nvtxMemHeapRegister, NVTX_MEM_HEAP_HANDLE_PROCESS_WIDE, or one provided from another extension API.
0160 *
0161 * The regionType arg will define which type is used in regionDescArray.
0162 * The most commonly used type is NVTX_MEM_TYPE_VIRTUAL_ADDRESS.
0163 *
0164 * The regionCount arg is how many elements are in regionDescArray and regionHandleArrayOut.
0165 *
0166 * The regionHandleArrayOut arg points to an array where the tool will provide region handles.
0167 * If a pointer is provided, it is expected to have regionCount elements.
0168 * This pointer can be NULL if regionType is NVTX_MEM_TYPE_VIRTUAL_ADDRESS.  In this case,
0169 * the user can use the pointer to the virtual memory to reference the region in other
0170 * related functions which accept a nvtxMemRegionRef_t. NOTE(review): heap, regionCount, regionDescArray and regionHandleArrayOut are not members of the struct below (it has regionType, regionDescCount, regionDescElementSize and regionDescElements) - confirm which parts of this description apply.
0171 */
0172 typedef struct nvtxMemMarkInitializedBatch_v1
0173 {
0174     uint16_t extCompatID; /* Set to NVTX_EXT_COMPATID_MEM */
0175     uint16_t structSize; /* Size of the structure. */
0176
0177     uint32_t regionType; /* NVTX_MEM_TYPE_* */
0178
0179     size_t regionDescCount; /* presumably the number of elements in regionDescElements - TODO confirm */
0180     size_t regionDescElementSize; /* presumably the size in bytes of one element of regionDescElements - TODO confirm */
0181     void const* regionDescElements; /* this will also become the handle for this region */
0182
0183 } nvtxMemMarkInitializedBatch_v1;
0184 typedef nvtxMemMarkInitializedBatch_v1 nvtxMemMarkInitializedBatch_t;
0185
0186 /** \brief Mark the memory ranges described by desc (a nvtxMemMarkInitializedBatch_t) as initialized.
0187 *
0188 * stream is the CUDA stream where the range was accessed and initialized.
0189 */
0190 NVTX_DECLSPEC void NVTX_API nvtxMemCudaMarkInitialized(
0191     nvtxDomainHandle_t domain,
0192     cudaStream_t stream,
0193     uint8_t isPerThreadStream, /* 0 for false, otherwise true */
0194     nvtxMemMarkInitializedBatch_t const* desc);
0195
0196 /** @} */
0197
0198 #endif /* NVTX_MEM_CUDART_CONTENTS_V1 */
0199
0200 #ifdef __GNUC__ /* Apply internal symbol visibility to everything pulled in by the implementation include below */
0201 #pragma GCC visibility push(internal)
0202 #endif
0203
0204 #ifndef NVTX_NO_IMPL /* When NVTX_NO_IMPL is defined, the implementation header is not included */
0205 #define NVTX_EXT_IMPL_MEM_CUDART_GUARD /* Ensure other headers cannot be included directly */
0206 #include "nvtxDetail/nvtxExtImplMemCudaRt_v1.h"
0207 #undef NVTX_EXT_IMPL_MEM_CUDART_GUARD
0208 #endif /*NVTX_NO_IMPL*/
0209
0210 #ifdef __GNUC__ /* Restore the visibility saved by the push above */
0211 #pragma GCC visibility pop
0212 #endif
0213
0214
0215 #ifdef __cplusplus
0216 } /* extern "C" */
0217 #endif /* __cplusplus */
0217 #endif /* __cplusplus */