nvmlwrap.h 9.48 KB
Newer Older
lishen's avatar
lishen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
/*************************************************************************
 * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef SCCL_NVMLWRAP_H_
#define SCCL_NVMLWRAP_H_

#include "check.h"

namespace sccl {
namespace hardware {
namespace topology {

// #define SCCL_NVML_DIRECT 1
#ifndef SCCL_NVML_DIRECT
#define SCCL_NVML_DIRECT 0
#endif

#if SCCL_NVML_DIRECT
#include "nvml.h"
#else
// Dynamically handle dependencies on NVML

/* Extracted from nvml.h */
// Opaque device handle: a pointer to struct nvmlDevice_st, which is only
// forward-declared (never defined) in this header.
typedef struct nvmlDevice_st* nvmlDevice_t;
// Size of the busId character buffer in nvmlPciInfo_t (the bus id string plus
// its terminator must fit in this many chars).
#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16

/* Enabled/disabled state of a feature. */
typedef enum nvmlEnableState_enum {
    /** Feature disabled */
    NVML_FEATURE_DISABLED = 0,
    /** Feature enabled */
    NVML_FEATURE_ENABLED = 1
} nvmlEnableState_t;

/* Capabilities that can be queried for an NVLink. */
typedef enum nvmlNvLinkCapability_enum {
    /* P2P over NVLink is supported */
    NVML_NVLINK_CAP_P2P_SUPPORTED = 0,
    /* Access to system memory is supported */
    NVML_NVLINK_CAP_SYSMEM_ACCESS = 1,
    /* P2P atomics are supported */
    NVML_NVLINK_CAP_P2P_ATOMICS = 2,
    /* System memory atomics are supported */
    NVML_NVLINK_CAP_SYSMEM_ATOMICS = 3,
    /* SLI is supported over this link */
    NVML_NVLINK_CAP_SLI_BRIDGE = 4,
    /* Link is supported on this device */
    NVML_NVLINK_CAP_VALID = 5,
    /* Number of capabilities above; must stay the last enumerator */
    NVML_NVLINK_CAP_COUNT
} nvmlNvLinkCapability_t;

/* Status codes carried by NVML results (see nvmlFieldValue_t::nvmlReturn). */
typedef enum nvmlReturn_enum {
    NVML_SUCCESS = 0,                        /**< The operation was successful */
    NVML_ERROR_UNINITIALIZED = 1,            /**< NVML was not first initialized with nvmlInit() */
    NVML_ERROR_INVALID_ARGUMENT = 2,         /**< A supplied argument is invalid */
    NVML_ERROR_NOT_SUPPORTED = 3,            /**< The requested operation is not available on target device */
    NVML_ERROR_NO_PERMISSION = 4,            /**< The current user does not have permission for operation */
    NVML_ERROR_ALREADY_INITIALIZED = 5,      /**< Deprecated: Multiple initializations are now allowed through ref counting */
    NVML_ERROR_NOT_FOUND = 6,                /**< A query to find an object was unsuccessful */
    NVML_ERROR_INSUFFICIENT_SIZE = 7,        /**< An input argument is not large enough */
    NVML_ERROR_INSUFFICIENT_POWER = 8,       /**< A device's external power cables are not properly attached */
    NVML_ERROR_DRIVER_NOT_LOADED = 9,        /**< NVIDIA driver is not loaded */
    NVML_ERROR_TIMEOUT = 10,                 /**< User provided timeout passed */
    NVML_ERROR_IRQ_ISSUE = 11,               /**< NVIDIA Kernel detected an interrupt issue with a GPU */
    NVML_ERROR_LIBRARY_NOT_FOUND = 12,       /**< NVML Shared Library couldn't be found or loaded */
    NVML_ERROR_FUNCTION_NOT_FOUND = 13,      /**< Local version of NVML doesn't implement this function */
    NVML_ERROR_CORRUPTED_INFOROM = 14,       /**< infoROM is corrupted */
    NVML_ERROR_GPU_IS_LOST = 15,             /**< The GPU has fallen off the bus or has otherwise become inaccessible */
    NVML_ERROR_RESET_REQUIRED = 16,          /**< The GPU requires a reset before it can be used again */
    NVML_ERROR_OPERATING_SYSTEM = 17,        /**< The GPU control device has been blocked by the operating system/cgroups */
    NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, /**< RM detects a driver/library version mismatch */
    NVML_ERROR_IN_USE = 19,                  /**< An operation cannot be performed because the GPU is currently in use */
    NVML_ERROR_UNKNOWN = 999                 /**< An internal driver error occurred */
} nvmlReturn_t;

/* PCI identification of a device. */
typedef struct nvmlPciInfo_st {
    /** The tuple domain:bus:device.function PCI identifier (& NULL terminator) */
    char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
    /** The PCI domain on which the device's bus resides, 0 to 0xffff */
    unsigned int domain;
    /** The bus on which the device resides, 0 to 0xff */
    unsigned int bus;
    /** The device's id on the bus, 0 to 31 */
    unsigned int device;
    /** The combined 16-bit device id and 16-bit vendor id */
    unsigned int pciDeviceId;

    /** Added in NVML 2.285 API: the 32-bit Sub System Device ID */
    unsigned int pciSubSystemId;

    /* NVIDIA reserved for internal use only. Same four members, in the same
     * order, as when declared one per line. */
    unsigned int reserved0, reserved1, reserved2, reserved3;
} nvmlPciInfo_t;

/*
 * P2P capability index status.
 *
 * Values are spelled out explicitly; they are the same consecutive values the
 * enumerators get when only the first one is initialised to 0.
 */
typedef enum nvmlGpuP2PStatus_enum {
    NVML_P2P_STATUS_OK                         = 0,
    NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED       = 1, /* (sic) identifier spelling is part of the interface */
    NVML_P2P_STATUS_GPU_NOT_SUPPORTED          = 2,
    NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED = 3,
    NVML_P2P_STATUS_DISABLED_BY_REGKEY         = 4,
    NVML_P2P_STATUS_NOT_SUPPORTED              = 5,
    NVML_P2P_STATUS_UNKNOWN                    = 6
} nvmlGpuP2PStatus_t;

/*
 * P2P capability index: selects which P2P capability a status query refers to.
 *
 * Values are spelled out explicitly; they are the same consecutive values the
 * enumerators get when only the first one is initialised to 0.
 */
typedef enum nvmlGpuP2PCapsIndex_enum {
    NVML_P2P_CAPS_INDEX_READ    = 0,
    NVML_P2P_CAPS_INDEX_WRITE   = 1,
    NVML_P2P_CAPS_INDEX_NVLINK  = 2,
    NVML_P2P_CAPS_INDEX_ATOMICS = 3,
    NVML_P2P_CAPS_INDEX_PROP    = 4,
    NVML_P2P_CAPS_INDEX_UNKNOWN = 5
} nvmlGpuP2PCapsIndex_t;

/**
 * Tag identifying the type of a returned sample value.
 */
typedef enum nvmlValueType_enum {
    /** Value is a double */
    NVML_VALUE_TYPE_DOUBLE = 0,
    /** Value is an unsigned int */
    NVML_VALUE_TYPE_UNSIGNED_INT = 1,
    /** Value is an unsigned long */
    NVML_VALUE_TYPE_UNSIGNED_LONG = 2,
    /** Value is an unsigned long long */
    NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3,
    /** Value is a signed long long */
    NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4,

    /** Number of value types above; keep this last */
    NVML_VALUE_TYPE_COUNT
} nvmlValueType_t;

/**
 * Storage for a single value; exactly one member is meaningful at a time,
 * depending on the type of the value.
 */
typedef union nvmlValue_st {
    /** If the value is double */
    double dVal;
    /** If the value is unsigned int */
    unsigned int uiVal;
    /** If the value is unsigned long */
    unsigned long ulVal;
    /** If the value is unsigned long long */
    unsigned long long ullVal;
    /** If the value is signed long long */
    signed long long sllVal;
} nvmlValue_t;

/**
 * Field Identifiers.
 *
 * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change.
 */

/* NVLink speed and link count */
#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 /**< Common NVLink Speed in MBps for active links */
#define NVML_FI_DEV_NVLINK_LINK_COUNT 91        /**< Number of NVLinks present on the device */

/*
 * Remote device NVLink ID.
 * The link ID needs to be specified in the scopeId field in nvmlFieldValue_t.
 */
#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146

/* NVSwitch: number of NVLinks connected to NVSwitch */
#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147

/* NVLink speed / state / version field IDs (not described further in this header) */
#define NVML_FI_DEV_NVLINK_GET_SPEED 164
#define NVML_FI_DEV_NVLINK_GET_STATE 165
#define NVML_FI_DEV_NVLINK_GET_VERSION 166

/* One greater than the largest field ID defined above */
#define NVML_FI_MAX 167

/**
 * Information for a Field Value Sample
 */
typedef struct nvmlFieldValue_st {
    /** ID of the NVML field to retrieve. This must be set before any call that
     *  uses this struct. See the constants starting with NVML_FI_ above. */
    unsigned int fieldId;

    /** Scope ID can represent data used by NVML depending on fieldId's context.
     *  For example, for NVLink throughput counter data, scopeId can represent
     *  linkId. */
    unsigned int scopeId;

    /** CPU Timestamp of this value in microseconds since 1970 */
    long long timestamp;

    /** How long this field value took to update (in usec) within NVML. This may
     *  be averaged across several fields that are serviced by the same driver
     *  call. */
    long long latencyUsec;

    /** Type of the value stored in value */
    nvmlValueType_t valueType;

    /** Return code for retrieving this value. This must be checked before
     *  looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS */
    nvmlReturn_t nvmlReturn;

    /** Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS */
    nvmlValue_t value;
} nvmlFieldValue_t;

/* End of nvml.h */
#endif                  // SCCL_NVML_DIRECT

// Capacity of the global device tables declared below.
constexpr int scclNvmlMaxDevices = 32;

// Per-device record: the NVML handle plus the two parts of the compute capability.
struct scclNvmlDeviceInfo {
    nvmlDevice_t handle;
    int computeCapabilityMajor;
    int computeCapabilityMinor;
};

// Per-device-pair record: P2P status for reads and for writes.
struct scclNvmlDevicePairInfo {
    nvmlGpuP2PStatus_t p2pStatusRead;
    nvmlGpuP2PStatus_t p2pStatusWrite;
};

// Global tables; only declared here, the definitions live in another
// translation unit.
// NOTE(review): presumably scclNvmlDeviceCount is the number of valid entries
// and the pair table is indexed [device][device] - confirm against the
// implementation.
extern int scclNvmlDeviceCount;
extern scclNvmlDeviceInfo scclNvmlDevices[scclNvmlMaxDevices];
extern scclNvmlDevicePairInfo scclNvmlDevicePairs[scclNvmlMaxDevices][scclNvmlMaxDevices];

// Every scclNvmlFoo() function below calls scclNvmlEnsureInitialized()
// implicitly. Callers outside this module only need to call it themselves when
// they want to inspect the scclNvml global tables declared above.
scclResult_t scclNvmlEnsureInitialized();

// Wrappers named after the NVML device entry points. Each reports its status as
// a scclResult_t and hands results back through its pointer parameter(s).
// NOTE(review): presumably each one forwards to the NVML function of the same
// name (without the "scc l" prefix) - confirm in the implementation file.

// Device lookup.
scclResult_t scclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device);
scclResult_t scclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned int* index);
scclResult_t scclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device);

// Per-link NVLink queries; `link` selects the link on `device`.
scclResult_t scclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive);
scclResult_t scclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci);
scclResult_t scclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device,
                                               unsigned int link,
                                               nvmlNvLinkCapability_t capability,
                                               unsigned int* capResult);

// Device properties and device-to-device status.
scclResult_t scclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor);
scclResult_t scclNvmlDeviceGetP2PStatus(nvmlDevice_t device1,
                                        nvmlDevice_t device2,
                                        nvmlGpuP2PCapsIndex_t p2pIndex,
                                        nvmlGpuP2PStatus_t* p2pStatus);

// Field-value query: `values` points to `valuesCount` nvmlFieldValue_t entries.
scclResult_t scclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values);

} // namespace topology
} // namespace hardware
} // namespace sccl

#endif // End include guard