nvmlwrap.cc

/*************************************************************************
 * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#include "nvmlwrap.h"
#include "base.h"

#include <initializer_list>
#include <memory>
#include <mutex>

namespace sccl {
namespace hardware {
namespace topology {

// Number of devices reported by NVML; assigned by scclNvmlEnsureInitialized().
int scclNvmlDeviceCount = 0;
// Per-device cache (NVML handle and CUDA compute capability), filled by scclNvmlEnsureInitialized().
scclNvmlDeviceInfo scclNvmlDevices[scclNvmlMaxDevices];
// Per-device-pair cache of the P2P read/write status, indexed [a][b]; filled by scclNvmlEnsureInitialized().
scclNvmlDevicePairInfo scclNvmlDevicePairs[scclNvmlMaxDevices][scclNvmlMaxDevices];

// SCCL_NVML_FN(name, rettype, arglist) defines a function pointer `pfn_<name>` for one NVML entry point.
//  - SCCL_NVML_DIRECT set:   the pointer is constexpr and bound directly to the symbol `name`.
//  - SCCL_NVML_DIRECT unset: the pointer starts out null and is filled in with dlsym() by
//    scclNvmlEnsureInitialized(); it stays null when the library does not export the symbol.
#if SCCL_NVML_DIRECT
#define SCCL_NVML_FN(name, rettype, arglist) constexpr rettype(*pfn_##name) arglist = name;
#else
#include <dlfcn.h>
#define SCCL_NVML_FN(name, rettype, arglist) rettype(*pfn_##name) arglist = nullptr;
#endif

// File-local state: one `pfn_<name>` function pointer per NVML entry point (each line below is an
// SCCL_NVML_FN expansion) plus the bookkeeping used by scclNvmlEnsureInitialized().
namespace {
SCCL_NVML_FN(nvmlInit, nvmlReturn_t, ())
SCCL_NVML_FN(nvmlInit_v2, nvmlReturn_t, ())
SCCL_NVML_FN(nvmlShutdown, nvmlReturn_t, ())
SCCL_NVML_FN(nvmlDeviceGetCount, nvmlReturn_t, (unsigned int*))
SCCL_NVML_FN(nvmlDeviceGetCount_v2, nvmlReturn_t, (unsigned int*))
SCCL_NVML_FN(nvmlDeviceGetHandleByPciBusId, nvmlReturn_t, (const char* pciBusId, nvmlDevice_t* device))
SCCL_NVML_FN(nvmlDeviceGetHandleByIndex, nvmlReturn_t, (unsigned int index, nvmlDevice_t* device))
SCCL_NVML_FN(nvmlDeviceGetIndex, nvmlReturn_t, (nvmlDevice_t device, unsigned* index))
SCCL_NVML_FN(nvmlErrorString, char const*, (nvmlReturn_t r))
SCCL_NVML_FN(nvmlDeviceGetNvLinkState, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive))
SCCL_NVML_FN(nvmlDeviceGetNvLinkRemotePciInfo, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci))
SCCL_NVML_FN(nvmlDeviceGetNvLinkCapability, nvmlReturn_t, (nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult))
SCCL_NVML_FN(nvmlDeviceGetCudaComputeCapability, nvmlReturn_t, (nvmlDevice_t device, int* major, int* minor))
SCCL_NVML_FN(nvmlDeviceGetP2PStatus, nvmlReturn_t, (nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus))
SCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values))

// Held during initialization and around the wrapper functions' calls into NVML.
std::mutex lock; // NVML has had some thread safety bugs
// Set (while holding `lock`) by the first call to scclNvmlEnsureInitialized(), before the
// initialization work starts, so that the work is attempted only once per process.
bool initialized                    = false;
// Per-thread flag: once set, scclNvmlEnsureInitialized() returns initResult without taking `lock`.
thread_local bool threadInitialized = false;
// Outcome of the one-time initialization; returned by every scclNvmlEnsureInitialized() call.
scclResult_t initResult;
} // namespace

// Performs the one-time, process-wide NVML initialization and fills the global tables
// scclNvmlDeviceCount, scclNvmlDevices (handle + compute capability per device) and
// scclNvmlDevicePairs (P2P read/write status for every ordered device pair).
//
// The work is attempted exactly once; its outcome is cached in initResult and returned by this
// and every later call, from any thread. A failed initialization is never retried.
//
// Returns:
//   scclSuccess       - NVML is initialized and the tables are filled.
//   scclSystemError   - libnvidia-ml.so.1 could not be opened, a required entry point is missing
//                       from it, or an NVML call failed.
//   scclInternalError - NVML reported more devices than scclNvmlMaxDevices.
scclResult_t scclNvmlEnsureInitialized() {
    // Optimization to avoid repeatedly grabbing the lock when we only want to
    // read from the global tables.
    if(threadInitialized)
        return initResult;
    threadInitialized = true;

    std::lock_guard<std::mutex> locked(lock);

    if(initialized)
        return initResult;
    initialized = true;

    // Caches the outcome for all later callers and hands it back so it can be returned directly.
    const auto finish = [](scclResult_t outcome) {
        initResult = outcome;
        return outcome;
    };

#if !SCCL_NVML_DIRECT
    if(pfn_nvmlInit == nullptr) {
        // The handle is intentionally kept open: the resolved function pointers are used for
        // the rest of the process lifetime.
        void* libhandle = dlopen("libnvidia-ml.so.1", RTLD_NOW);
        if(libhandle == nullptr) {
            WARN("Failed to open libnvidia-ml.so.1");
            return finish(scclSystemError);
        }

        struct Symbol {
            void** ppfn;
            char const* name;
        };
        std::initializer_list<Symbol> symbols = {{(void**)&pfn_nvmlInit, "nvmlInit"},
                                                 {(void**)&pfn_nvmlInit_v2, "nvmlInit_v2"},
                                                 {(void**)&pfn_nvmlShutdown, "nvmlShutdown"},
                                                 {(void**)&pfn_nvmlDeviceGetCount, "nvmlDeviceGetCount"},
                                                 {(void**)&pfn_nvmlDeviceGetCount_v2, "nvmlDeviceGetCount_v2"},
                                                 {(void**)&pfn_nvmlDeviceGetHandleByPciBusId, "nvmlDeviceGetHandleByPciBusId"},
                                                 {(void**)&pfn_nvmlDeviceGetHandleByIndex, "nvmlDeviceGetHandleByIndex"},
                                                 {(void**)&pfn_nvmlDeviceGetIndex, "nvmlDeviceGetIndex"},
                                                 {(void**)&pfn_nvmlErrorString, "nvmlErrorString"},
                                                 {(void**)&pfn_nvmlDeviceGetNvLinkState, "nvmlDeviceGetNvLinkState"},
                                                 {(void**)&pfn_nvmlDeviceGetNvLinkRemotePciInfo, "nvmlDeviceGetNvLinkRemotePciInfo"},
                                                 {(void**)&pfn_nvmlDeviceGetNvLinkCapability, "nvmlDeviceGetNvLinkCapability"},
                                                 {(void**)&pfn_nvmlDeviceGetCudaComputeCapability, "nvmlDeviceGetCudaComputeCapability"},
                                                 {(void**)&pfn_nvmlDeviceGetP2PStatus, "nvmlDeviceGetP2PStatus"},
                                                 {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}};
        // A symbol the library does not export leaves its pointer null; the entry points this
        // function needs are verified right below, the optional ones are checked at call time.
        for(Symbol sym : symbols) {
            *sym.ppfn = dlsym(libhandle, sym.name);
        }
    }
#endif

#if SCCL_NVML_DIRECT
    const bool have_v2 = true;
#else
    const bool have_v2 = pfn_nvmlInit_v2 != nullptr; // if this compare is done in the SCCL_NVML_DIRECT=1 case then GCC warns about it never being null

    // Every entry point called below must have been resolved; calling through a null function
    // pointer would crash instead of reporting an error.
    char const* missingSymbol = nullptr;
    if(pfn_nvmlErrorString == nullptr)
        missingSymbol = "nvmlErrorString";
    else if(!have_v2 && pfn_nvmlInit == nullptr)
        missingSymbol = "nvmlInit";
    else if(have_v2 && pfn_nvmlDeviceGetCount_v2 == nullptr)
        missingSymbol = "nvmlDeviceGetCount_v2";
    else if(!have_v2 && pfn_nvmlDeviceGetCount == nullptr)
        missingSymbol = "nvmlDeviceGetCount";
    else if(pfn_nvmlDeviceGetHandleByIndex == nullptr)
        missingSymbol = "nvmlDeviceGetHandleByIndex";
    else if(pfn_nvmlDeviceGetCudaComputeCapability == nullptr)
        missingSymbol = "nvmlDeviceGetCudaComputeCapability";
    else if(pfn_nvmlDeviceGetP2PStatus == nullptr)
        missingSymbol = "nvmlDeviceGetP2PStatus";
    if(missingSymbol != nullptr) {
        WARN("Failed to find symbol %s in libnvidia-ml.so.1", missingSymbol);
        return finish(scclSystemError);
    }
#endif

    // Prefer the _v2 entry points whenever nvmlInit_v2 is available.
    nvmlReturn_t res1 = (have_v2 ? pfn_nvmlInit_v2 : pfn_nvmlInit)();
    if(res1 != NVML_SUCCESS) {
        WARN("nvmlInit%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
        return finish(scclSystemError);
    }

    unsigned int ndev;
    res1 = (have_v2 ? pfn_nvmlDeviceGetCount_v2 : pfn_nvmlDeviceGetCount)(&ndev);
    if(res1 != NVML_SUCCESS) {
        WARN("nvmlDeviceGetCount%s() failed: %s", have_v2 ? "_v2" : "", pfn_nvmlErrorString(res1));
        return finish(scclSystemError);
    }

    scclNvmlDeviceCount = int(ndev);
    if(scclNvmlMaxDevices < scclNvmlDeviceCount) {
        WARN("nvmlDeviceGetCount() reported more devices (%d) than the internal maximum (scclNvmlMaxDevices=%d)", scclNvmlDeviceCount, scclNvmlMaxDevices);
        return finish(scclInternalError);
    }

    // Per-device table: NVML handle and CUDA compute capability.
    for(int a = 0; a < scclNvmlDeviceCount; a++) {
        res1 = pfn_nvmlDeviceGetHandleByIndex(a, &scclNvmlDevices[a].handle);
        if(res1 != NVML_SUCCESS) {
            WARN("nvmlDeviceGetHandleByIndex(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
            return finish(scclSystemError);
        }

        res1 = pfn_nvmlDeviceGetCudaComputeCapability(
            scclNvmlDevices[a].handle, &scclNvmlDevices[a].computeCapabilityMajor, &scclNvmlDevices[a].computeCapabilityMinor);
        if(res1 != NVML_SUCCESS) {
            WARN("nvmlDeviceGetCudaComputeCapability(%d) failed: %s", int(a), pfn_nvmlErrorString(res1));
            return finish(scclSystemError);
        }
    }

    // Per-pair table: P2P read and write status for every ordered pair (a, b), including a == b.
    for(int a = 0; a < scclNvmlDeviceCount; a++) {
        for(int b = 0; b < scclNvmlDeviceCount; b++) {
            nvmlDevice_t da = scclNvmlDevices[a].handle;
            nvmlDevice_t db = scclNvmlDevices[b].handle;

            res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_READ, &scclNvmlDevicePairs[a][b].p2pStatusRead);
            if(res1 != NVML_SUCCESS) {
                WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_READ) failed: %s", a, b, pfn_nvmlErrorString(res1));
                return finish(scclSystemError);
            }

            res1 = pfn_nvmlDeviceGetP2PStatus(da, db, NVML_P2P_CAPS_INDEX_WRITE, &scclNvmlDevicePairs[a][b].p2pStatusWrite);
            if(res1 != NVML_SUCCESS) {
                WARN("nvmlDeviceGetP2PStatus(%d,%d,NVML_P2P_CAPS_INDEX_WRITE) failed: %s", a, b, pfn_nvmlErrorString(res1));
                return finish(scclSystemError);
            }
        }
    }

    return finish(scclSuccess);
}

// NVMLCHECK(name, args...): calls pfn_<name>(args...). If the result is not NVML_SUCCESS it logs a
// WARN carrying NVML's error string and makes the *enclosing function* return scclSystemError.
// The function pointer is not checked for null before the call.
// The temporary's unusual name is presumably chosen to avoid clashing with caller identifiers.
#define NVMLCHECK(name, ...)                                             \
    do {                                                                 \
        nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__);                \
        if(e44241808 != NVML_SUCCESS) {                                  \
            WARN(#name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
            return scclSystemError;                                      \
        }                                                                \
    } while(0)

// NVMLTRY(name, args...): quieter variant of NVMLCHECK for entry points that may be absent or
// unsupported. Makes the *enclosing function* return
//   - scclInternalError, without logging, when the build resolves symbols at run time
//     (SCCL_NVML_DIRECT is 0) and pfn_<name> is null;
//   - scclSystemError when the call does not return NVML_SUCCESS; the failure is logged at INFO
//     level (SCCL_LOG_TOPO) unless the code is NVML_ERROR_NOT_SUPPORTED, which is not logged.
#define NVMLTRY(name, ...)                                                                  \
    do {                                                                                    \
        if(!SCCL_NVML_DIRECT && pfn_##name == nullptr)                                      \
            return scclInternalError; /* missing symbol is not a warned error */            \
        nvmlReturn_t e44241808 = pfn_##name(__VA_ARGS__);                                   \
        if(e44241808 != NVML_SUCCESS) {                                                     \
            if(e44241808 != NVML_ERROR_NOT_SUPPORTED)                                       \
                INFO(SCCL_LOG_TOPO, #name "() failed: %s", pfn_nvmlErrorString(e44241808)); \
            return scclSystemError;                                                         \
        }                                                                                   \
    } while(0)

// Looks up the NVML handle of the device identified by `pciBusId` and stores it in `*device`.
// Unlike the index-based lookup this always queries NVML, so the call is made under the
// file-level mutex.
// Returns scclSuccess, scclSystemError when the NVML call fails (logged via WARN), or whatever
// SCCLCHECK propagates when initialization did not succeed.
scclResult_t scclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device) {
    SCCLCHECK(scclNvmlEnsureInitialized());

    // Serialize the call into NVML with all other wrappers in this file.
    const std::lock_guard<std::mutex> nvmlGuard{lock};
    NVMLCHECK(nvmlDeviceGetHandleByPciBusId, pciBusId, device);
    return scclSuccess;
}

// Stores in `*device` the NVML handle cached for device number `index`.
// The handle comes from the scclNvmlDevices table filled by scclNvmlEnsureInitialized(); NVML is
// not called and the mutex is not taken.
// Returns scclSuccess, scclInvalidArgument when `index` is not below scclNvmlDeviceCount, or
// whatever SCCLCHECK propagates when initialization did not succeed.
scclResult_t scclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t* device) {
    SCCLCHECK(scclNvmlEnsureInitialized());

    // Only the first scclNvmlDeviceCount table entries were filled; rejecting anything else also
    // keeps the subscript inside the scclNvmlDevices array.
    if(scclNvmlDeviceCount <= 0 || index >= static_cast<unsigned int>(scclNvmlDeviceCount))
        return scclInvalidArgument;

    *device = scclNvmlDevices[index].handle;
    return scclSuccess;
}

// Reverse lookup: stores in `*index` the position of `device` within the cached scclNvmlDevices
// table. NVML is not called.
// Returns scclSuccess when the handle is found, scclInvalidArgument when it is not in the table
// (`*index` is left untouched), or whatever SCCLCHECK propagates when initialization failed.
scclResult_t scclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) {
    SCCLCHECK(scclNvmlEnsureInitialized());

    // Advance to the first table entry holding this handle, or past the last entry.
    int slot = 0;
    while(slot < scclNvmlDeviceCount && !(scclNvmlDevices[slot].handle == device))
        ++slot;

    if(slot >= scclNvmlDeviceCount)
        return scclInvalidArgument;

    *index = slot;
    return scclSuccess;
}

// Queries NVML for the state of NVLink `link` on `device`; the result is written to `*isActive`.
// The call goes through NVMLTRY: a missing entry point yields scclInternalError and a failing
// NVML call yields scclSystemError. Returns scclSuccess otherwise.
scclResult_t scclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive) {
    SCCLCHECK(scclNvmlEnsureInitialized());

    // Serialize the call into NVML with all other wrappers in this file.
    const std::lock_guard<std::mutex> nvmlGuard{lock};
    NVMLTRY(nvmlDeviceGetNvLinkState, device, link, isActive);
    return scclSuccess;
}

// Queries NVML for the PCI information of the remote end of NVLink `link` on `device`; the result
// is written to `*pci`.
// The call goes through NVMLTRY: a missing entry point yields scclInternalError and a failing
// NVML call yields scclSystemError. Returns scclSuccess otherwise.
scclResult_t scclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci) {
    SCCLCHECK(scclNvmlEnsureInitialized());

    // Serialize the call into NVML with all other wrappers in this file.
    const std::lock_guard<std::mutex> nvmlGuard{lock};
    NVMLTRY(nvmlDeviceGetNvLinkRemotePciInfo, device, link, pci);
    return scclSuccess;
}

// Queries NVML for `capability` of NVLink `link` on `device`; the result is written to
// `*capResult`.
// The call goes through NVMLTRY: a missing entry point yields scclInternalError and a failing
// NVML call yields scclSystemError. Returns scclSuccess otherwise.
scclResult_t scclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult) {
    SCCLCHECK(scclNvmlEnsureInitialized());

    // Serialize the call into NVML with all other wrappers in this file.
    const std::lock_guard<std::mutex> nvmlGuard{lock};
    NVMLTRY(nvmlDeviceGetNvLinkCapability, device, link, capability, capResult);
    return scclSuccess;
}

// Reports the CUDA compute capability of `device` from the table cached at initialization;
// NVML is not called.
// On success `*major` / `*minor` receive the cached values and scclSuccess is returned.
// Returns scclInvalidArgument (outputs untouched) when the handle is not in the table, or
// whatever SCCLCHECK propagates when initialization did not succeed.
scclResult_t scclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor) {
    SCCLCHECK(scclNvmlEnsureInitialized());

    for(int slot = 0; slot < scclNvmlDeviceCount; ++slot) {
        const auto& cached = scclNvmlDevices[slot];
        if(!(device == cached.handle))
            continue;

        *major = cached.computeCapabilityMajor;
        *minor = cached.computeCapabilityMinor;
        return scclSuccess;
    }

    return scclInvalidArgument;
}

// Reports the P2P status between `device1` and `device2` for capability `p2pIndex`.
//  - NVML_P2P_CAPS_INDEX_READ / NVML_P2P_CAPS_INDEX_WRITE are answered from the
//    scclNvmlDevicePairs table cached at initialization; scclInvalidArgument is returned when
//    either handle is not in the device table.
//  - Any other capability index is forwarded to NVML under the file-level mutex;
//    scclSystemError is returned when that call fails.
// Returns scclSuccess with `*p2pStatus` filled in otherwise.
scclResult_t scclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus) {
    SCCLCHECK(scclNvmlEnsureInitialized());

    const bool wantRead  = (p2pIndex == NVML_P2P_CAPS_INDEX_READ);
    const bool wantWrite = (p2pIndex == NVML_P2P_CAPS_INDEX_WRITE);

    // Capabilities that are not cached: ask NVML directly.
    if(!wantRead && !wantWrite) {
        std::lock_guard<std::mutex> nvmlGuard(lock);
        NVMLCHECK(nvmlDeviceGetP2PStatus, device1, device2, p2pIndex, p2pStatus);
        return scclSuccess;
    }

    // Map both handles to their table positions in a single pass over the device table.
    int first  = -1;
    int second = -1;
    for(int slot = 0; slot < scclNvmlDeviceCount; ++slot) {
        if(device1 == scclNvmlDevices[slot].handle)
            first = slot;
        if(device2 == scclNvmlDevices[slot].handle)
            second = slot;
    }
    if(first < 0 || second < 0)
        return scclInvalidArgument;

    const auto& cached = scclNvmlDevicePairs[first][second];
    *p2pStatus         = wantRead ? cached.p2pStatusRead : cached.p2pStatusWrite;
    return scclSuccess;
}

// Forwards `valuesCount` field-value requests in `values` to NVML for `device`.
// The call goes through NVMLTRY: a missing entry point yields scclInternalError and a failing
// NVML call yields scclSystemError. Returns scclSuccess otherwise.
scclResult_t scclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t* values) {
    SCCLCHECK(scclNvmlEnsureInitialized());

    // Serialize the call into NVML with all other wrappers in this file.
    const std::lock_guard<std::mutex> nvmlGuard{lock};
    NVMLTRY(nvmlDeviceGetFieldValues, device, valuesCount, values);
    return scclSuccess;
}

} // namespace topology
} // namespace hardware
} // namespace sccl