// gpu_info_cuda.c
#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

#include "gpu_info_cuda.h"

#include <string.h>

// Candidate locations for the NVML shared library, tried in order by
// cuda_init until one loads. The list is NULL-terminated.
#ifndef _WIN32
const char *cuda_lib_paths[] = {
    "libnvidia-ml.so",
    "/usr/local/cuda/lib64/libnvidia-ml.so",
    "/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
    "/usr/lib/wsl/lib/libnvidia-ml.so.1",  // TODO Maybe glob?
    NULL,
};
#else
const char *cuda_lib_paths[] = {
    "nvml.dll",
    // NOTE(review): the empty entry below is kept from the original list; it
    // looks like a placeholder for a second search path - confirm intent.
    "",
    NULL,
};
#endif

// Loads the NVML library and resolves the entry points used to query GPU
// memory, then initializes NVML.
//
// Each entry of cuda_lib_paths is tried in order until one loads. On success
// resp->err is NULL and resp->ch holds the library handle together with the
// resolved function pointers. On any failure resp->err points to a strdup()'d
// message (the caller is responsible for freeing it) and resp->ch.handle is
// NULL, so a later cuda_check_vram call reports "not initialized" instead of
// calling through an unusable library.
void cuda_init(cuda_init_resp_t *resp) {
  nvmlReturn_t ret;
  resp->err = NULL;
  // Start from a known state: the load loop below reads the handle before
  // anything has assigned it.
  resp->ch.handle = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  // Symbol name -> address of the function-pointer slot that receives it.
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"nvmlInit_v2", (void *)&resp->ch.initFn},
      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
  };
  const int nsyms = (int)(sizeof(l) / sizeof(l[0]));

  // Stop at the first candidate path that loads.
  for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
    resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
  }
  if (!resp->ch.handle) {
    // TODO improve error message, as the LOAD_ERR will have typically have the
    // final path that was checked which might be confusing.
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
             cuda_lib_paths[0], LOAD_ERR());
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < nsyms; i++) {
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    // Check the resolved symbol, not the address of the slot (which is never
    // NULL), otherwise a missing symbol goes unnoticed.
    if (!*l[i].p) {
      // Capture the loader error before unloading, since unloading may
      // overwrite it.
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               LOAD_ERR());
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->ch.initFn)();
  if (ret != NVML_SUCCESS) {
    // Do not leave a loaded-but-uninitialized library behind a non-NULL
    // handle.
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  return;
}

// Queries memory information for GPU index 0 through the NVML entry points
// held in h.
//
// On success resp->err is NULL and resp->total / resp->free carry the values
// reported by NVML. On failure resp->err points to a strdup()'d message (the
// caller is responsible for freeing it) and total/free are left untouched.
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  nvmlDevice_t device;
  nvmlMemory_t memInfo = {0};
  nvmlReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];

  // A NULL handle means the library was never loaded.
  if (h.handle == NULL) {
    resp->err = strdup("nvml handle isn't initialized");
    return;
  }

  // TODO - handle multiple GPUs
  ret = (*h.getHandle)(0, &device);
  if (ret != NVML_SUCCESS) {
    snprintf(buf, buflen, "unable to get device handle: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  ret = (*h.getMemInfo)(device, &memInfo);
  if (ret != NVML_SUCCESS) {
    snprintf(buf, buflen, "device memory info lookup failure: %d", ret);
    resp->err = strdup(buf);
    return;
  }
  resp->total = memInfo.total;
  resp->free = memInfo.free;
  return;
}
#endif  // __APPLE__