// gpu_info_cuda.c
#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

#include "gpu_info_cuda.h"

#include <string.h>

// NULL-terminated list of candidate names/paths for the NVML shared library.
// cuda_init() walks this list in order and stops at the first entry that
// loads successfully.
#ifndef _WIN32
const char *cuda_lib_paths[] = {
    "libnvidia-ml.so",
    "/usr/local/cuda/lib64/libnvidia-ml.so",
    "/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
    "/usr/lib/wsl/lib/libnvidia-ml.so.1",  // TODO Maybe glob?
    NULL,
};
#else
const char *cuda_lib_paths[] = {
    "nvml.dll",
    "",  // NOTE(review): purpose of the empty entry is unclear - confirm
    NULL,
};
#endif

// Number of NVML symbols resolved by cuda_init(); it sizes that function's
// lookup table and bounds the loop over it, so it must match the number of
// initializers there.
#define CUDA_LOOKUP_SIZE 5

/*
 * Loads the NVML library and resolves the entry points used to query GPU
 * memory.
 *
 * Each entry of cuda_lib_paths is tried in order until one loads. The
 * CUDA_LOOKUP_SIZE symbols listed in the table below are then resolved into
 * resp->ch, and finally the resolved nvmlInit_v2 entry point is called.
 *
 * On success resp->err is NULL and resp->ch holds the library handle plus the
 * resolved function pointers. On any failure resp->err is set to a strdup()'d
 * message and resp->ch.handle is NULL (the library, if it was loaded, has
 * been unloaded again).
 * NOTE(review): the message is heap-allocated here; presumably the caller
 * frees it - confirm against callers.
 */
void cuda_init(cuda_init_resp_t *resp) {
  nvmlReturn_t ret;
  resp->err = NULL;
  // Start from a known state: the load loop below tests the handle before
  // the first assignment, so it must not be left uninitialized.
  resp->ch.handle = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  // Symbol name -> address of the slot in resp->ch that receives it.
  struct lookup {
    const char *s;
    void **p;
  } l[CUDA_LOOKUP_SIZE] = {
      {"nvmlInit_v2", (void *)&resp->ch.initFn},
      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
  };

  // Try each candidate path until one loads.
  for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
    resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
  }
  if (!resp->ch.handle) {
    // TODO improve error message, as the LOAD_ERR will typically have the
    // final path that was checked which might be confusing.
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
             cuda_lib_paths[0], msg ? msg : "unknown error");
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < CUDA_LOOKUP_SIZE; i++) {  // TODO - fix this to use a null terminated list
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    // Test the resolved symbol itself; l[i].p is the address of the slot and
    // is never NULL.
    if (!*l[i].p) {
      // Fetch the loader error before unloading, so the unload call cannot
      // clear or overwrite it.
      char *msg = LOAD_ERR();
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               msg ? msg : "unknown error");
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->ch.initFn)();
  if (ret != NVML_SUCCESS) {
    // Release the library so a failed init does not leave a loaded but
    // unusable handle behind.
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
    resp->err = strdup(buf);
  }

  return;
}

/*
 * Sums total and free memory across all devices reported by NVML.
 *
 * h    - handle and function pointers as filled in by cuda_init().
 * resp - receives the summed totals. On success resp->err is NULL and
 *        resp->total / resp->free hold the sums over every device. On failure
 *        resp->err is a strdup()'d message; total/free then hold only the
 *        devices summed before the failure (zero if none) and should not be
 *        relied on.
 */
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  // Zero the outputs up front so they are defined on every return path.
  resp->total = 0;
  resp->free = 0;
  nvmlDevice_t device;
  nvmlMemory_t memInfo = {0};
  nvmlReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  unsigned int i;  // unsigned to match the device count it is compared with

  if (h.handle == NULL) {
    resp->err = strdup("nvml handle isn't initialized");
    return;
  }

  unsigned int devices = 0;
  ret = (*h.getCount)(&devices);
  if (ret != NVML_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < devices; i++) {
    ret = (*h.getHandle)(i, &device);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "unable to get device handle %u: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    ret = (*h.getMemInfo)(device, &memInfo);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "device memory info lookup failure %u: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    resp->total += memInfo.total;
    resp->free += memInfo.free;
  }
}
#endif  // __APPLE__