// gpu_info_cuda.c
#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

#include "gpu_info_cuda.h"

#include <string.h>

// Candidate names/locations of the NVML shared library. cuda_init walks this
// list in order and stops at the first entry that loads. The list is
// NULL-terminated.
#ifndef _WIN32
const char *cuda_lib_paths[] = {
    "libnvidia-ml.so",
    "/usr/local/cuda/lib64/libnvidia-ml.so",
    "/usr/lib/x86_64-linux-gnu/nvidia/current/libnvidia-ml.so",
    "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1",
    "/usr/lib/wsl/lib/libnvidia-ml.so.1",  // TODO Maybe glob?
    NULL,
};
#else
const char *cuda_lib_paths[] = {
    "nvml.dll",
    // NOTE(review): empty-string entry kept as-is; presumably a placeholder
    // for a second search location - confirm whether it is still needed.
    "",
    NULL,
};
#endif

// Number of NVML entry points that cuda_init resolves.
#define CUDA_LOOKUP_SIZE 6

/*
 * cuda_init - load the NVML library, resolve its entry points and initialize it.
 *
 * Each entry of cuda_lib_paths is tried in order until one loads. The NVML
 * functions listed in the lookup table below are then resolved into resp->ch,
 * and nvmlInit_v2 is invoked through the resolved pointer.
 *
 * resp: caller-provided response struct, written as follows:
 *   - success: resp->err is NULL and resp->ch.handle refers to the loaded
 *     library with all function pointers populated.
 *   - failure: resp->err points to a strdup()-allocated message (presumably
 *     released by the caller - confirm) and resp->ch.handle is NULL; the
 *     library is unloaded again if it had been loaded.
 */
void cuda_init(cuda_init_resp_t *resp) {
  nvmlReturn_t ret;
  char buf[256];
  char *msg;
  int i;

  // NULL-terminated so the resolve loop cannot drift out of sync with the
  // number of entries in the table.
  struct lookup {
    const char *s;  // exported NVML symbol name
    void **p;       // slot in resp->ch that receives the resolved address
  } l[] = {
      {"nvmlInit_v2", (void **)&resp->ch.initFn},
      {"nvmlShutdown", (void **)&resp->ch.shutdownFn},
      {"nvmlDeviceGetHandleByIndex", (void **)&resp->ch.getHandle},
      {"nvmlDeviceGetMemoryInfo", (void **)&resp->ch.getMemInfo},
      {"nvmlDeviceGetCount_v2", (void **)&resp->ch.getCount},
      {"nvmlDeviceGetCudaComputeCapability", (void **)&resp->ch.getComputeCapability},
      {NULL, NULL},
  };

  resp->err = NULL;
  // Start from a known state: the load loop below reads the handle before
  // anything in this function has written it.
  resp->ch.handle = NULL;

  for (i = 0; cuda_lib_paths[i] != NULL && resp->ch.handle == NULL; i++) {
    resp->ch.handle = LOAD_LIBRARY(cuda_lib_paths[i], RTLD_LAZY);
  }
  if (resp->ch.handle == NULL) {
    // TODO improve error message, as the LOAD_ERR will typically have the
    // final path that was checked which might be confusing.
    msg = LOAD_ERR();
    snprintf(buf, sizeof(buf),
             "Unable to load %s library to query for Nvidia GPUs: %s",
             cuda_lib_paths[0], msg != NULL ? msg : "unknown error");
    // Assumes LOAD_ERR() hands back heap memory owned by the caller, as the
    // existing code already relied on - confirm against its definition.
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; l[i].s != NULL; i++) {
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    // Test the resolved address, not the address of the slot (which is never
    // NULL and would hide missing symbols).
    if (*l[i].p == NULL) {
      // Fetch the loader error before unloading so the unload call cannot
      // overwrite or clear it.
      msg = LOAD_ERR();
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, sizeof(buf), "symbol lookup for %s failed: %s", l[i].s,
               msg != NULL ? msg : "unknown error");
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->ch.initFn)();
  if (ret != NVML_SUCCESS) {
    // Do not hand back a handle to a library that failed to initialize;
    // later calls key off handle != NULL.
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, sizeof(buf), "nvml vram init failure: %d", ret);
    resp->err = strdup(buf);
    return;
  }
}

/*
 * cuda_check_vram - sum total and free memory over every NVML device.
 *
 * h:    handle bundle filled in by cuda_init; h.handle must be non-NULL.
 * resp: caller-provided result struct.
 *   - success: resp->err is NULL; resp->total and resp->free hold the sums of
 *     the per-device values reported by getMemInfo.
 *   - failure: resp->err points to a strdup()-allocated message. total/free
 *     are 0 if the failure happened before the device loop, or a partial sum
 *     if a per-device query failed midway.
 */
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  nvmlDevice_t device;
  nvmlMemory_t memInfo = {0};
  nvmlReturn_t ret;
  char buf[256];
  unsigned int devices = 0;
  int i;

  // Give every output a defined value up front so early-error returns never
  // leave total/free unset.
  resp->err = NULL;
  resp->total = 0;
  resp->free = 0;

  if (h.handle == NULL) {
    resp->err = strdup("nvml handle isn't initialized");
    return;
  }

  ret = (*h.getCount)(&devices);
  if (ret != NVML_SUCCESS) {
    snprintf(buf, sizeof(buf), "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  // Explicit cast: the device count is unsigned while the index is printed
  // with %d below.
  for (i = 0; (unsigned int)i < devices; i++) {
    ret = (*h.getHandle)(i, &device);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, sizeof(buf), "unable to get device handle %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    ret = (*h.getMemInfo)(device, &memInfo);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, sizeof(buf), "device memory info lookup failure %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    resp->total += memInfo.total;
    resp->free += memInfo.free;
  }
}

/*
 * cuda_compute_capability - report the lowest compute capability found
 * across all NVML devices.
 *
 * h:    handle bundle filled in by cuda_init; h.handle must be non-NULL.
 * resp: caller-provided result struct. err/major/minor are reset first.
 *   - success: resp->err is NULL and major.minor is the smallest version
 *     seen (0.0 when no device was visited).
 *   - failure: resp->err points to a strdup()-allocated message; major/minor
 *     reflect only the devices visited before the failure.
 */
void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
  enum { MSG_MAX = 256 };
  char text[MSG_MAX + 1];
  nvmlDevice_t dev;
  nvmlReturn_t status;
  unsigned int count;
  int cc_major = 0;
  int cc_minor = 0;
  int idx;

  resp->err = NULL;
  resp->major = 0;
  resp->minor = 0;

  if (h.handle == NULL) {
    resp->err = strdup("nvml handle not initialized");
    return;
  }

  status = h.getCount(&count);
  if (status != NVML_SUCCESS) {
    snprintf(text, MSG_MAX, "unable to get device count: %d", status);
    resp->err = strdup(text);
    return;
  }

  for (idx = 0; (unsigned int)idx < count; idx++) {
    status = h.getHandle(idx, &dev);
    if (status != NVML_SUCCESS) {
      snprintf(text, MSG_MAX, "unable to get device handle %d: %d", idx, status);
      resp->err = strdup(text);
      return;
    }

    status = h.getComputeCapability(dev, &cc_major, &cc_minor);
    if (status != NVML_SUCCESS) {
      snprintf(text, MSG_MAX, "device compute capability lookup failure %d: %d", idx, status);
      resp->err = strdup(text);
      return;
    }

    // Keep the smallest major.minor seen so far, since the weakest device
    // limits compatibility. A stored major of 0 means "nothing recorded yet".
    if (resp->major == 0 || cc_major < resp->major ||
        (cc_major == resp->major && cc_minor < resp->minor)) {
      resp->major = cc_major;
      resp->minor = cc_minor;
    }
  }
}
#endif  // __APPLE__