gpu_info_cuda.c 4.03 KB
Newer Older
1
2
3
4
5
6
#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

#include "gpu_info_cuda.h"

#include <string.h>

7
#define CUDA_LOOKUP_SIZE 6
8

9
// Loads the NVML shared library at cuda_lib_path, resolves every entry point
// named in the lookup table into the matching function pointer in resp->ch,
// and then calls the library's init function.
//
// On success resp->err is NULL and resp->ch.handle holds the loaded library.
// On failure resp->err is a strdup()'d message and resp->ch.handle is left
// falsy/NULL (the library is unloaded if it had been loaded).
// NOTE(review): the caller presumably owns and frees resp->err - confirm.
void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
  nvmlReturn_t ret;
  resp->err = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  // Symbol name -> address of the function-pointer slot that receives it.
  struct lookup {
    char *s;
    void **p;
  } l[CUDA_LOOKUP_SIZE] = {
      {"nvmlInit_v2", (void *)&resp->ch.initFn},
      {"nvmlShutdown", (void *)&resp->ch.shutdownFn},
      {"nvmlDeviceGetHandleByIndex", (void *)&resp->ch.getHandle},
      {"nvmlDeviceGetMemoryInfo", (void *)&resp->ch.getMemInfo},
      {"nvmlDeviceGetCount_v2", (void *)&resp->ch.getCount},
      {"nvmlDeviceGetCudaComputeCapability", (void *)&resp->ch.getComputeCapability},
  };

  resp->ch.handle = LOAD_LIBRARY(cuda_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
             cuda_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < CUDA_LOOKUP_SIZE; i++) {  // TODO - fix this to use a null terminated list
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    // Test the resolved symbol (*l[i].p), not the slot address (l[i].p):
    // the slot address is never NULL, so checking it would hide every
    // failed lookup and lead to a call through a NULL function pointer.
    if (!*l[i].p) {
      // Capture the loader error before unloading; unloading the library
      // may replace or clear the pending error text.
      char *msg = LOAD_ERR();
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->ch.initFn)();
  if (ret != NVML_SUCCESS) {
    // Init failed: drop the library so later calls see a NULL handle.
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "nvml vram init failure: %d", ret);
    resp->err = strdup(buf);
    return;
  }
}

// Queries every device reported by the loaded NVML library and accumulates
// their memory figures into resp.
//
// h     - handle table filled in by cuda_init; h.handle must be non-NULL.
// resp  - receives the device count (resp->count) and the sums of each
//         device's memInfo.total / memInfo.free (resp->total, resp->free).
//
// On failure resp->err is set to a strdup()'d message and the function
// returns early; resp->total / resp->free then hold whatever was summed
// before the failing device (zero if no device was read). On success
// resp->err is NULL.
void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  // Zero the totals up front so early error returns never leave them unset.
  resp->total = 0;
  resp->free = 0;
  nvmlDevice_t device;
  nvmlMemory_t memInfo = {0};
  nvmlReturn_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  if (h.handle == NULL) {
    resp->err = strdup("nvml handle isn't initialized");
    return;
  }

  ret = (*h.getCount)(&resp->count);
  if (ret != NVML_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < resp->count; i++) {
    ret = (*h.getHandle)(i, &device);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "unable to get device handle %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    ret = (*h.getMemInfo)(device, &memInfo);
    if (ret != NVML_SUCCESS) {
      snprintf(buf, buflen, "device memory info lookup failure %d: %d", i, ret);
      resp->err = strdup(buf);
      return;
    }

    // Aggregate across all devices rather than reporting per-device values.
    resp->total += memInfo.total;
    resp->free += memInfo.free;
  }
}
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154

// Reports the lowest compute capability (major.minor) found among the
// devices the loaded NVML library enumerates.
//
// h     - handle table filled in by cuda_init; h.handle must be non-NULL.
// resp  - resp->major / resp->minor receive the lowest version seen; both
//         stay 0 when no device is enumerated. resp->err is NULL on success
//         or a strdup()'d message on the first failing NVML call.
void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
  enum { MSG_LIMIT = 256 };
  char msg[MSG_LIMIT + 1];
  nvmlDevice_t dev;
  nvmlReturn_t rc;
  unsigned int num_devices;
  int dev_major = 0;
  int dev_minor = 0;
  int idx;

  resp->err = NULL;
  resp->major = 0;
  resp->minor = 0;

  if (h.handle == NULL) {
    resp->err = strdup("nvml handle not initialized");
    return;
  }

  rc = (*h.getCount)(&num_devices);
  if (rc != NVML_SUCCESS) {
    snprintf(msg, MSG_LIMIT, "unable to get device count: %d", rc);
    resp->err = strdup(msg);
    return;
  }

  for (idx = 0; idx < num_devices; idx++) {
    rc = (*h.getHandle)(idx, &dev);
    if (rc != NVML_SUCCESS) {
      snprintf(msg, MSG_LIMIT, "unable to get device handle %d: %d", idx, rc);
      resp->err = strdup(msg);
      return;
    }

    rc = (*h.getComputeCapability)(dev, &dev_major, &dev_minor);
    if (rc != NVML_SUCCESS) {
      snprintf(msg, MSG_LIMIT, "device compute capability lookup failure %d: %d", idx, rc);
      resp->err = strdup(msg);
      return;
    }

    // Keep the smallest version seen so far; the weakest device bounds
    // what can be relied on. A recorded major of 0 means "nothing yet".
    if (resp->major != 0 && resp->major <= dev_major) {
      // Recorded major is already no higher than this device's; only a
      // lower minor at the same major can still tighten the result.
      if (resp->major == dev_major && resp->minor > dev_minor) {
        resp->minor = dev_minor;
      }
      continue;
    }

    // First device, or a strictly lower major: take both components.
    resp->major = dev_major;
    resp->minor = dev_minor;
  }
}
155
#endif  // __APPLE__