// gpu_info_nvcuda.c
#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

#include <string.h>
#include "gpu_info_nvcuda.h"

// Loads the CUDA driver library at nvcuda_lib_path, resolves the driver entry
// points listed below into resp->ch, initializes the driver with cuInit(0),
// records the driver version, and stores the device count in
// resp->num_devices.
//
// resp->ch.verbose is read for logging before this function writes anything
// to resp->ch, so the caller is expected to have set it already.
//
// On success resp->err is NULL and resp->cudaErr is CUDA_SUCCESS.
// On failure:
//   - resp->err is a strdup()'d message (presumably released by the caller --
//     confirm against the call site),
//   - resp->cudaErr is the failing CUresult, or -1 when the library could not
//     be loaded or a symbol could not be resolved,
//   - resp->ch.handle is NULL (the library is unloaded on every failure after
//     a successful load).
// A cuDriverGetVersion failure is only logged; it is not treated as an error.
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
  CUresult ret;
  resp->err = NULL;
  resp->num_devices = 0;
  resp->cudaErr = CUDA_SUCCESS;
  // Zero the reported driver version before anything can fail so that every
  // early-return error path leaves these fields with a defined value.
  resp->ch.driver_major = 0;
  resp->ch.driver_minor = 0;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  // Symbol name -> destination function-pointer slot in resp->ch.
  // The {NULL, NULL} entry terminates the table.
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"cuInit", (void *)&resp->ch.cuInit},
      {"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
      {"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
      {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
      {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
      {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
      {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
      {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
      {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
      {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
      {NULL, NULL},
  };

  resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    char *msg = LOAD_ERR();
    LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, msg);
    snprintf(buf, buflen,
            "Unable to load %s library to query for Nvidia GPUs: %s",
            nvcuda_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    resp->cudaErr = -1;
    return;
  }

  // Resolve every entry point; a single missing symbol aborts initialization.
  for (i = 0; l[i].s != NULL; i++) {
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    if (!*(l[i].p)) {
      char *msg = LOAD_ERR();
      LOG(resp->ch.verbose, "dlerr: %s\n", msg);
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
              msg);
      free(msg);
      resp->err = strdup(buf);
      resp->cudaErr = -1;
      return;
    }
  }

  ret = (*resp->ch.cuInit)(0);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "cuda driver library init failure: %d", ret);
    resp->err = strdup(buf);
    resp->cudaErr = ret;
    return;
  }

  int version = 0;

  // Report driver version if we're in verbose mode, ignore errors
  ret = (*resp->ch.cuDriverGetVersion)(&version);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
  } else {
    // The version is decoded as (1000 * major + 10 * minor).
    resp->ch.driver_major = version / 1000;
    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
    LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
  }

  ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    resp->cudaErr = ret;
    return;
  }
}

const int buflen = 256;
100
void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
101
102
103
104
105
106
107
108
109
  resp->err = NULL;
  nvcudaMemory_t memInfo = {0,0};
  CUresult ret;
  CUdevice device = -1;
  CUcontext ctx = NULL;
  char buf[buflen + 1];
  CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

  if (h.handle == NULL) {
110
    resp->err = strdup("cuda driver library handle isn't initialized");
111
112
113
114
115
    return;
  }

  ret = (*h.cuDeviceGet)(&device, i);
  if (ret != CUDA_SUCCESS) {
116
    snprintf(buf, buflen, "cuda driver library device failed to initialize");
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
    resp->err = strdup(buf);
    return;
  }

  int major = 0;
  int minor = 0;
  ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  if (ret != CUDA_SUCCESS) {
    LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
  } else {
    ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
    if (ret != CUDA_SUCCESS) {
      LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
    } else {
      resp->minor = minor;  
      resp->major = major;  
    }
  }

  ret = (*h.cuDeviceGetUuid)(&uuid, device);
  if (ret != CUDA_SUCCESS) {
    LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
  } else {
    // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
    snprintf(&resp->gpu_id[0], GPU_ID_LEN,
        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
        uuid.bytes[0],
        uuid.bytes[1],
        uuid.bytes[2],
        uuid.bytes[3],
        uuid.bytes[4],
        uuid.bytes[5],
        uuid.bytes[6],
        uuid.bytes[7],
        uuid.bytes[8],
        uuid.bytes[9],
        uuid.bytes[10],
        uuid.bytes[11],
        uuid.bytes[12],
        uuid.bytes[13],
        uuid.bytes[14],
        uuid.bytes[15]
      );
  }

Daniel Hiltgen's avatar
Daniel Hiltgen committed
163
164
165
166
167
168
  ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
  if (ret != CUDA_SUCCESS) {
    LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
    resp->gpu_name[0] = '\0';
  }

169
170
171
  // To get memory we have to set (and release) a context
  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  if (ret != CUDA_SUCCESS) {
172
    snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
173
174
175
176
177
178
    resp->err = strdup(buf);
    return;
  }

  ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
  if (ret != CUDA_SUCCESS) {
179
    snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
    resp->err = strdup(buf);
    // Best effort on failure...
    (*h.cuCtxDestroy)(ctx);
    return;
  }

  resp->total = memInfo.total;
  resp->free = memInfo.free;

  LOG(h.verbose, "[%s] CUDA totalMem %lu mb\n", resp->gpu_id, resp->total / 1024 / 1024);
  LOG(h.verbose, "[%s] CUDA freeMem %lu mb\n", resp->gpu_id, resp->free / 1024 / 1024);
  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);

  

  ret = (*h.cuCtxDestroy)(ctx);
  if (ret != CUDA_SUCCESS) {
197
    LOG(1, "cuda driver library failed to release device context %d", ret);
198
199
200
  }
}

201
void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
202
203
204
205
  CUresult ret;
  CUcontext ctx = NULL;
  CUdevice device = -1;
  *free = 0;
206
  *total = 0;
207
208
209

  ret = (*h.cuDeviceGet)(&device, i);
  if (ret != CUDA_SUCCESS) {
210
    LOG(1, "cuda driver library device failed to initialize");
211
212
213
214
215
216
217
    return;
  }


  // To get memory we have to set (and release) a context
  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  if (ret != CUDA_SUCCESS) {
218
    LOG(1, "cuda driver library failed to get device context %d", ret);
219
220
221
    return;
  }

222
  ret = (*h.cuMemGetInfo_v2)(free, total);
223
  if (ret != CUDA_SUCCESS) {
224
    LOG(1, "cuda driver library device memory info lookup failure %d", ret);
225
226
227
228
229
230
231
    // Best effort on failure...
    (*h.cuCtxDestroy)(ctx);
    return;
  }

  ret = (*h.cuCtxDestroy)(ctx);
  if (ret != CUDA_SUCCESS) {
232
    LOG(1, "cuda driver library failed to release device context %d", ret);
233
234
235
236
  }
}

// Unloads the CUDA driver library referenced by h.
//
// h is passed by value, so this function cannot clear the caller's copy of
// the handle; the caller must not use its handle after this call.
void nvcuda_release(nvcuda_handle_t h) {
  LOG(h.verbose, "releasing cuda driver library\n");
  // Nothing to unload if initialization never succeeded (or already failed
  // and reset the handle to NULL).
  if (h.handle != NULL) {
    UNLOAD_LIBRARY(h.handle);
  }
  // TODO and other context release logic?
}

#endif  // __APPLE__