gpu_info_nvcuda.c 7.63 KB
Newer Older
1
2
3
#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

#include <string.h>
4
#include <inttypes.h>
5
6
7
#include "gpu_info_nvcuda.h"

// Opens the CUDA driver library at nvcuda_lib_path, resolves the driver entry
// points listed in the table below into resp->ch, runs cuInit, records the
// driver version (best effort) and stores the device count in
// resp->num_devices.
//
// On failure resp->err receives a strdup'd message and resp->cudaErr is set
// to -1 (library load / symbol lookup problems) or to the failing CUresult.
// Every failure after a successful load also unloads the library and resets
// resp->ch.handle to NULL.
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
  LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);

  const int buflen = 256;
  char errbuf[buflen + 1];
  CUresult status;
  int version = 0;

  resp->err = NULL;
  resp->num_devices = 0;
  resp->cudaErr = CUDA_SUCCESS;

  // Symbol name -> slot in resp->ch that receives the resolved address.
  // The table is terminated by a NULL name.
  struct lookup {
    char *name;
    void **slot;
  } symbols[] = {
      {"cuInit", (void *)&resp->ch.cuInit},
      {"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
      {"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
      {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
      {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
      {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
      {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
      {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
      {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
      {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
      {NULL, NULL},
  };

  // Step 1: load the library itself.
  resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
  if (resp->ch.handle == NULL) {
    char *reason = LOAD_ERR();
    LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, reason);
    snprintf(errbuf, buflen,
             "Unable to load %s library to query for Nvidia GPUs: %s",
             nvcuda_lib_path, reason);
    free(reason);
    resp->err = strdup(errbuf);
    resp->cudaErr = -1;
    return;
  }

  // Step 2: resolve every required entry point; any miss is fatal.
  for (struct lookup *entry = symbols; entry->name != NULL; entry++) {
    *entry->slot = LOAD_SYMBOL(resp->ch.handle, entry->name);
    if (*entry->slot == NULL) {
      char *reason = LOAD_ERR();
      LOG(resp->ch.verbose, "dlerr: %s\n", reason);
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(errbuf, buflen, "symbol lookup for %s failed: %s", entry->name,
               reason);
      free(reason);
      resp->err = strdup(errbuf);
      resp->cudaErr = -1;
      return;
    }
    LOG(resp->ch.verbose, "dlsym: %s - %p\n", entry->name, *entry->slot);
  }

  // Step 3: initialize the driver.
  LOG(resp->ch.verbose, "calling cuInit\n");
  status = resp->ch.cuInit(0);
  if (status != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuInit err: %d\n", status);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(errbuf, buflen, "cuda driver library init failure: %d", status);
    resp->err = strdup(errbuf);
    resp->cudaErr = status;
    return;
  }

  // Step 4: driver version. A failure here is only logged; the version
  // fields then stay at 0.
  resp->ch.driver_major = 0;
  resp->ch.driver_minor = 0;

  LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
  status = resp->ch.cuDriverGetVersion(&version);
  if (status == CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "raw version 0x%x\n", version);
    // The raw value is split as major * 1000 + minor * 10.
    resp->ch.driver_major = version / 1000;
    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
    LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
  } else {
    LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", status);
  }

  // Step 5: device count.
  LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
  status = resp->ch.cuDeviceGetCount(&resp->num_devices);
  if (status != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", status);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(errbuf, buflen, "unable to get device count: %d", status);
    resp->err = strdup(errbuf);
    resp->cudaErr = status;
    return;
  }
  LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
}

// Size limit for the scratch error-message buffer in nvcuda_bootstrap, which
// declares `char buf[buflen + 1]` and passes buflen to snprintf.
// nvcuda_init declares its own local constant of the same name and value.
const int buflen = 256;
108
// Queries device index i through the driver handle h and fills resp with:
//   - major / minor: compute capability (written only when both attribute
//     lookups succeed; otherwise left as the caller passed them in),
//   - gpu_id:   "GPU-<uuid>" string, or the decimal index i if the UUID
//               lookup fails,
//   - gpu_name: device name, or an empty string if the lookup fails,
//   - total / free: device memory as reported by cuMemGetInfo_v2.
//
// resp->err is reset to NULL on entry and set to a strdup'd message when the
// handle is not initialized, the device cannot be obtained, a context cannot
// be created, or the memory query fails. In those cases the function returns
// early and the remaining fields are not written.
void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
  resp->err = NULL;
  nvcudaMemory_t memInfo = {0,0};
  CUresult ret;
  CUdevice device = -1;
  CUcontext ctx = NULL;
  char buf[buflen + 1];
  CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

  if (h.handle == NULL) {
    resp->err = strdup("cuda driver library handle isn't initialized");
    return;
  }

  ret = (*h.cuDeviceGet)(&device, i);
  if (ret != CUDA_SUCCESS) {
    snprintf(buf, buflen, "cuda driver library device failed to initialize");
    resp->err = strdup(buf);
    return;
  }

  // Compute capability is best effort: failures are logged and the fields in
  // resp are left untouched.
  int major = 0;
  int minor = 0;
  ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  if (ret != CUDA_SUCCESS) {
    LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
  } else {
    ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
    if (ret != CUDA_SUCCESS) {
      LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
    } else {
      resp->minor = minor;
      resp->major = major;
    }
  }

  ret = (*h.cuDeviceGetUuid)(&uuid, device);
  if (ret != CUDA_SUCCESS) {
    LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
    // Fall back to the numeric device index as the identifier.
    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
  } else {
    // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
    //
    // Each byte is cast to unsigned char before being passed through the
    // varargs call: if the bytes are a signed char type, values >= 0x80 would
    // otherwise be sign-extended and print as "ffffffXX". The cast is a no-op
    // when the bytes are already unsigned.
    snprintf(&resp->gpu_id[0], GPU_ID_LEN,
        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
        (unsigned char)uuid.bytes[0],
        (unsigned char)uuid.bytes[1],
        (unsigned char)uuid.bytes[2],
        (unsigned char)uuid.bytes[3],
        (unsigned char)uuid.bytes[4],
        (unsigned char)uuid.bytes[5],
        (unsigned char)uuid.bytes[6],
        (unsigned char)uuid.bytes[7],
        (unsigned char)uuid.bytes[8],
        (unsigned char)uuid.bytes[9],
        (unsigned char)uuid.bytes[10],
        (unsigned char)uuid.bytes[11],
        (unsigned char)uuid.bytes[12],
        (unsigned char)uuid.bytes[13],
        (unsigned char)uuid.bytes[14],
        (unsigned char)uuid.bytes[15]
      );
  }

  ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
  if (ret != CUDA_SUCCESS) {
    LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
    resp->gpu_name[0] = '\0';
  }

  // To get memory we have to set (and release) a context
  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  if (ret != CUDA_SUCCESS) {
    snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
    resp->err = strdup(buf);
    return;
  }

  ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
  if (ret != CUDA_SUCCESS) {
    snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
    resp->err = strdup(buf);
    // Best effort on failure...
    (*h.cuCtxDestroy)(ctx);
    return;
  }

  resp->total = memInfo.total;
  resp->free = memInfo.free;

  LOG(h.verbose, "[%s] CUDA totalMem %" PRId64 "mb\n", resp->gpu_id, resp->total / 1024 / 1024);
  LOG(h.verbose, "[%s] CUDA freeMem %" PRId64 "mb\n", resp->gpu_id, resp->free / 1024 / 1024);
  // NOTE(review): major/minor are whatever the caller passed in if the
  // attribute lookups above failed - confirm callers zero-initialize resp.
  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);

  ret = (*h.cuCtxDestroy)(ctx);
  if (ret != CUDA_SUCCESS) {
    // Newline added so this message does not run into the next log line;
    // every other LOG call in this file is newline-terminated.
    LOG(1, "cuda driver library failed to release device context %d\n", ret);
  }
}

209
// Reports the free and total memory of device index i through *free and
// *total, using the driver entry points stored in h.
//
// Both outputs are zeroed on entry and stay zero if the handle is not
// initialized, the device cannot be obtained, or a context cannot be created.
// Failures are logged unconditionally; no error is returned to the caller.
void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
  CUresult ret;
  CUcontext ctx = NULL;
  CUdevice device = -1;
  *free = 0;
  *total = 0;

  // Same guard as nvcuda_bootstrap: nvcuda_init resets the handle to NULL
  // after unloading the library on failure, so the function pointers in h
  // must not be called in that state.
  if (h.handle == NULL) {
    LOG(1, "cuda driver library handle isn't initialized\n");
    return;
  }

  ret = (*h.cuDeviceGet)(&device, i);
  if (ret != CUDA_SUCCESS) {
    LOG(1, "cuda driver library device failed to initialize\n");
    return;
  }

  // To get memory we have to set (and release) a context
  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  if (ret != CUDA_SUCCESS) {
    LOG(1, "cuda driver library failed to get device context %d\n", ret);
    return;
  }

  ret = (*h.cuMemGetInfo_v2)(free, total);
  if (ret != CUDA_SUCCESS) {
    LOG(1, "cuda driver library device memory info lookup failure %d\n", ret);
    // Best effort on failure...
    (*h.cuCtxDestroy)(ctx);
    return;
  }

  ret = (*h.cuCtxDestroy)(ctx);
  if (ret != CUDA_SUCCESS) {
    LOG(1, "cuda driver library failed to release device context %d\n", ret);
  }
}

// Unloads the CUDA driver library referenced by h.
//
// h is passed by value, so this function cannot clear the caller's copy of
// the handle; the caller must reset its own handle to NULL after this call.
// A NULL handle (for example after a failed nvcuda_init) is skipped rather
// than handed to the unload routine.
void nvcuda_release(nvcuda_handle_t h) {
  LOG(h.verbose, "releasing cuda driver library\n");
  if (h.handle != NULL) {
    UNLOAD_LIBRARY(h.handle);
  }
  // TODO and other context release logic?
}

251
#endif  // __APPLE__