"examples/dreambooth/requirements_sd3.txt" did not exist on "6fd458e99d0b465bea6a8002aff5357514862751"
gpu_info_nvcuda.c 7.6 KB
Newer Older
1
2
3
4
5
6
#ifndef __APPLE__  // TODO - maybe consider nvidia support on intel macs?

#include <string.h>
#include "gpu_info_nvcuda.h"

/*
 * Loads the CUDA driver library at nvcuda_lib_path, resolves the driver entry
 * points listed in the lookup table into resp->ch, calls cuInit, records the
 * driver version (best effort) and stores the device count in
 * resp->num_devices.
 *
 * On success resp->err is NULL and resp->cudaErr is CUDA_SUCCESS.
 * On failure resp->err holds a strdup'd message, resp->ch.handle is NULL and
 * resp->cudaErr is -1 (library load / symbol lookup failure) or the CUresult
 * returned by the failing driver call.
 */
void nvcuda_init(char *nvcuda_lib_path, nvcuda_init_resp_t *resp) {
  // Fixed-size scratch buffer for error messages (enum constant avoids a VLA).
  enum { ERR_BUF_LEN = 256 };
  char buf[ERR_BUF_LEN + 1];
  CUresult ret;
  int version = 0;
  int i;

  // Symbol name -> destination slot in the handle; terminated by {NULL, NULL}.
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"cuInit", (void *)&resp->ch.cuInit},
      {"cuDriverGetVersion", (void *)&resp->ch.cuDriverGetVersion},
      {"cuDeviceGetCount", (void *)&resp->ch.cuDeviceGetCount},
      {"cuDeviceGet", (void *)&resp->ch.cuDeviceGet},
      {"cuDeviceGetAttribute", (void *)&resp->ch.cuDeviceGetAttribute},
      {"cuDeviceGetUuid", (void *)&resp->ch.cuDeviceGetUuid},
      {"cuDeviceGetName", (void *)&resp->ch.cuDeviceGetName},
      {"cuCtxCreate_v3", (void *)&resp->ch.cuCtxCreate_v3},
      {"cuMemGetInfo_v2", (void *)&resp->ch.cuMemGetInfo_v2},
      {"cuCtxDestroy", (void *)&resp->ch.cuCtxDestroy},
      {NULL, NULL},
  };

  LOG(resp->ch.verbose, "initializing %s\n", nvcuda_lib_path);

  // Put every output field into a defined state up front so that all early
  // returns below leave the response fully initialized.
  resp->err = NULL;
  resp->num_devices = 0;
  resp->cudaErr = CUDA_SUCCESS;
  resp->ch.driver_major = 0;
  resp->ch.driver_minor = 0;

  resp->ch.handle = LOAD_LIBRARY(nvcuda_lib_path, RTLD_LAZY);
  if (!resp->ch.handle) {
    char *msg = LOAD_ERR();
    // LOAD_ERR() is defined elsewhere; guard against it yielding NULL so we
    // never hand a NULL pointer to a %s conversion.
    const char *reason = msg ? msg : "unknown error";
    LOG(resp->ch.verbose, "library %s load err: %s\n", nvcuda_lib_path, reason);
    snprintf(buf, ERR_BUF_LEN,
            "Unable to load %s library to query for Nvidia GPUs: %s",
            nvcuda_lib_path, reason);
    free(msg);
    resp->err = strdup(buf);
    resp->cudaErr = -1;
    return;
  }

  for (i = 0; l[i].s != NULL; i++) {
    *l[i].p = LOAD_SYMBOL(resp->ch.handle, l[i].s);
    if (!*(l[i].p)) {
      // Fetch the loader error before unloading the library.
      char *msg = LOAD_ERR();
      const char *reason = msg ? msg : "unknown error";
      LOG(resp->ch.verbose, "dlerr: %s\n", reason);
      UNLOAD_LIBRARY(resp->ch.handle);
      resp->ch.handle = NULL;
      snprintf(buf, ERR_BUF_LEN, "symbol lookup for %s failed: %s", l[i].s,
              reason);
      free(msg);
      resp->err = strdup(buf);
      resp->cudaErr = -1;
      return;
    }
    LOG(resp->ch.verbose, "dlsym: %s - %p\n", l[i].s, *l[i].p);
  }

  LOG(resp->ch.verbose, "calling cuInit\n");
  ret = (*resp->ch.cuInit)(0);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuInit err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, ERR_BUF_LEN, "cuda driver library init failure: %d", ret);
    resp->err = strdup(buf);
    resp->cudaErr = ret;
    return;
  }

  // Report driver version if we're in verbose mode, ignore errors
  LOG(resp->ch.verbose, "calling cuDriverGetVersion\n");
  ret = (*resp->ch.cuDriverGetVersion)(&version);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDriverGetVersion failed: %d\n", ret);
  } else {
    LOG(resp->ch.verbose, "raw version 0x%x\n", version);
    // Split the packed value: thousands -> major, remaining tens -> minor.
    resp->ch.driver_major = version / 1000;
    resp->ch.driver_minor = (version - (resp->ch.driver_major * 1000)) / 10;
    LOG(resp->ch.verbose, "CUDA driver version: %d.%d\n", resp->ch.driver_major, resp->ch.driver_minor);
  }

  LOG(resp->ch.verbose, "calling cuDeviceGetCount\n");
  ret = (*resp->ch.cuDeviceGetCount)(&resp->num_devices);
  if (ret != CUDA_SUCCESS) {
    LOG(resp->ch.verbose, "cuDeviceGetCount err: %d\n", ret);
    UNLOAD_LIBRARY(resp->ch.handle);
    resp->ch.handle = NULL;
    snprintf(buf, ERR_BUF_LEN, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    resp->cudaErr = ret;
    return;
  }
  LOG(resp->ch.verbose, "device count %d\n", resp->num_devices);
}

// Capacity, in characters, of the error-message scratch buffer declared in
// nvcuda_bootstrap (the buffer itself is declared one byte larger).
const int buflen = 256;
107
/*
 * Queries device index i through the already-loaded driver handle h and fills
 * resp with its compute capability, id string, name and free/total memory.
 *
 * resp->err is NULL on success. It is set to a strdup'd message when the
 * handle is not initialized, the device cannot be obtained, a context cannot
 * be created, or the memory query fails. Failures of the compute-capability,
 * UUID and name lookups are only logged (in verbose mode) and do not set
 * resp->err.
 */
void nvcuda_bootstrap(nvcuda_handle_t h, int i, mem_info_t *resp) {
  resp->err = NULL;
  // Give the compute capability a defined value: the lookups below are best
  // effort and the fields are read by the final log statement.
  resp->major = 0;
  resp->minor = 0;
  nvcudaMemory_t memInfo = {0,0};
  CUresult ret;
  CUdevice device = -1;
  CUcontext ctx = NULL;
  char buf[buflen + 1];
  CUuuid uuid = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

  if (h.handle == NULL) {
    resp->err = strdup("cuda driver library handle isn't initialized");
    return;
  }

  ret = (*h.cuDeviceGet)(&device, i);
  if (ret != CUDA_SUCCESS) {
    snprintf(buf, buflen, "cuda driver library device failed to initialize");
    resp->err = strdup(buf);
    return;
  }

  // Compute capability: only published when both halves were read.
  int major = 0;
  int minor = 0;
  ret = (*h.cuDeviceGetAttribute)(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
  if (ret != CUDA_SUCCESS) {
    LOG(h.verbose, "[%d] device major lookup failure: %d\n", i, ret);
  } else {
    ret = (*h.cuDeviceGetAttribute)(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
    if (ret != CUDA_SUCCESS) {
      LOG(h.verbose, "[%d] device minor lookup failure: %d\n", i, ret);
    } else {
      resp->minor = minor;
      resp->major = major;
    }
  }

  ret = (*h.cuDeviceGetUuid)(&uuid, device);
  if (ret != CUDA_SUCCESS) {
    // Fall back to the plain device index as the id.
    LOG(h.verbose, "[%d] device uuid lookup failure: %d\n", i, ret);
    snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", i);
  } else {
    // GPU-d110a105-ac29-1d54-7b49-9c90440f215b
    snprintf(&resp->gpu_id[0], GPU_ID_LEN,
        "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
        uuid.bytes[0],
        uuid.bytes[1],
        uuid.bytes[2],
        uuid.bytes[3],
        uuid.bytes[4],
        uuid.bytes[5],
        uuid.bytes[6],
        uuid.bytes[7],
        uuid.bytes[8],
        uuid.bytes[9],
        uuid.bytes[10],
        uuid.bytes[11],
        uuid.bytes[12],
        uuid.bytes[13],
        uuid.bytes[14],
        uuid.bytes[15]
      );
  }

  ret = (*h.cuDeviceGetName)(&resp->gpu_name[0], GPU_NAME_LEN, device);
  if (ret != CUDA_SUCCESS) {
    LOG(h.verbose, "[%d] device name lookup failure: %d\n", i, ret);
    resp->gpu_name[0] = '\0';
  }

  // To get memory we have to set (and release) a context
  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  if (ret != CUDA_SUCCESS) {
    snprintf(buf, buflen, "cuda driver library failed to get device context %d", ret);
    resp->err = strdup(buf);
    return;
  }

  ret = (*h.cuMemGetInfo_v2)(&memInfo.free, &memInfo.total);
  if (ret != CUDA_SUCCESS) {
    snprintf(buf, buflen, "cuda driver library device memory info lookup failure %d", ret);
    resp->err = strdup(buf);
    // Best effort on failure...
    (*h.cuCtxDestroy)(ctx);
    return;
  }

  resp->total = memInfo.total;
  resp->free = memInfo.free;

  // Cast explicitly so the argument type always matches %llu.
  LOG(h.verbose, "[%s] CUDA totalMem %llu mb\n", resp->gpu_id, (unsigned long long)(resp->total / 1024 / 1024));
  LOG(h.verbose, "[%s] CUDA freeMem %llu mb\n", resp->gpu_id, (unsigned long long)(resp->free / 1024 / 1024));
  LOG(h.verbose, "[%s] Compute Capability %d.%d\n", resp->gpu_id, resp->major, resp->minor);

  ret = (*h.cuCtxDestroy)(ctx);
  if (ret != CUDA_SUCCESS) {
    // Logged unconditionally; newline added so it does not run into the next
    // log line.
    LOG(1, "cuda driver library failed to release device context %d\n", ret);
  }
}

208
/*
 * Reads the free and total memory of device index i through the loaded driver
 * handle h.
 *
 * *free and *total are zeroed first and stay zero on any failure; failures
 * are logged unconditionally and otherwise not reported to the caller.
 */
void nvcuda_get_free(nvcuda_handle_t h, int i, uint64_t *free, uint64_t *total) {
  CUresult ret;
  CUcontext ctx = NULL;
  CUdevice device = -1;
  *free = 0;
  *total = 0;

  // Same guard as nvcuda_bootstrap: without a loaded library the function
  // pointers in h must not be called.
  if (h.handle == NULL) {
    LOG(1, "cuda driver library handle isn't initialized\n");
    return;
  }

  ret = (*h.cuDeviceGet)(&device, i);
  if (ret != CUDA_SUCCESS) {
    LOG(1, "cuda driver library device failed to initialize\n");
    return;
  }

  // To get memory we have to set (and release) a context
  ret = (*h.cuCtxCreate_v3)(&ctx, NULL, 0, 0, device);
  if (ret != CUDA_SUCCESS) {
    LOG(1, "cuda driver library failed to get device context %d\n", ret);
    return;
  }

  ret = (*h.cuMemGetInfo_v2)(free, total);
  if (ret != CUDA_SUCCESS) {
    LOG(1, "cuda driver library device memory info lookup failure %d\n", ret);
    // Keep the "zero on failure" result even if the call touched the outputs.
    *free = 0;
    *total = 0;
    // Best effort on failure...
    (*h.cuCtxDestroy)(ctx);
    return;
  }

  ret = (*h.cuCtxDestroy)(ctx);
  if (ret != CUDA_SUCCESS) {
    LOG(1, "cuda driver library failed to release device context %d\n", ret);
  }
}

/*
 * Unloads the CUDA driver library referenced by h.
 *
 * h is passed by value, so this function cannot clear the caller's copy of
 * the handle; the caller's handle must not be used after this call.
 */
void nvcuda_release(nvcuda_handle_t h) {
  LOG(h.verbose, "releasing cuda driver library\n");
  // Nothing to unload if the library was never loaded (or already failed).
  if (h.handle == NULL) {
    return;
  }
  UNLOAD_LIBRARY(h.handle);
  // TODO and other context release logic?
}

#endif  // __APPLE__