gpu_info_rocm.c 6.36 KB
Newer Older
1
2
3
4
5
6
#ifndef __APPLE__

#include "gpu_info_rocm.h"

#include <string.h>

7
// Loads the ROCm SMI shared library at rocm_lib_path, resolves every entry
// point listed in the lookup table into resp->rh, and calls rsmi_init(0).
//
// On success resp->err is NULL and resp->rh.handle holds the library handle.
// On any failure resp->err is set to a strdup()'d message (the caller is
// expected to release it) and resp->rh.handle is left NULL.
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
  rsmi_status_t ret;
  resp->err = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;
  // Symbol name -> address of the function-pointer slot that receives it.
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"rsmi_init", (void *)&resp->rh.rsmi_init},
      {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
      {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
      {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
      {"rsmi_num_monitor_devices", (void *)&resp->rh.rsmi_num_monitor_devices},
      {"rsmi_dev_id_get", (void *)&resp->rh.rsmi_dev_id_get},
      {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
      {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
      {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
      {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
      {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
      {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
      {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
      {NULL, NULL},  // sentinel terminating the table
  };

  resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
  if (!resp->rh.handle) {
    // NOTE(review): msg is free()'d below, so LOAD_ERR() is assumed to
    // return heap-allocated storage - confirm against its definition.
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Radeon GPUs: %s\n",
             rocm_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  // TODO once we've squashed the remaining corner cases remove this log
  LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);

  for (i = 0; l[i].s != NULL; i++) {
    // TODO once we've squashed the remaining corner cases remove this log
    LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);

    *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
    // Test the resolved symbol (*l[i].p), not the slot address (l[i].p),
    // which is never NULL.
    if (!*l[i].p) {
      // Fetch the loader error before unloading so it is not overwritten.
      char *msg = LOAD_ERR();
      LOG(resp->rh.verbose, "dlerr: %s\n", msg);
      // Unload first, then clear the handle; the reverse order would pass
      // NULL to UNLOAD_LIBRARY and leak the loaded library.
      UNLOAD_LIBRARY(resp->rh.handle);
      resp->rh.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->rh.rsmi_init)(0);
  if (ret != RSMI_STATUS_SUCCESS) {
    LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
    UNLOAD_LIBRARY(resp->rh.handle);
    resp->rh.handle = NULL;
    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
    resp->err = strdup(buf);
  }

  return;
}

// Logs the identifying strings (name, brand, vendor, VRAM vendor, serial
// number, subsystem name, vbios version) reported for device i. A failed
// query is logged and otherwise ignored, so this never aborts the caller.
static void rocm_log_device_details(rocm_handle_t h, int i) {
  const int buflen = 256;
  char buf[buflen + 1];
  rsmi_status_t ret;

  ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
  if (ret != RSMI_STATUS_SUCCESS) {
    LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
  } else {
    LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
  }
  ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
  if (ret != RSMI_STATUS_SUCCESS) {
    LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
  } else {
    LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
  }
  ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
  if (ret != RSMI_STATUS_SUCCESS) {
    LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
  } else {
    LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
  }
  ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
  if (ret != RSMI_STATUS_SUCCESS) {
    LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
  } else {
    LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
  }
  ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
  if (ret != RSMI_STATUS_SUCCESS) {
    LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
  } else {
    LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
  }
  ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
  if (ret != RSMI_STATUS_SUCCESS) {
    LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
  } else {
    LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
  }
  ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
  if (ret != RSMI_STATUS_SUCCESS) {
    LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
  } else {
    LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
  }
}

// Sums VRAM totals across all devices reported by rsmi_num_monitor_devices.
//
// On success resp->err is NULL, resp->count holds the device count,
// resp->total the summed total VRAM and resp->free the summed
// (total - used) VRAM. On failure resp->err is set to a strdup()'d message
// (the caller is expected to release it) and the function returns early;
// resp->total / resp->free may then hold partial sums.
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  uint64_t totalMem = 0;
  uint64_t usedMem = 0;
  rsmi_status_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  if (h.handle == NULL) {
    resp->err = strdup("rocm handle not initialized");
    return;
  }

  ret = (*h.rsmi_num_monitor_devices)(&resp->count);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }
  LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);

  resp->total = 0;
  resp->free = 0;
  for (i = 0; i < resp->count; i++) {
    if (h.verbose) {
      // When in verbose mode, report more information about
      // the card we discover, but don't fail on error
      rocm_log_device_details(h, i);
    }

    // Get total memory - used memory for available memory
    ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
    if (ret != RSMI_STATUS_SUCCESS) {
      snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
      resp->err = strdup(buf);
      return;
    }
    ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
    if (ret != RSMI_STATUS_SUCCESS) {
      snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
      resp->err = strdup(buf);
      return;
    }
    // uint64_t is not `long` on every target (e.g. LLP64), so print through
    // an explicit unsigned long long cast instead of %ld.
    LOG(h.verbose, "[%d] ROCm totalMem %llu\n", i, (unsigned long long)totalMem);
    LOG(h.verbose, "[%d] ROCm usedMem %llu\n", i, (unsigned long long)usedMem);
    resp->total += totalMem;
    resp->free += totalMem - usedMem;
  }
}

170
171
172
173
174
175
176
177
178
179
// Queries the library version through rsmi_version_get.
//
// resp->str always receives a strdup()'d string (the caller is expected to
// release it): the major version number on success, or an error description
// otherwise. resp->status is 0 on success and 1 on any failure.
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
  const int buflen = 256;
  char buf[buflen + 1];
  if (h.handle == NULL) {
    // Previously said "nvml"; this is the ROCm module, and the wording now
    // matches the same check in rocm_check_vram.
    resp->str = strdup("rocm handle not initialized");
    resp->status = 1;
    return;
  }
  rsmi_version_t ver;
  rsmi_status_t ret;
  ret = h.rsmi_version_get(&ver);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
    resp->status = 1;
  } else {
    // Only the major component is reported.
    snprintf(buf, buflen, "%d", ver.major);
    resp->status = 0;
  }
  resp->str = strdup(buf);
}

191
#endif  // __APPLE__