gpu_info_rocm.c 6.63 KB
Newer Older
1
2
3
4
5
6
#ifndef __APPLE__

#include "gpu_info_rocm.h"

#include <string.h>

7
// Load the ROCm SMI shared library at rocm_lib_path, resolve the RSMI entry
// points listed in the lookup table below into resp->rh, and call rsmi_init(0).
//
// On success resp->err is NULL and resp->rh.handle holds the library handle.
// On any failure resp->err is set to a strdup()'d message (presumably freed by
// the caller -- confirm against the call site) and resp->rh.handle is NULL.
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
  rsmi_status_t ret;
  resp->err = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;
  // Symbol name -> address of the function-pointer slot that receives it.
  struct lookup {
    char *s;
    void **p;
  } l[] = {
      {"rsmi_init", (void *)&resp->rh.rsmi_init},
      {"rsmi_shut_down", (void *)&resp->rh.rsmi_shut_down},
      {"rsmi_dev_memory_total_get", (void *)&resp->rh.rsmi_dev_memory_total_get},
      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.rsmi_dev_memory_usage_get},
      {"rsmi_version_get", (void *)&resp->rh.rsmi_version_get},
      {"rsmi_num_monitor_devices", (void *)&resp->rh.rsmi_num_monitor_devices},
      {"rsmi_dev_id_get", (void *)&resp->rh.rsmi_dev_id_get},
      {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
      {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
      {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
      {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
      {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
      {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
      {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
      {NULL, NULL},  // sentinel terminating the table
  };

  resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
  if (!resp->rh.handle) {
    // NOTE(review): msg is freed below, so LOAD_ERR() presumably returns a
    // heap-allocated copy of the loader error -- confirm in the macro.
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Radeon GPUs: %s\n",
             rocm_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  // TODO once we've squashed the remaining corner cases remove this log
  LOG(resp->rh.verbose, "wiring rocm management library functions in %s\n", rocm_lib_path);

  for (i = 0; l[i].s != NULL; i++) {
    // TODO once we've squashed the remaining corner cases remove this log
    LOG(resp->rh.verbose, "dlsym: %s\n", l[i].s);

    *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
    // Test the resolved symbol itself; l[i].p is the address of the slot and
    // is never NULL, so checking it would let a missing symbol slip through.
    if (!*l[i].p) {
      char *msg = LOAD_ERR();
      LOG(resp->rh.verbose, "dlerr: %s\n", msg);
      // Unload while the handle is still valid, then clear it so later calls
      // see an uninitialized handle.
      UNLOAD_LIBRARY(resp->rh.handle);
      resp->rh.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->rh.rsmi_init)(0);
  if (ret != RSMI_STATUS_SUCCESS) {
    LOG(resp->rh.verbose, "rsmi_init err: %d\n", ret);
    UNLOAD_LIBRARY(resp->rh.handle);
    resp->rh.handle = NULL;
    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
    resp->err = strdup(buf);
  }

  return;
}

// Query every ROCm device visible through handle h and accumulate VRAM
// totals into resp.
//
// resp->count receives the device count reported by rsmi_num_monitor_devices.
// resp->total / resp->free are the sums of total and (total - used) VRAM over
// all devices that report at least 1 GiB; smaller devices are treated as
// integrated GPUs, skipped in the sums, and the last such index is stored in
// resp->igpu_index (-1 when none is seen).
//
// On failure resp->err is set to a strdup()'d message and the function returns
// early; the other fields are then only partially filled in.
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  resp->igpu_index = -1;
  uint64_t totalMem = 0;
  uint64_t usedMem = 0;
  rsmi_status_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  if (h.handle == NULL) {
    resp->err = strdup("rocm handle not initialized");
    return;
  }

  ret = (*h.rsmi_num_monitor_devices)(&resp->count);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }
  LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);

  resp->total = 0;
  resp->free = 0;
  for (i = 0; i < resp->count; i++) {
    if (h.verbose) {
      // When in verbose mode, report more information about
      // the card we discover, but don't fail on error
      ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
      }
    }

    // Get total memory - used memory for available memory
    ret = (*h.rsmi_dev_memory_total_get)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
    if (ret != RSMI_STATUS_SUCCESS) {
      snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
      resp->err = strdup(buf);
      return;
    }
    ret = (*h.rsmi_dev_memory_usage_get)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
    if (ret != RSMI_STATUS_SUCCESS) {
      snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
      resp->err = strdup(buf);
      return;
    }
    // uint64_t is not `long` on every target (e.g. LLP64), so print through
    // an explicit unsigned long long conversion instead of %ld.
    LOG(h.verbose, "[%d] ROCm totalMem %llu\n", i, (unsigned long long)totalMem);
    LOG(h.verbose, "[%d] ROCm usedMem %llu\n", i, (unsigned long long)usedMem);
    if (totalMem < 1024 * 1024 * 1024) {
      // Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory
      LOG(h.verbose, "[%d] ROCm integrated GPU\n", i);
      resp->igpu_index = i;
    } else {
      resp->total += totalMem;
      resp->free += totalMem - usedMem;
    }
  }
}

177
178
179
180
// Report the ROCm SMI library's major version as a string.
//
// resp->str always receives a strdup()'d string: the major version number on
// success (resp->status == 0), or a description of the problem otherwise
// (resp->status == 1).
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
  const int text_cap = 256;
  char text[text_cap + 1];

  // Nothing can be queried without a loaded library.
  if (!h.handle) {
    resp->str = strdup("rocm handle not initialized");
    resp->status = 1;
    return;
  }

  rsmi_version_t version;
  rsmi_status_t rc = (*h.rsmi_version_get)(&version);

  if (rc == RSMI_STATUS_SUCCESS) {
    snprintf(text, text_cap, "%d", version.major);
    resp->status = 0;
  } else {
    snprintf(text, text_cap, "unexpected response on version lookup %d", rc);
    resp->status = 1;
  }

  resp->str = strdup(text);
}

198
#endif  // __APPLE__