gpu_info_rocm.c 5.93 KB
Newer Older
1
2
3
4
5
6
#ifndef __APPLE__

#include "gpu_info_rocm.h"

#include <string.h>

7
// Number of entries in the symbol lookup table that rocm_init() declares and
// iterates over. Keep in sync with the number of initializers in that table.
#define ROCM_LOOKUP_SIZE 14
8

9
// Load the ROCm SMI shared library at rocm_lib_path, resolve every entry
// point listed in the lookup table into resp->rh, and call the library's
// init function.
//
// On success resp->err is NULL and resp->rh.handle holds the library handle.
// On any failure resp->err is set to a strdup()'d message; if the library had
// been opened it is unloaded and resp->rh.handle is reset to NULL.
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
  rsmi_status_t ret;
  resp->err = NULL;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;
  // Symbol name -> address of the function pointer slot that receives it.
  struct lookup {
    char *s;
    void **p;
  } l[ROCM_LOOKUP_SIZE] = {
      {"rsmi_init", (void *)&resp->rh.initFn},
      {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
      {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
      {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
      {"rsmi_num_monitor_devices", (void *)&resp->rh.rsmi_num_monitor_devices},
      {"rsmi_dev_id_get", (void *)&resp->rh.rsmi_dev_id_get},
      {"rsmi_dev_name_get", (void *)&resp->rh.rsmi_dev_name_get},
      {"rsmi_dev_brand_get", (void *)&resp->rh.rsmi_dev_brand_get},
      {"rsmi_dev_vendor_name_get", (void *)&resp->rh.rsmi_dev_vendor_name_get},
      {"rsmi_dev_vram_vendor_get", (void *)&resp->rh.rsmi_dev_vram_vendor_get},
      {"rsmi_dev_serial_number_get", (void *)&resp->rh.rsmi_dev_serial_number_get},
      {"rsmi_dev_subsystem_name_get", (void *)&resp->rh.rsmi_dev_subsystem_name_get},
      {"rsmi_dev_vbios_version_get", (void *)&resp->rh.rsmi_dev_vbios_version_get},
  };

  resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
  if (!resp->rh.handle) {
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Radeon GPUs: %s\n",
             rocm_lib_path, msg);
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < ROCM_LOOKUP_SIZE; i++) {
    *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
    // Test the resolved symbol, not the slot address: l[i].p itself is the
    // address of a struct member and is never NULL.
    if (!*l[i].p) {
      // Fetch the loader error before unloading, so the unload cannot
      // clear or replace the message we want to report.
      char *msg = LOAD_ERR();
      UNLOAD_LIBRARY(resp->rh.handle);
      resp->rh.handle = NULL;
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               msg);
      free(msg);
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->rh.initFn)(0);
  if (ret != RSMI_STATUS_SUCCESS) {
    UNLOAD_LIBRARY(resp->rh.handle);
    resp->rh.handle = NULL;
    snprintf(buf, buflen, "rocm vram init failure: %d", ret);
    resp->err = strdup(buf);
  }

  return;
}

// Query every ROCm device visible through handle h and accumulate VRAM totals.
//
// On entry resp->err is cleared. resp->count receives the device count
// reported by rsmi_num_monitor_devices; resp->total is the sum of each
// device's total VRAM and resp->free the sum of (total - used) per device.
// On failure resp->err is set to a strdup()'d message and the function
// returns immediately; if that happens inside the per-device loop,
// resp->total / resp->free hold the sums for the devices processed so far.
// When h.verbose is set, extra per-device identification strings are logged;
// failures of those lookups are logged but do not abort the query.
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  uint64_t totalMem = 0;
  uint64_t usedMem = 0;
  rsmi_status_t ret;
  const int buflen = 256;
  char buf[buflen + 1];
  int i;

  if (h.handle == NULL) {
    resp->err = strdup("rocm handle not initialized");
    return;
  }

  ret = (*h.rsmi_num_monitor_devices)(&resp->count);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, buflen, "unable to get device count: %d", ret);
    resp->err = strdup(buf);
    return;
  }
  LOG(h.verbose, "discovered %d ROCm GPU Devices\n", resp->count);

  resp->total = 0;
  resp->free = 0;
  for (i = 0; i < resp->count; i++) {
    if (h.verbose) {
      // When in verbose mode, report more information about
      // the card we discover, but don't fail on error
      ret = (*h.rsmi_dev_name_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_name_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm device name: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_brand_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_brand_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm brand: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_vendor_name_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_vendor_name_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm vendor: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_vram_vendor_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_vram_vendor_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm VRAM vendor: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_serial_number_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_serial_number_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm S/N: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_subsystem_name_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_subsystem_name_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm subsystem name: %s\n", i, buf);
      }
      ret = (*h.rsmi_dev_vbios_version_get)(i, buf, buflen);
      if (ret != RSMI_STATUS_SUCCESS) {
        LOG(h.verbose, "rsmi_dev_vbios_version_get failed: %d\n", ret);
      } else {
        LOG(h.verbose, "[%d] ROCm vbios version: %s\n", i, buf);
      }
    }

    // Get total memory - used memory for available memory
    ret = (*h.totalMemFn)(i, RSMI_MEM_TYPE_VRAM, &totalMem);
    if (ret != RSMI_STATUS_SUCCESS) {
      snprintf(buf, buflen, "rocm total mem lookup failure: %d", ret);
      resp->err = strdup(buf);
      return;
    }
    ret = (*h.usageMemFn)(i, RSMI_MEM_TYPE_VRAM, &usedMem);
    if (ret != RSMI_STATUS_SUCCESS) {
      snprintf(buf, buflen, "rocm usage mem lookup failure: %d", ret);
      resp->err = strdup(buf);
      return;
    }
    // uint64_t is not `long` on every target (and is unsigned), so widen
    // explicitly and print with a matching unsigned conversion.
    LOG(h.verbose, "[%d] ROCm totalMem %llu\n", i,
        (unsigned long long)totalMem);
    LOG(h.verbose, "[%d] ROCm usedMem %llu\n", i,
        (unsigned long long)usedMem);
    resp->total += totalMem;
    resp->free += totalMem - usedMem;
  }
}

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
// Report the ROCm SMI library version through resp.
//
// resp->str is always set to a strdup()'d string. On success resp->status is
// 0 and resp->str is the major version number formatted in decimal. On
// failure (uninitialized handle or a non-success status from the version
// lookup) resp->status is 1 and resp->str describes the problem.
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
  const int buflen = 256;
  char buf[buflen + 1];
  if (h.handle == NULL) {
    // This is the ROCm code path; name the right library in the message,
    // matching the wording used by rocm_check_vram.
    resp->str = strdup("rocm handle not initialized");
    resp->status = 1;
    return;
  }
  rsmi_version_t ver;
  rsmi_status_t ret;
  ret = h.versionGetFn(&ver);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, buflen, "unexpected response on version lookup %d", ret);
    resp->status = 1;
  } else {
    snprintf(buf, buflen, "%d", ver.major);
    resp->status = 0;
  }
  resp->str = strdup(buf);
}

184
#endif  // __APPLE__