// gpu_info_rocm.c - Radeon GPU detection and VRAM queries via the ROCm SMI library.
#ifndef __APPLE__

#include "gpu_info_rocm.h"

#include <string.h>

// NULL-terminated list of candidate locations for the ROCm SMI shared
// library. Entries are tried in order by rocm_init() until one loads.
const char *rocm_lib_paths[] = {
#ifdef _WIN32
    // TODO untested
    "rocm_smi64.dll",
    "/opt/rocm/lib/rocm_smi64.dll",
#else
    "librocm_smi64.so",
    "/opt/rocm/lib/librocm_smi64.so",
#endif
    NULL,
};

/**
 * Load the ROCm SMI library, resolve the entry points needed for VRAM
 * queries, and call the library's init function.
 *
 * Each entry of rocm_lib_paths is tried in order until one loads. On success
 * resp->err is NULL and resp->rh holds the library handle plus the resolved
 * function pointers. On any failure resp->err is set to a strdup()'d message
 * (presumably released by the caller - confirm against call sites), the
 * library is unloaded if it had been loaded, and resp->rh.handle is NULL.
 *
 * @param resp  Output structure; must not be NULL.
 */
void rocm_init(rocm_init_resp_t *resp) {
  rsmi_status_t ret;
  resp->err = NULL;
  // The search loop below only runs while the handle is NULL, so start from a
  // known state instead of relying on the caller having zeroed the struct.
  resp->rh.handle = NULL;
  enum { buflen = 256 };
  char buf[buflen + 1];
  size_t i;
  struct lookup {
    const char *s;  // exported symbol name
    void **p;       // where to store the resolved address
  } l[] = {
      {"rsmi_init", (void *)&resp->rh.initFn},
      {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
      {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
      // { "rsmi_dev_id_get", (void*)&resp->rh.getHandle },
  };
  const size_t num_symbols = sizeof(l) / sizeof(l[0]);

  for (i = 0; rocm_lib_paths[i] != NULL && resp->rh.handle == NULL; i++) {
    resp->rh.handle = LOAD_LIBRARY(rocm_lib_paths[i], RTLD_LAZY);
  }
  if (!resp->rh.handle) {
    char *msg = LOAD_ERR();
    snprintf(buf, buflen,
             "Unable to load %s library to query for Radeon GPUs: %s\n",
             rocm_lib_paths[0], msg ? msg : "unknown error");
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < num_symbols; i++) {
    *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
    // Check the resolved address, not the address of the slot it is stored in
    // (the latter is never NULL).
    if (!*l[i].p) {
      // Fetch the loader error before unloading so the unload call cannot
      // overwrite or clear it.
      char *msg = LOAD_ERR();
      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s,
               msg ? msg : "unknown error");
      free(msg);
      UNLOAD_LIBRARY(resp->rh.handle);
      resp->rh.handle = NULL;  // do not leave a dangling handle behind
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->rh.initFn)(0);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, buflen, "rocm vram init failure: %d", (int)ret);
    // The library is unusable if init failed; release it rather than leak it.
    UNLOAD_LIBRARY(resp->rh.handle);
    resp->rh.handle = NULL;
    resp->err = strdup(buf);
  }

  return;
}

/**
 * Query total and free VRAM through an initialized ROCm SMI handle.
 *
 * On success resp->err is NULL, resp->total is the total VRAM reported by the
 * library and resp->free is total minus used. On failure resp->err is set to
 * a strdup()'d message and resp->total / resp->free are left untouched.
 *
 * @param h     Handle populated by rocm_init(); h.handle must be non-NULL.
 * @param resp  Output structure; must not be NULL.
 */
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
  resp->err = NULL;
  uint64_t totalMem = 0;
  uint64_t usedMem = 0;
  rsmi_status_t ret;
  enum { buflen = 256 };
  char buf[buflen + 1];

  if (h.handle == NULL) {
    resp->err = strdup("rocm handle isn't initialized");
    return;
  }

  // TODO - iterate through devices (rsmi_num_monitor_devices); for now only
  // the first argument value 0 is queried.

  // Get total memory - used memory for available memory
  ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, buflen, "rocm total mem lookup failure: %d", (int)ret);
    resp->err = strdup(buf);
    return;
  }
  ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, buflen, "rocm usage mem lookup failure: %d", (int)ret);
    resp->err = strdup(buf);
    return;
  }

  resp->total = totalMem;
  // Unsigned subtraction would wrap to a huge value if the library ever
  // reported more used than total; clamp to zero instead.
  resp->free = (usedMem > totalMem) ? 0 : totalMem - usedMem;
  return;
}

#endif  // __APPLE__