// gpu_info_rocm.c
#ifndef __APPLE__

#include "gpu_info_rocm.h"

#include <string.h>

7
8
#define ROCM_LOOKUP_SIZE 5

9
/*
 * Load the ROCm SMI shared library and prepare it for use.
 *
 * Opens the library at rocm_lib_path, resolves each rsmi_* entry point listed
 * in the lookup table into the matching function pointer in resp->rh, and
 * then calls the resolved init function with argument 0.
 *
 * On success resp->err is NULL and resp->rh.handle refers to the loaded
 * library. On failure resp->err is a strdup()'d message (the code that
 * receives resp is expected to free it) and, if the library had been loaded,
 * it is unloaded and resp->rh.handle is reset to NULL.
 */
void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
  enum { BUFLEN = 256 };
  char buf[BUFLEN + 1];
  rsmi_status_t ret;
  int i;
  struct lookup {
    const char *s; /* symbol name to resolve */
    void **p;      /* where to store the resolved address */
  } l[ROCM_LOOKUP_SIZE] = {
      {"rsmi_init", (void *)&resp->rh.initFn},
      {"rsmi_shut_down", (void *)&resp->rh.shutdownFn},
      {"rsmi_dev_memory_total_get", (void *)&resp->rh.totalMemFn},
      {"rsmi_dev_memory_usage_get", (void *)&resp->rh.usageMemFn},
      {"rsmi_version_get", (void *)&resp->rh.versionGetFn},
  };

  resp->err = NULL;

  resp->rh.handle = LOAD_LIBRARY(rocm_lib_path, RTLD_LAZY);
  if (!resp->rh.handle) {
    char *msg = LOAD_ERR();
    snprintf(buf, BUFLEN,
             "Unable to load %s library to query for Radeon GPUs: %s\n",
             rocm_lib_path, msg ? msg : "unknown error");
    free(msg);
    resp->err = strdup(buf);
    return;
  }

  for (i = 0; i < ROCM_LOOKUP_SIZE; i++) {
    *l[i].p = LOAD_SYMBOL(resp->rh.handle, l[i].s);
    /* Test the resolved address, not the slot's own address (which is never
     * NULL); otherwise a missing symbol would go unnoticed and be called
     * through a NULL function pointer later. */
    if (!*l[i].p) {
      /* Read the loader error before unloading: unloading is itself a loader
       * call and may overwrite or clear the pending error.
       * NOTE(review): assumes LOAD_ERR() wraps dlerror()/GetLastError(). */
      char *msg = LOAD_ERR();
      snprintf(buf, BUFLEN, "symbol lookup for %s failed: %s", l[i].s,
               msg ? msg : "unknown error");
      free(msg);
      UNLOAD_LIBRARY(resp->rh.handle);
      resp->rh.handle = NULL;
      resp->err = strdup(buf);
      return;
    }
  }

  ret = (*resp->rh.initFn)(0);
  if (ret != RSMI_STATUS_SUCCESS) {
    UNLOAD_LIBRARY(resp->rh.handle);
    resp->rh.handle = NULL;
    snprintf(buf, BUFLEN, "rocm vram init failure: %d", ret);
    resp->err = strdup(buf);
  }
}

/*
 * Report total and free VRAM using the function pointers stored in h.
 *
 * Both lookups are made with a first argument of 0 and RSMI_MEM_TYPE_VRAM
 * (the 0 is presumably the device index; only that one device is queried).
 *
 * On success resp->err is NULL, resp->count is 1, resp->total is the reported
 * total and resp->free is total minus used. On failure resp->err is a
 * strdup()'d message and count/total/free are left at 0.
 */
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
  enum { BUFLEN = 256 };
  char buf[BUFLEN + 1];
  uint64_t totalMem = 0;
  uint64_t usedMem = 0;
  rsmi_status_t ret;

  /* Give every output field a defined value so error paths never leave
   * stale data behind. */
  resp->err = NULL;
  resp->count = 0;
  resp->total = 0;
  resp->free = 0;

  if (h.handle == NULL) {
    resp->err = strdup("rocm handle not initialized");
    return;
  }

  // TODO - iterate through all devices (rsmi_num_monitor_devices) instead of
  // querying a single one.

  // Get total memory - used memory for available memory
  ret = (*h.totalMemFn)(0, RSMI_MEM_TYPE_VRAM, &totalMem);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, BUFLEN, "rocm total mem lookup failure: %d", ret);
    resp->err = strdup(buf);
    return;
  }
  ret = (*h.usageMemFn)(0, RSMI_MEM_TYPE_VRAM, &usedMem);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, BUFLEN, "rocm usage mem lookup failure: %d", ret);
    resp->err = strdup(buf);
    return;
  }

  // TODO: set this to the actual number of devices
  resp->count = 1;
  resp->total = totalMem;
  /* Clamp instead of letting the unsigned subtraction wrap to a huge value
   * if the library ever reports used > total. */
  resp->free = (usedMem > totalMem) ? 0 : totalMem - usedMem;
}

109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/*
 * Fetch the ROCm SMI library version through h.versionGetFn.
 *
 * resp->str always receives a strdup()'d string: the major version number on
 * success (resp->status == 0), or an error description on failure
 * (resp->status == 1).
 */
void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
  enum { BUFLEN = 256 };
  char buf[BUFLEN + 1];
  if (h.handle == NULL) {
    /* Message names ROCm (not NVML), matching rocm_check_vram. */
    resp->str = strdup("rocm handle not initialized");
    resp->status = 1;
    return;
  }
  rsmi_version_t ver;
  rsmi_status_t ret;
  ret = h.versionGetFn(&ver);
  if (ret != RSMI_STATUS_SUCCESS) {
    snprintf(buf, BUFLEN, "unexpected response on version lookup %d", ret);
    resp->status = 1;
  } else {
    /* NOTE(review): ROCm SMI declares the version fields as unsigned; the
     * cast keeps the format specifier and argument type in agreement. */
    snprintf(buf, BUFLEN, "%u", (unsigned int)ver.major);
    resp->status = 0;
  }
  resp->str = strdup(buf);
}

130
#endif  // __APPLE__