Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
a170888d
Unverified
Commit
a170888d
authored
Jan 24, 2024
by
Daniel Hiltgen
Committed by
GitHub
Jan 24, 2024
Browse files
Merge pull request #2174 from dhiltgen/rocm_real_gpus
More logging for gpu management
parents
f63dc2db
013fd071
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
61 additions
and
44 deletions
+61
-44
gpu/gpu.go
gpu/gpu.go
+3
-1
gpu/gpu_info_cuda.c
gpu/gpu_info_cuda.c
+26
-18
gpu/gpu_info_cuda.h
gpu/gpu_info_cuda.h
+6
-6
gpu/gpu_info_rocm.c
gpu/gpu_info_rocm.c
+21
-14
gpu/gpu_info_rocm.h
gpu/gpu_info_rocm.h
+5
-5
No files found.
gpu/gpu.go
View file @
a170888d
...
@@ -40,11 +40,13 @@ var CudaLinuxGlobs = []string{
...
@@ -40,11 +40,13 @@ var CudaLinuxGlobs = []string{
"/usr/lib/wsl/lib/libnvidia-ml.so*"
,
"/usr/lib/wsl/lib/libnvidia-ml.so*"
,
"/usr/lib/wsl/drivers/*/libnvidia-ml.so*"
,
"/usr/lib/wsl/drivers/*/libnvidia-ml.so*"
,
"/opt/cuda/lib64/libnvidia-ml.so*"
,
"/opt/cuda/lib64/libnvidia-ml.so*"
,
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*"
,
"/usr/lib*/libnvidia-ml.so*"
,
"/usr/lib*/libnvidia-ml.so*"
,
"/usr/local/lib*/libnvidia-ml.so*"
,
"/usr/local/lib*/libnvidia-ml.so*"
,
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*"
,
"/usr/lib/aarch64-linux-gnu/nvidia/current/libnvidia-ml.so*"
,
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*"
,
"/usr/lib/aarch64-linux-gnu/libnvidia-ml.so*"
,
// TODO: are these stubs ever valid?
"/opt/cuda/targets/x86_64-linux/lib/stubs/libnvidia-ml.so*"
,
}
}
var
CudaWindowsGlobs
=
[]
string
{
var
CudaWindowsGlobs
=
[]
string
{
...
...
gpu/gpu_info_cuda.c
View file @
a170888d
...
@@ -4,8 +4,6 @@
...
@@ -4,8 +4,6 @@
#include <string.h>
#include <string.h>
#define CUDA_LOOKUP_SIZE 12
void
cuda_init
(
char
*
cuda_lib_path
,
cuda_init_resp_t
*
resp
)
{
void
cuda_init
(
char
*
cuda_lib_path
,
cuda_init_resp_t
*
resp
)
{
nvmlReturn_t
ret
;
nvmlReturn_t
ret
;
resp
->
err
=
NULL
;
resp
->
err
=
NULL
;
...
@@ -16,24 +14,26 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
...
@@ -16,24 +14,26 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
struct
lookup
{
struct
lookup
{
char
*
s
;
char
*
s
;
void
**
p
;
void
**
p
;
}
l
[
CUDA_LOOKUP_SIZE
]
=
{
}
l
[]
=
{
{
"nvmlInit_v2"
,
(
void
*
)
&
resp
->
ch
.
initFn
},
{
"nvmlInit_v2"
,
(
void
*
)
&
resp
->
ch
.
nvmlInit_v2
},
{
"nvmlShutdown"
,
(
void
*
)
&
resp
->
ch
.
s
hutdown
Fn
},
{
"nvmlShutdown"
,
(
void
*
)
&
resp
->
ch
.
nvmlS
hutdown
},
{
"nvmlDeviceGetHandleByIndex"
,
(
void
*
)
&
resp
->
ch
.
g
etHandle
},
{
"nvmlDeviceGetHandleByIndex"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceG
etHandle
ByIndex
},
{
"nvmlDeviceGetMemoryInfo"
,
(
void
*
)
&
resp
->
ch
.
g
etMemInfo
},
{
"nvmlDeviceGetMemoryInfo"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceG
etMem
ory
Info
},
{
"nvmlDeviceGetCount_v2"
,
(
void
*
)
&
resp
->
ch
.
g
etCount
},
{
"nvmlDeviceGetCount_v2"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceG
etCount
_v2
},
{
"nvmlDeviceGetCudaComputeCapability"
,
(
void
*
)
&
resp
->
ch
.
get
ComputeCapability
},
{
"nvmlDeviceGetCudaComputeCapability"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceGetCuda
ComputeCapability
},
{
"nvmlSystemGetDriverVersion"
,
(
void
*
)
&
resp
->
ch
.
nvmlSystemGetDriverVersion
},
{
"nvmlSystemGetDriverVersion"
,
(
void
*
)
&
resp
->
ch
.
nvmlSystemGetDriverVersion
},
{
"nvmlDeviceGetName"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceGetName
},
{
"nvmlDeviceGetName"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceGetName
},
{
"nvmlDeviceGetSerial"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceGetSerial
},
{
"nvmlDeviceGetSerial"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceGetSerial
},
{
"nvmlDeviceGetVbiosVersion"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceGetVbiosVersion
},
{
"nvmlDeviceGetVbiosVersion"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceGetVbiosVersion
},
{
"nvmlDeviceGetBoardPartNumber"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceGetBoardPartNumber
},
{
"nvmlDeviceGetBoardPartNumber"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceGetBoardPartNumber
},
{
"nvmlDeviceGetBrand"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceGetBrand
},
{
"nvmlDeviceGetBrand"
,
(
void
*
)
&
resp
->
ch
.
nvmlDeviceGetBrand
},
{
NULL
,
NULL
},
};
};
resp
->
ch
.
handle
=
LOAD_LIBRARY
(
cuda_lib_path
,
RTLD_LAZY
);
resp
->
ch
.
handle
=
LOAD_LIBRARY
(
cuda_lib_path
,
RTLD_LAZY
);
if
(
!
resp
->
ch
.
handle
)
{
if
(
!
resp
->
ch
.
handle
)
{
char
*
msg
=
LOAD_ERR
();
char
*
msg
=
LOAD_ERR
();
LOG
(
resp
->
ch
.
verbose
,
"library %s load err: %s
\n
"
,
cuda_lib_path
,
msg
);
snprintf
(
buf
,
buflen
,
snprintf
(
buf
,
buflen
,
"Unable to load %s library to query for Nvidia GPUs: %s"
,
"Unable to load %s library to query for Nvidia GPUs: %s"
,
cuda_lib_path
,
msg
);
cuda_lib_path
,
msg
);
...
@@ -42,12 +42,19 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
...
@@ -42,12 +42,19 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
return
;
return
;
}
}
for
(
i
=
0
;
i
<
CUDA_LOOKUP_SIZE
;
i
++
)
{
// TODO - fix this to use a null terminated list
// TODO once we've squashed the remaining corner cases remove this log
LOG
(
resp
->
ch
.
verbose
,
"wiring nvidia management library functions in %s
\n
"
,
cuda_lib_path
);
for
(
i
=
0
;
l
[
i
].
s
!=
NULL
;
i
++
)
{
// TODO once we've squashed the remaining corner cases remove this log
LOG
(
resp
->
ch
.
verbose
,
"dlsym: %s
\n
"
,
l
[
i
].
s
);
*
l
[
i
].
p
=
LOAD_SYMBOL
(
resp
->
ch
.
handle
,
l
[
i
].
s
);
*
l
[
i
].
p
=
LOAD_SYMBOL
(
resp
->
ch
.
handle
,
l
[
i
].
s
);
if
(
!
l
[
i
].
p
)
{
if
(
!
l
[
i
].
p
)
{
UNLOAD_LIBRARY
(
resp
->
ch
.
handle
);
resp
->
ch
.
handle
=
NULL
;
resp
->
ch
.
handle
=
NULL
;
char
*
msg
=
LOAD_ERR
();
char
*
msg
=
LOAD_ERR
();
LOG
(
resp
->
ch
.
verbose
,
"dlerr: %s
\n
"
,
msg
);
UNLOAD_LIBRARY
(
resp
->
ch
.
handle
);
snprintf
(
buf
,
buflen
,
"symbol lookup for %s failed: %s"
,
l
[
i
].
s
,
snprintf
(
buf
,
buflen
,
"symbol lookup for %s failed: %s"
,
l
[
i
].
s
,
msg
);
msg
);
free
(
msg
);
free
(
msg
);
...
@@ -56,8 +63,9 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
...
@@ -56,8 +63,9 @@ void cuda_init(char *cuda_lib_path, cuda_init_resp_t *resp) {
}
}
}
}
ret
=
(
*
resp
->
ch
.
initFn
)();
ret
=
(
*
resp
->
ch
.
nvmlInit_v2
)();
if
(
ret
!=
NVML_SUCCESS
)
{
if
(
ret
!=
NVML_SUCCESS
)
{
LOG
(
resp
->
ch
.
verbose
,
"nvmlInit_v2 err: %d
\n
"
,
ret
);
UNLOAD_LIBRARY
(
resp
->
ch
.
handle
);
UNLOAD_LIBRARY
(
resp
->
ch
.
handle
);
resp
->
ch
.
handle
=
NULL
;
resp
->
ch
.
handle
=
NULL
;
snprintf
(
buf
,
buflen
,
"nvml vram init failure: %d"
,
ret
);
snprintf
(
buf
,
buflen
,
"nvml vram init failure: %d"
,
ret
);
...
@@ -87,7 +95,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
...
@@ -87,7 +95,7 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
return
;
return
;
}
}
ret
=
(
*
h
.
g
etCount
)(
&
resp
->
count
);
ret
=
(
*
h
.
nvmlDeviceG
etCount
_v2
)(
&
resp
->
count
);
if
(
ret
!=
NVML_SUCCESS
)
{
if
(
ret
!=
NVML_SUCCESS
)
{
snprintf
(
buf
,
buflen
,
"unable to get device count: %d"
,
ret
);
snprintf
(
buf
,
buflen
,
"unable to get device count: %d"
,
ret
);
resp
->
err
=
strdup
(
buf
);
resp
->
err
=
strdup
(
buf
);
...
@@ -97,14 +105,14 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
...
@@ -97,14 +105,14 @@ void cuda_check_vram(cuda_handle_t h, mem_info_t *resp) {
resp
->
total
=
0
;
resp
->
total
=
0
;
resp
->
free
=
0
;
resp
->
free
=
0
;
for
(
i
=
0
;
i
<
resp
->
count
;
i
++
)
{
for
(
i
=
0
;
i
<
resp
->
count
;
i
++
)
{
ret
=
(
*
h
.
g
etHandle
)(
i
,
&
device
);
ret
=
(
*
h
.
nvmlDeviceG
etHandle
ByIndex
)(
i
,
&
device
);
if
(
ret
!=
NVML_SUCCESS
)
{
if
(
ret
!=
NVML_SUCCESS
)
{
snprintf
(
buf
,
buflen
,
"unable to get device handle %d: %d"
,
i
,
ret
);
snprintf
(
buf
,
buflen
,
"unable to get device handle %d: %d"
,
i
,
ret
);
resp
->
err
=
strdup
(
buf
);
resp
->
err
=
strdup
(
buf
);
return
;
return
;
}
}
ret
=
(
*
h
.
g
etMemInfo
)(
device
,
&
memInfo
);
ret
=
(
*
h
.
nvmlDeviceG
etMem
ory
Info
)(
device
,
&
memInfo
);
if
(
ret
!=
NVML_SUCCESS
)
{
if
(
ret
!=
NVML_SUCCESS
)
{
snprintf
(
buf
,
buflen
,
"device memory info lookup failure %d: %d"
,
i
,
ret
);
snprintf
(
buf
,
buflen
,
"device memory info lookup failure %d: %d"
,
i
,
ret
);
resp
->
err
=
strdup
(
buf
);
resp
->
err
=
strdup
(
buf
);
...
@@ -172,7 +180,7 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
...
@@ -172,7 +180,7 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
}
}
unsigned
int
devices
;
unsigned
int
devices
;
ret
=
(
*
h
.
g
etCount
)(
&
devices
);
ret
=
(
*
h
.
nvmlDeviceG
etCount
_v2
)(
&
devices
);
if
(
ret
!=
NVML_SUCCESS
)
{
if
(
ret
!=
NVML_SUCCESS
)
{
snprintf
(
buf
,
buflen
,
"unable to get device count: %d"
,
ret
);
snprintf
(
buf
,
buflen
,
"unable to get device count: %d"
,
ret
);
resp
->
err
=
strdup
(
buf
);
resp
->
err
=
strdup
(
buf
);
...
@@ -180,14 +188,14 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
...
@@ -180,14 +188,14 @@ void cuda_compute_capability(cuda_handle_t h, cuda_compute_capability_t *resp) {
}
}
for
(
i
=
0
;
i
<
devices
;
i
++
)
{
for
(
i
=
0
;
i
<
devices
;
i
++
)
{
ret
=
(
*
h
.
g
etHandle
)(
i
,
&
device
);
ret
=
(
*
h
.
nvmlDeviceG
etHandle
ByIndex
)(
i
,
&
device
);
if
(
ret
!=
NVML_SUCCESS
)
{
if
(
ret
!=
NVML_SUCCESS
)
{
snprintf
(
buf
,
buflen
,
"unable to get device handle %d: %d"
,
i
,
ret
);
snprintf
(
buf
,
buflen
,
"unable to get device handle %d: %d"
,
i
,
ret
);
resp
->
err
=
strdup
(
buf
);
resp
->
err
=
strdup
(
buf
);
return
;
return
;
}
}
ret
=
(
*
h
.
get
ComputeCapability
)(
device
,
&
major
,
&
minor
);
ret
=
(
*
h
.
nvmlDeviceGetCuda
ComputeCapability
)(
device
,
&
major
,
&
minor
);
if
(
ret
!=
NVML_SUCCESS
)
{
if
(
ret
!=
NVML_SUCCESS
)
{
snprintf
(
buf
,
buflen
,
"device compute capability lookup failure %d: %d"
,
i
,
ret
);
snprintf
(
buf
,
buflen
,
"device compute capability lookup failure %d: %d"
,
i
,
ret
);
resp
->
err
=
strdup
(
buf
);
resp
->
err
=
strdup
(
buf
);
...
...
gpu/gpu_info_cuda.h
View file @
a170888d
...
@@ -23,12 +23,12 @@ typedef enum nvmlBrandType_enum
...
@@ -23,12 +23,12 @@ typedef enum nvmlBrandType_enum
typedef
struct
cuda_handle
{
typedef
struct
cuda_handle
{
void
*
handle
;
void
*
handle
;
uint16_t
verbose
;
uint16_t
verbose
;
nvmlReturn_t
(
*
initFn
)(
void
);
nvmlReturn_t
(
*
nvmlInit_v2
)(
void
);
nvmlReturn_t
(
*
s
hutdown
Fn
)(
void
);
nvmlReturn_t
(
*
nvmlS
hutdown
)(
void
);
nvmlReturn_t
(
*
g
etHandle
)(
unsigned
int
,
nvmlDevice_t
*
);
nvmlReturn_t
(
*
nvmlDeviceG
etHandle
ByIndex
)(
unsigned
int
,
nvmlDevice_t
*
);
nvmlReturn_t
(
*
g
etMemInfo
)(
nvmlDevice_t
,
nvmlMemory_t
*
);
nvmlReturn_t
(
*
nvmlDeviceG
etMem
ory
Info
)(
nvmlDevice_t
,
nvmlMemory_t
*
);
nvmlReturn_t
(
*
g
etCount
)(
unsigned
int
*
);
nvmlReturn_t
(
*
nvmlDeviceG
etCount
_v2
)(
unsigned
int
*
);
nvmlReturn_t
(
*
get
ComputeCapability
)(
nvmlDevice_t
,
int
*
major
,
int
*
minor
);
nvmlReturn_t
(
*
nvmlDeviceGetCuda
ComputeCapability
)(
nvmlDevice_t
,
int
*
major
,
int
*
minor
);
nvmlReturn_t
(
*
nvmlSystemGetDriverVersion
)
(
char
*
version
,
unsigned
int
length
);
nvmlReturn_t
(
*
nvmlSystemGetDriverVersion
)
(
char
*
version
,
unsigned
int
length
);
nvmlReturn_t
(
*
nvmlDeviceGetName
)
(
nvmlDevice_t
device
,
char
*
name
,
unsigned
int
length
);
nvmlReturn_t
(
*
nvmlDeviceGetName
)
(
nvmlDevice_t
device
,
char
*
name
,
unsigned
int
length
);
nvmlReturn_t
(
*
nvmlDeviceGetSerial
)
(
nvmlDevice_t
device
,
char
*
serial
,
unsigned
int
length
);
nvmlReturn_t
(
*
nvmlDeviceGetSerial
)
(
nvmlDevice_t
device
,
char
*
serial
,
unsigned
int
length
);
...
...
gpu/gpu_info_rocm.c
View file @
a170888d
...
@@ -4,8 +4,6 @@
...
@@ -4,8 +4,6 @@
#include <string.h>
#include <string.h>
#define ROCM_LOOKUP_SIZE 14
void
rocm_init
(
char
*
rocm_lib_path
,
rocm_init_resp_t
*
resp
)
{
void
rocm_init
(
char
*
rocm_lib_path
,
rocm_init_resp_t
*
resp
)
{
rsmi_status_t
ret
;
rsmi_status_t
ret
;
resp
->
err
=
NULL
;
resp
->
err
=
NULL
;
...
@@ -15,12 +13,12 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
...
@@ -15,12 +13,12 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
struct
lookup
{
struct
lookup
{
char
*
s
;
char
*
s
;
void
**
p
;
void
**
p
;
}
l
[
ROCM_LOOKUP_SIZE
]
=
{
}
l
[]
=
{
{
"rsmi_init"
,
(
void
*
)
&
resp
->
rh
.
init
Fn
},
{
"rsmi_init"
,
(
void
*
)
&
resp
->
rh
.
rsmi_
init
},
{
"rsmi_shut_down"
,
(
void
*
)
&
resp
->
rh
.
shutdown
Fn
},
{
"rsmi_shut_down"
,
(
void
*
)
&
resp
->
rh
.
rsmi_
shut
_
down
},
{
"rsmi_dev_memory_total_get"
,
(
void
*
)
&
resp
->
rh
.
totalMemFn
},
{
"rsmi_dev_memory_total_get"
,
(
void
*
)
&
resp
->
rh
.
rsmi_dev_memory_total_get
},
{
"rsmi_dev_memory_usage_get"
,
(
void
*
)
&
resp
->
rh
.
usageMemFn
},
{
"rsmi_dev_memory_usage_get"
,
(
void
*
)
&
resp
->
rh
.
rsmi_dev_memory_usage_get
},
{
"rsmi_version_get"
,
(
void
*
)
&
resp
->
rh
.
version
GetFn
},
{
"rsmi_version_get"
,
(
void
*
)
&
resp
->
rh
.
rsmi_
version
_get
},
{
"rsmi_num_monitor_devices"
,
(
void
*
)
&
resp
->
rh
.
rsmi_num_monitor_devices
},
{
"rsmi_num_monitor_devices"
,
(
void
*
)
&
resp
->
rh
.
rsmi_num_monitor_devices
},
{
"rsmi_dev_id_get"
,
(
void
*
)
&
resp
->
rh
.
rsmi_dev_id_get
},
{
"rsmi_dev_id_get"
,
(
void
*
)
&
resp
->
rh
.
rsmi_dev_id_get
},
{
"rsmi_dev_name_get"
,
(
void
*
)
&
resp
->
rh
.
rsmi_dev_name_get
},
{
"rsmi_dev_name_get"
,
(
void
*
)
&
resp
->
rh
.
rsmi_dev_name_get
},
...
@@ -30,6 +28,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
...
@@ -30,6 +28,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
{
"rsmi_dev_serial_number_get"
,
(
void
*
)
&
resp
->
rh
.
rsmi_dev_serial_number_get
},
{
"rsmi_dev_serial_number_get"
,
(
void
*
)
&
resp
->
rh
.
rsmi_dev_serial_number_get
},
{
"rsmi_dev_subsystem_name_get"
,
(
void
*
)
&
resp
->
rh
.
rsmi_dev_subsystem_name_get
},
{
"rsmi_dev_subsystem_name_get"
,
(
void
*
)
&
resp
->
rh
.
rsmi_dev_subsystem_name_get
},
{
"rsmi_dev_vbios_version_get"
,
(
void
*
)
&
resp
->
rh
.
rsmi_dev_vbios_version_get
},
{
"rsmi_dev_vbios_version_get"
,
(
void
*
)
&
resp
->
rh
.
rsmi_dev_vbios_version_get
},
{
NULL
,
NULL
},
};
};
resp
->
rh
.
handle
=
LOAD_LIBRARY
(
rocm_lib_path
,
RTLD_LAZY
);
resp
->
rh
.
handle
=
LOAD_LIBRARY
(
rocm_lib_path
,
RTLD_LAZY
);
...
@@ -43,12 +42,19 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
...
@@ -43,12 +42,19 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
return
;
return
;
}
}
for
(
i
=
0
;
i
<
ROCM_LOOKUP_SIZE
;
i
++
)
{
// TODO once we've squashed the remaining corner cases remove this log
LOG
(
resp
->
rh
.
verbose
,
"wiring rocm management library functions in %s
\n
"
,
rocm_lib_path
);
for
(
i
=
0
;
l
[
i
].
s
!=
NULL
;
i
++
)
{
// TODO once we've squashed the remaining corner cases remove this log
LOG
(
resp
->
rh
.
verbose
,
"dlsym: %s
\n
"
,
l
[
i
].
s
);
*
l
[
i
].
p
=
LOAD_SYMBOL
(
resp
->
rh
.
handle
,
l
[
i
].
s
);
*
l
[
i
].
p
=
LOAD_SYMBOL
(
resp
->
rh
.
handle
,
l
[
i
].
s
);
if
(
!
l
[
i
].
p
)
{
if
(
!
l
[
i
].
p
)
{
UNLOAD_LIBRARY
(
resp
->
rh
.
handle
);
resp
->
rh
.
handle
=
NULL
;
resp
->
rh
.
handle
=
NULL
;
char
*
msg
=
LOAD_ERR
();
char
*
msg
=
LOAD_ERR
();
LOG
(
resp
->
rh
.
verbose
,
"dlerr: %s
\n
"
,
msg
);
UNLOAD_LIBRARY
(
resp
->
rh
.
handle
);
snprintf
(
buf
,
buflen
,
"symbol lookup for %s failed: %s"
,
l
[
i
].
s
,
snprintf
(
buf
,
buflen
,
"symbol lookup for %s failed: %s"
,
l
[
i
].
s
,
msg
);
msg
);
free
(
msg
);
free
(
msg
);
...
@@ -57,8 +63,9 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
...
@@ -57,8 +63,9 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
}
}
}
}
ret
=
(
*
resp
->
rh
.
init
Fn
)(
0
);
ret
=
(
*
resp
->
rh
.
rsmi_
init
)(
0
);
if
(
ret
!=
RSMI_STATUS_SUCCESS
)
{
if
(
ret
!=
RSMI_STATUS_SUCCESS
)
{
LOG
(
resp
->
rh
.
verbose
,
"rsmi_init err: %d
\n
"
,
ret
);
UNLOAD_LIBRARY
(
resp
->
rh
.
handle
);
UNLOAD_LIBRARY
(
resp
->
rh
.
handle
);
resp
->
rh
.
handle
=
NULL
;
resp
->
rh
.
handle
=
NULL
;
snprintf
(
buf
,
buflen
,
"rocm vram init failure: %d"
,
ret
);
snprintf
(
buf
,
buflen
,
"rocm vram init failure: %d"
,
ret
);
...
@@ -141,13 +148,13 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
...
@@ -141,13 +148,13 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
}
}
// Get total memory - used memory for available memory
// Get total memory - used memory for available memory
ret
=
(
*
h
.
totalMemFn
)(
i
,
RSMI_MEM_TYPE_VRAM
,
&
totalMem
);
ret
=
(
*
h
.
rsmi_dev_memory_total_get
)(
i
,
RSMI_MEM_TYPE_VRAM
,
&
totalMem
);
if
(
ret
!=
RSMI_STATUS_SUCCESS
)
{
if
(
ret
!=
RSMI_STATUS_SUCCESS
)
{
snprintf
(
buf
,
buflen
,
"rocm total mem lookup failure: %d"
,
ret
);
snprintf
(
buf
,
buflen
,
"rocm total mem lookup failure: %d"
,
ret
);
resp
->
err
=
strdup
(
buf
);
resp
->
err
=
strdup
(
buf
);
return
;
return
;
}
}
ret
=
(
*
h
.
usageMemFn
)(
i
,
RSMI_MEM_TYPE_VRAM
,
&
usedMem
);
ret
=
(
*
h
.
rsmi_dev_memory_usage_get
)(
i
,
RSMI_MEM_TYPE_VRAM
,
&
usedMem
);
if
(
ret
!=
RSMI_STATUS_SUCCESS
)
{
if
(
ret
!=
RSMI_STATUS_SUCCESS
)
{
snprintf
(
buf
,
buflen
,
"rocm usage mem lookup failure: %d"
,
ret
);
snprintf
(
buf
,
buflen
,
"rocm usage mem lookup failure: %d"
,
ret
);
resp
->
err
=
strdup
(
buf
);
resp
->
err
=
strdup
(
buf
);
...
@@ -170,7 +177,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
...
@@ -170,7 +177,7 @@ void rocm_get_version(rocm_handle_t h, rocm_version_resp_t *resp) {
}
}
rsmi_version_t
ver
;
rsmi_version_t
ver
;
rsmi_status_t
ret
;
rsmi_status_t
ret
;
ret
=
h
.
version
GetFn
(
&
ver
);
ret
=
h
.
rsmi_
version
_get
(
&
ver
);
if
(
ret
!=
RSMI_STATUS_SUCCESS
)
{
if
(
ret
!=
RSMI_STATUS_SUCCESS
)
{
snprintf
(
buf
,
buflen
,
"unexpected response on version lookup %d"
,
ret
);
snprintf
(
buf
,
buflen
,
"unexpected response on version lookup %d"
,
ret
);
resp
->
status
=
1
;
resp
->
status
=
1
;
...
...
gpu/gpu_info_rocm.h
View file @
a170888d
...
@@ -25,11 +25,11 @@ typedef enum rsmi_memory_type {
...
@@ -25,11 +25,11 @@ typedef enum rsmi_memory_type {
typedef
struct
rocm_handle
{
typedef
struct
rocm_handle
{
void
*
handle
;
void
*
handle
;
uint16_t
verbose
;
uint16_t
verbose
;
rsmi_status_t
(
*
init
Fn
)(
uint64_t
);
rsmi_status_t
(
*
rsmi_
init
)(
uint64_t
);
rsmi_status_t
(
*
shutdown
Fn
)(
void
);
rsmi_status_t
(
*
rsmi_
shut
_
down
)(
void
);
rsmi_status_t
(
*
totalMemFn
)(
uint32_t
,
rsmi_memory_type_t
,
uint64_t
*
);
rsmi_status_t
(
*
rsmi_dev_memory_total_get
)(
uint32_t
,
rsmi_memory_type_t
,
uint64_t
*
);
rsmi_status_t
(
*
usageMemFn
)(
uint32_t
,
rsmi_memory_type_t
,
uint64_t
*
);
rsmi_status_t
(
*
rsmi_dev_memory_usage_get
)(
uint32_t
,
rsmi_memory_type_t
,
uint64_t
*
);
rsmi_status_t
(
*
version
GetFn
)
(
rsmi_version_t
*
version
);
rsmi_status_t
(
*
rsmi_
version
_get
)
(
rsmi_version_t
*
version
);
rsmi_status_t
(
*
rsmi_num_monitor_devices
)
(
uint32_t
*
);
rsmi_status_t
(
*
rsmi_num_monitor_devices
)
(
uint32_t
*
);
rsmi_status_t
(
*
rsmi_dev_id_get
)(
uint32_t
,
uint16_t
*
);
rsmi_status_t
(
*
rsmi_dev_id_get
)(
uint32_t
,
uint16_t
*
);
rsmi_status_t
(
*
rsmi_dev_name_get
)
(
uint32_t
,
char
*
,
size_t
);
rsmi_status_t
(
*
rsmi_dev_name_get
)
(
uint32_t
,
char
*
,
size_t
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment