Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
30c43c28
Unverified
Commit
30c43c28
authored
Jan 26, 2024
by
Daniel Hiltgen
Committed by
GitHub
Jan 26, 2024
Browse files
Merge pull request #2195 from dhiltgen/rocm_real_gpus
Ignore AMD integrated GPUs
parents
23a7ea59
9d7b5d6c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
35 additions
and
3 deletions
+35
-3
gpu/gpu.go
gpu/gpu.go
+25
-1
gpu/gpu_info.h
gpu/gpu_info.h
+1
-0
gpu/gpu_info_rocm.c
gpu/gpu_info_rocm.c
+9
-2
No files found.
gpu/gpu.go
View file @
30c43c28
...
@@ -16,6 +16,7 @@ import (
...
@@ -16,6 +16,7 @@ import (
"os"
"os"
"path/filepath"
"path/filepath"
"runtime"
"runtime"
"strconv"
"strings"
"strings"
"sync"
"sync"
"unsafe"
"unsafe"
...
@@ -147,7 +148,28 @@ func GetGPUInfo() GpuInfo {
...
@@ -147,7 +148,28 @@ func GetGPUInfo() GpuInfo {
if
memInfo
.
err
!=
nil
{
if
memInfo
.
err
!=
nil
{
slog
.
Info
(
fmt
.
Sprintf
(
"error looking up ROCm GPU memory: %s"
,
C
.
GoString
(
memInfo
.
err
)))
slog
.
Info
(
fmt
.
Sprintf
(
"error looking up ROCm GPU memory: %s"
,
C
.
GoString
(
memInfo
.
err
)))
C
.
free
(
unsafe
.
Pointer
(
memInfo
.
err
))
C
.
free
(
unsafe
.
Pointer
(
memInfo
.
err
))
}
else
if
memInfo
.
igpu_index
>=
0
&&
memInfo
.
count
==
1
{
// Only one GPU detected and it appears to be an integrated GPU - skip it
slog
.
Info
(
"ROCm unsupported integrated GPU detected"
)
}
else
{
}
else
{
if
memInfo
.
igpu_index
>=
0
{
// We have multiple GPUs reported, and one of them is an integrated GPU
// so we have to set the env var to bypass it
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
val
:=
os
.
Getenv
(
"ROCR_VISIBLE_DEVICES"
)
if
val
==
""
{
devices
:=
[]
string
{}
for
i
:=
0
;
i
<
int
(
memInfo
.
count
);
i
++
{
if
i
==
int
(
memInfo
.
igpu_index
)
{
continue
}
devices
=
append
(
devices
,
strconv
.
Itoa
(
i
))
}
val
=
strings
.
Join
(
devices
,
","
)
os
.
Setenv
(
"ROCR_VISIBLE_DEVICES"
,
val
)
}
slog
.
Info
(
fmt
.
Sprintf
(
"ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s"
,
val
))
}
resp
.
Library
=
"rocm"
resp
.
Library
=
"rocm"
var
version
C
.
rocm_version_resp_t
var
version
C
.
rocm_version_resp_t
C
.
rocm_get_version
(
*
gpuHandles
.
rocm
,
&
version
)
C
.
rocm_get_version
(
*
gpuHandles
.
rocm
,
&
version
)
...
@@ -199,7 +221,9 @@ func CheckVRAM() (int64, error) {
...
@@ -199,7 +221,9 @@ func CheckVRAM() (int64, error) {
if
overhead
<
gpus
*
1024
*
1024
*
1024
{
if
overhead
<
gpus
*
1024
*
1024
*
1024
{
overhead
=
gpus
*
1024
*
1024
*
1024
overhead
=
gpus
*
1024
*
1024
*
1024
}
}
return
int64
(
gpuInfo
.
FreeMemory
-
overhead
),
nil
avail
:=
int64
(
gpuInfo
.
FreeMemory
-
overhead
)
slog
.
Debug
(
fmt
.
Sprintf
(
"%s detected %d devices with %dM available memory"
,
gpuInfo
.
Library
,
gpuInfo
.
DeviceCount
,
avail
/
1024
/
1024
))
return
avail
,
nil
}
}
return
0
,
fmt
.
Errorf
(
"no GPU detected"
)
// TODO - better handling of CPU based memory determination
return
0
,
fmt
.
Errorf
(
"no GPU detected"
)
// TODO - better handling of CPU based memory determination
...
...
gpu/gpu_info.h
View file @
30c43c28
...
@@ -42,6 +42,7 @@ typedef struct mem_info {
...
@@ -42,6 +42,7 @@ typedef struct mem_info {
uint64_t
total
;
uint64_t
total
;
uint64_t
free
;
uint64_t
free
;
unsigned
int
count
;
unsigned
int
count
;
int
igpu_index
;
// If >= 0, we detected an integrated GPU to ignore
char
*
err
;
// If non-NULL, caller responsible for freeing
char
*
err
;
// If non-NULL, caller responsible for freeing
}
mem_info_t
;
}
mem_info_t
;
...
...
gpu/gpu_info_rocm.c
View file @
30c43c28
...
@@ -77,6 +77,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
...
@@ -77,6 +77,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
void
rocm_check_vram
(
rocm_handle_t
h
,
mem_info_t
*
resp
)
{
void
rocm_check_vram
(
rocm_handle_t
h
,
mem_info_t
*
resp
)
{
resp
->
err
=
NULL
;
resp
->
err
=
NULL
;
resp
->
igpu_index
=
-
1
;
uint64_t
totalMem
=
0
;
uint64_t
totalMem
=
0
;
uint64_t
usedMem
=
0
;
uint64_t
usedMem
=
0
;
rsmi_status_t
ret
;
rsmi_status_t
ret
;
...
@@ -162,8 +163,14 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
...
@@ -162,8 +163,14 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
}
}
LOG
(
h
.
verbose
,
"[%d] ROCm totalMem %ld
\n
"
,
i
,
totalMem
);
LOG
(
h
.
verbose
,
"[%d] ROCm totalMem %ld
\n
"
,
i
,
totalMem
);
LOG
(
h
.
verbose
,
"[%d] ROCm usedMem %ld
\n
"
,
i
,
usedMem
);
LOG
(
h
.
verbose
,
"[%d] ROCm usedMem %ld
\n
"
,
i
,
usedMem
);
resp
->
total
+=
totalMem
;
if
(
totalMem
<
1024
*
1024
*
1024
)
{
resp
->
free
+=
totalMem
-
usedMem
;
// Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory
LOG
(
h
.
verbose
,
"[%d] ROCm integrated GPU
\n
"
,
i
);
resp
->
igpu_index
=
i
;
}
else
{
resp
->
total
+=
totalMem
;
resp
->
free
+=
totalMem
-
usedMem
;
}
}
}
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment