Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ollama
Commits
a54d4a28
Unverified
Commit
a54d4a28
authored
Mar 12, 2024
by
Daniel Hiltgen
Committed by
GitHub
Mar 12, 2024
Browse files
Merge pull request #3088 from dhiltgen/rocm_igpu_linux
Fix iGPU detection for linux
parents
ba7cf7fb
82b0c7c2
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
28 additions
and
14 deletions
+28
-14
gpu/amd_common.go
gpu/amd_common.go
+2
-4
gpu/amd_linux.go
gpu/amd_linux.go
+26
-10
No files found.
gpu/amd_common.go
View file @
a54d4a28
...
...
@@ -40,19 +40,17 @@ func amdSetVisibleDevices(ids []int, skip map[int]interface{}) {
// TODO - does sort order matter?
devices
:=
[]
string
{}
for
i
:=
range
ids
{
slog
.
Debug
(
fmt
.
Sprintf
(
"i=%d"
,
i
))
if
_
,
skipped
:=
skip
[
i
];
skipped
{
slog
.
Debug
(
"skipped"
)
continue
}
devices
=
append
(
devices
,
strconv
.
Itoa
(
i
))
}
slog
.
Debug
(
fmt
.
Sprintf
(
"devices=%v"
,
devices
))
val
:=
strings
.
Join
(
devices
,
","
)
err
:=
os
.
Setenv
(
"HIP_VISIBLE_DEVICES"
,
val
)
if
err
!=
nil
{
slog
.
Warn
(
fmt
.
Sprintf
(
"failed to set env: %s"
,
err
))
}
else
{
slog
.
Info
(
"Setting HIP_VISIBLE_DEVICES="
+
val
)
}
slog
.
Debug
(
"HIP_VISIBLE_DEVICES="
+
val
)
}
gpu/amd_linux.go
View file @
a54d4a28
...
...
@@ -24,6 +24,9 @@ const (
GPUTotalMemoryFileGlob
=
"mem_banks/*/properties"
// size_in_bytes line
GPUUsedMemoryFileGlob
=
"mem_banks/*/used_memory"
RocmStandardLocation
=
"/opt/rocm/lib"
// TODO find a better way to detect iGPU instead of minimum memory
IGPUMemLimit
=
1024
*
1024
*
1024
// 512M is what they typically report, so anything less than 1G must be iGPU
)
var
(
...
...
@@ -146,8 +149,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
resp
.
memInfo
.
DeviceCount
=
0
resp
.
memInfo
.
TotalMemory
=
0
resp
.
memInfo
.
FreeMemory
=
0
slog
.
Debug
(
"discovering VRAM for amdgpu devices"
)
if
len
(
ids
)
==
0
{
slog
.
Debug
(
"discovering all amdgpu devices"
)
entries
,
err
:=
os
.
ReadDir
(
AMDNodesSysfsDir
)
if
err
!=
nil
{
slog
.
Warn
(
fmt
.
Sprintf
(
"failed to read amdgpu sysfs %s - %s"
,
AMDNodesSysfsDir
,
err
))
...
...
@@ -165,7 +168,7 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
ids
=
append
(
ids
,
id
)
}
}
slog
.
Debug
(
fmt
.
Sprintf
(
"
discovering
amdgpu devices %v"
,
ids
))
slog
.
Debug
(
fmt
.
Sprintf
(
"amdgpu devices %v"
,
ids
))
for
_
,
id
:=
range
ids
{
if
_
,
skipped
:=
skip
[
id
];
skipped
{
...
...
@@ -173,7 +176,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
}
totalMemory
:=
uint64
(
0
)
usedMemory
:=
uint64
(
0
)
propGlob
:=
filepath
.
Join
(
AMDNodesSysfsDir
,
strconv
.
Itoa
(
id
),
GPUTotalMemoryFileGlob
)
// Adjust for sysfs vs HIP ids
propGlob
:=
filepath
.
Join
(
AMDNodesSysfsDir
,
strconv
.
Itoa
(
id
+
1
),
GPUTotalMemoryFileGlob
)
propFiles
,
err
:=
filepath
.
Glob
(
propGlob
)
if
err
!=
nil
{
slog
.
Warn
(
fmt
.
Sprintf
(
"error looking up total GPU memory: %s %s"
,
propGlob
,
err
))
...
...
@@ -205,6 +209,13 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
}
}
if
totalMemory
==
0
{
slog
.
Warn
(
fmt
.
Sprintf
(
"amdgpu [%d] reports zero total memory, skipping"
,
id
))
skip
[
id
]
=
struct
{}{}
continue
}
if
totalMemory
<
IGPUMemLimit
{
slog
.
Info
(
fmt
.
Sprintf
(
"amdgpu [%d] appears to be an iGPU with %dM reported total memory, skipping"
,
id
,
totalMemory
/
1024
/
1024
))
skip
[
id
]
=
struct
{}{}
continue
}
usedGlob
:=
filepath
.
Join
(
AMDNodesSysfsDir
,
strconv
.
Itoa
(
id
),
GPUUsedMemoryFileGlob
)
...
...
@@ -232,8 +243,8 @@ func amdProcMemLookup(resp *GpuInfo, skip map[int]interface{}, ids []int) {
}
usedMemory
+=
used
}
slog
.
Info
(
fmt
.
Sprintf
(
"[%d] amdgpu totalMemory %d"
,
id
,
totalMemory
))
slog
.
Info
(
fmt
.
Sprintf
(
"[%d] amdgpu freeMemory %d"
,
id
,
(
totalMemory
-
usedMemory
)))
slog
.
Info
(
fmt
.
Sprintf
(
"[%d] amdgpu totalMemory %d
M
"
,
id
,
totalMemory
/
1024
/
1024
))
slog
.
Info
(
fmt
.
Sprintf
(
"[%d] amdgpu freeMemory %d
M
"
,
id
,
(
totalMemory
-
usedMemory
)
/
1024
/
1024
))
resp
.
memInfo
.
DeviceCount
++
resp
.
memInfo
.
TotalMemory
+=
totalMemory
resp
.
memInfo
.
FreeMemory
+=
(
totalMemory
-
usedMemory
)
...
...
@@ -358,6 +369,8 @@ func AMDDriverVersion() (string, error) {
}
func
AMDGFXVersions
()
map
[
int
]
Version
{
// The amdgpu driver always exposes the host CPU as node 0, but we have to skip that and subtract one
// from the other IDs to align with the HIP library's expectations (zero is the first GPU, not the CPU)
res
:=
map
[
int
]
Version
{}
matches
,
_
:=
filepath
.
Glob
(
GPUPropertiesFileGlob
)
for
_
,
match
:=
range
matches
{
...
...
@@ -373,17 +386,20 @@ func AMDGFXVersions() map[int]Version {
continue
}
if
i
==
0
{
// Skipping the CPU
continue
}
// Align with HIP IDs (zero is first GPU, not CPU)
i
-=
1
scanner
:=
bufio
.
NewScanner
(
fp
)
for
scanner
.
Scan
()
{
line
:=
strings
.
TrimSpace
(
scanner
.
Text
())
if
strings
.
HasPrefix
(
line
,
"gfx_target_version"
)
{
ver
:=
strings
.
Fields
(
line
)
if
len
(
ver
)
!=
2
||
len
(
ver
[
1
])
<
5
{
if
ver
[
1
]
==
"0"
{
// Silently skip the CPU
continue
}
else
{
if
ver
[
1
]
!=
"0"
{
slog
.
Debug
(
"malformed "
+
line
)
}
res
[
i
]
=
Version
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment