Commit 7c2a157c authored by Daniel Hiltgen's avatar Daniel Hiltgen
Browse files

Ensure amd gpu nodes are numerically sorted

For systems that enumerate over 10 CPUs the default lexicographical
sort order interleaves CPUs and GPUs.
parent ac33aa7d
......@@ -10,6 +10,7 @@ import (
"path/filepath"
"regexp"
"slices"
"sort"
"strconv"
"strings"
......@@ -82,6 +83,20 @@ func AMDGetGPUInfo() []RocmGPUInfo {
// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
matches, _ := filepath.Glob(GPUPropertiesFileGlob)
sort.Slice(matches, func(i, j int) bool {
// /sys/class/kfd/kfd/topology/nodes/<number>/properties
a, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[i])), 10, 64)
if err != nil {
slog.Debug("parse err", "error", err, "match", matches[i])
return false
}
b, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[j])), 10, 64)
if err != nil {
slog.Debug("parse err", "error", err, "match", matches[i])
return false
}
return a < b
})
cpuCount := 0
for _, match := range matches {
slog.Debug("evaluating amdgpu node " + match)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment