Commit 6d84f075 authored by Daniel Hiltgen's avatar Daniel Hiltgen
Browse files

Detect AMD GPU info via sysfs and block old cards

This wires up new logic that uses sysfs to discover AMD GPU
information and detect old cards we can't yet support, so we can fall back to CPU mode.
parent 1c8435ff
package gpu
import (
"bufio"
"fmt"
"io"
"log/slog"
"os"
"path/filepath"
"strconv"
"strings"
)
// TODO - windows vs. non-windows vs darwin
// Discovery logic for AMD/ROCm GPUs
// Linux sysfs locations consulted when discovering AMD GPUs.
const (
// DriverVersionFile is the file AMDDriverVersion reads to obtain the amdgpu
// driver version string.
DriverVersionFile = "/sys/module/amdgpu/version"
// GPUPropertiesFileGlob matches the per-node properties files that
// AMDGFXVersions scans for gfx_target_version lines.
GPUPropertiesFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/properties"
// TODO probably break these down per GPU to make the logic simpler
// NOTE(review): the two memory globs below are not referenced by the code
// visible in this file; presumably they are reserved for a sysfs-based VRAM
// lookup - confirm before relying on them.
GPUTotalMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties" // size_in_bytes line
GPUUsedMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/used_memory"
)
// AMDDetected reports whether the amdgpu driver version could be read, i.e.
// whether AMDDriverVersion completed without returning an error.
func AMDDetected() bool {
	if _, err := AMDDriverVersion(); err != nil {
		return false
	}
	return true
}
// AMDDriverVersion returns the contents of DriverVersionFile with leading and
// trailing whitespace removed.
//
// A non-nil error is returned (together with an empty string) when the file
// cannot be opened or read.
func AMDDriverVersion() (string, error) {
	// Open directly rather than Stat-then-Open: a missing or unreadable file
	// is reported by Open anyway, and skipping the extra check avoids both a
	// redundant syscall and a check-then-use race.
	fp, err := os.Open(DriverVersionFile)
	if err != nil {
		return "", err
	}
	defer fp.Close()

	raw, err := io.ReadAll(fp)
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(raw)), nil
}
// AMDGFXVersions collects the GFX target version from every file matched by
// GPUPropertiesFileGlob.
//
// Each file is scanned for lines starting with "gfx_target_version". The value
// on such a line is decoded positionally: the last two digits are the patch,
// the two digits before that are the minor, and the remaining leading digits
// are the major. Lines whose value is shorter than five characters or is not
// numeric are logged at debug level and skipped, as are files that cannot be
// opened.
//
// The result is always non-nil; it is empty when nothing was found.
func AMDGFXVersions() []Version {
	res := []Version{}
	matches, err := filepath.Glob(GPUPropertiesFileGlob)
	if err != nil {
		// Glob only fails for a malformed pattern; surface it instead of
		// silently reporting "no GPUs".
		slog.Debug(fmt.Sprintf("failed to glob sysfs node files %s: %s", GPUPropertiesFileGlob, err))
		return res
	}
	for _, match := range matches {
		res = append(res, gfxVersionsFromFile(match)...)
	}
	return res
}

// gfxVersionsFromFile parses every well-formed gfx_target_version line in the
// given properties file. It lives in its own function so the deferred Close
// runs after each file rather than piling up until all files are processed.
func gfxVersionsFromFile(path string) []Version {
	fp, err := os.Open(path)
	if err != nil {
		slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", path, err))
		return nil
	}
	defer fp.Close()

	var found []Version
	scanner := bufio.NewScanner(fp)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if !strings.HasPrefix(line, "gfx_target_version") {
			continue
		}
		ver := strings.Fields(line)
		if len(ver) != 2 || len(ver[1]) < 5 {
			slog.Debug("malformed " + line)
			continue
		}
		digits := ver[1]
		l := len(digits)
		patch, err1 := strconv.ParseUint(digits[l-2:l], 10, 32)
		minor, err2 := strconv.ParseUint(digits[l-4:l-2], 10, 32)
		major, err3 := strconv.ParseUint(digits[:l-4], 10, 32)
		if err1 != nil || err2 != nil || err3 != nil {
			slog.Debug("malformed int " + line)
			continue
		}
		found = append(found, Version{
			Major: uint(major),
			Minor: uint(minor),
			Patch: uint(patch),
		})
	}
	// Scan returning false can mean a read error rather than EOF.
	if err := scanner.Err(); err != nil {
		slog.Debug(fmt.Sprintf("failed to read sysfs node file %s: %s", path, err))
	}
	return found
}
// ToGFXString renders the version as an AMD GFX target name: the literal
// "gfx", the major in decimal, then the minor and patch as lowercase hex
// digits.
//
// Hex matters for components of 10 or more: {9, 0, 10} must render as
// "gfx90a", not "gfx9010". Components below 10 render the same in either base.
func (v Version) ToGFXString() string {
	return fmt.Sprintf("gfx%d%x%x", v.Major, v.Minor, v.Patch)
}
...@@ -149,43 +149,63 @@ func GetGPUInfo() GpuInfo { ...@@ -149,43 +149,63 @@ func GetGPUInfo() GpuInfo {
slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor)) slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
} }
} }
} else if gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") { } else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
C.rocm_check_vram(*gpuHandles.rocm, &memInfo) ver, err := AMDDriverVersion()
if memInfo.err != nil { if err == nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))) slog.Info("AMD Driver: " + ver)
C.free(unsafe.Pointer(memInfo.err)) }
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 { gfx := AMDGFXVersions()
// Only one GPU detected and it appears to be an integrated GPU - skip it tooOld := false
slog.Info("ROCm unsupported integrated GPU detected") for _, v := range gfx {
} else if memInfo.count > 0 { if v.Major < 9 {
if memInfo.igpu_index >= 0 { slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
// We have multiple GPUs reported, and one of them is an integrated GPU tooOld = true
// so we have to set the env var to bypass it break
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it }
val := os.Getenv("ROCR_VISIBLE_DEVICES")
if val == "" { // TODO - remap gfx strings for unsupported minor/patch versions to supported for the same major
devices := []string{} // e.g. gfx1034 works if we map it to gfx1030 at runtime
for i := 0; i < int(memInfo.count); i++ {
if i == int(memInfo.igpu_index) { }
continue if !tooOld {
// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
// Only one GPU detected and it appears to be an integrated GPU - skip it
slog.Info("ROCm unsupported integrated GPU detected")
} else if memInfo.count > 0 {
if memInfo.igpu_index >= 0 {
// We have multiple GPUs reported, and one of them is an integrated GPU
// so we have to set the env var to bypass it
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
val := os.Getenv("ROCR_VISIBLE_DEVICES")
if val == "" {
devices := []string{}
for i := 0; i < int(memInfo.count); i++ {
if i == int(memInfo.igpu_index) {
continue
}
devices = append(devices, strconv.Itoa(i))
} }
devices = append(devices, strconv.Itoa(i)) val = strings.Join(devices, ",")
os.Setenv("ROCR_VISIBLE_DEVICES", val)
} }
val = strings.Join(devices, ",") slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
os.Setenv("ROCR_VISIBLE_DEVICES", val)
} }
slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val)) resp.Library = "rocm"
} var version C.rocm_version_resp_t
resp.Library = "rocm" C.rocm_get_version(*gpuHandles.rocm, &version)
var version C.rocm_version_resp_t verString := C.GoString(version.str)
C.rocm_get_version(*gpuHandles.rocm, &version) if version.status == 0 {
verString := C.GoString(version.str) resp.Variant = "v" + verString
if version.status == 0 { } else {
resp.Variant = "v" + verString slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
} else { }
slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString)) C.free(unsafe.Pointer(version.str))
} }
C.free(unsafe.Pointer(version.str))
} }
} }
if resp.Library == "" { if resp.Library == "" {
......
...@@ -16,3 +16,9 @@ type GpuInfo struct { ...@@ -16,3 +16,9 @@ type GpuInfo struct {
// TODO add other useful attributes about the card here for discovery information // TODO add other useful attributes about the card here for discovery information
} }
// Version is a three-part numeric version made up of major, minor and patch
// components.
type Version struct {
	Major, Minor, Patch uint
}
...@@ -21,7 +21,6 @@ amdGPUs() { ...@@ -21,7 +21,6 @@ amdGPUs() {
return return
fi fi
GPU_LIST=( GPU_LIST=(
"gfx803"
"gfx900" "gfx900"
"gfx906:xnack-" "gfx906:xnack-"
"gfx908:xnack-" "gfx908:xnack-"
......
...@@ -90,6 +90,7 @@ func getDynLibs(gpuInfo gpu.GpuInfo) []string { ...@@ -90,6 +90,7 @@ func getDynLibs(gpuInfo gpu.GpuInfo) []string {
if len(dynLibs) == 0 { if len(dynLibs) == 0 {
dynLibs = []string{availableDynLibs["cpu"]} dynLibs = []string{availableDynLibs["cpu"]}
} }
slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
return dynLibs return dynLibs
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment