"git@developer.sourcefind.cn:change/sglang.git" did not exist on "1e2cf2b541c7b22dbf5f0d5bf6da78d93eaa3648"
Commit 9d7b5d6c authored by Daniel Hiltgen's avatar Daniel Hiltgen
Browse files

Ignore AMD integrated GPUs

Detect and ignore integrated GPUs reported by rocm.
parent 197e420a
...@@ -16,6 +16,7 @@ import ( ...@@ -16,6 +16,7 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"runtime" "runtime"
"strconv"
"strings" "strings"
"sync" "sync"
"unsafe" "unsafe"
...@@ -147,7 +148,28 @@ func GetGPUInfo() GpuInfo { ...@@ -147,7 +148,28 @@ func GetGPUInfo() GpuInfo {
if memInfo.err != nil { if memInfo.err != nil {
slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err))) slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
C.free(unsafe.Pointer(memInfo.err)) C.free(unsafe.Pointer(memInfo.err))
} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
// Only one GPU detected and it appears to be an integrated GPU - skip it
slog.Info("ROCm unsupported integrated GPU detected")
} else { } else {
if memInfo.igpu_index >= 0 {
// We have multiple GPUs reported, and one of them is an integrated GPU
// so we have to set the env var to bypass it
// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
val := os.Getenv("ROCR_VISIBLE_DEVICES")
if val == "" {
devices := []string{}
for i := 0; i < int(memInfo.count); i++ {
if i == int(memInfo.igpu_index) {
continue
}
devices = append(devices, strconv.Itoa(i))
}
val = strings.Join(devices, ",")
os.Setenv("ROCR_VISIBLE_DEVICES", val)
}
slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
}
resp.Library = "rocm" resp.Library = "rocm"
var version C.rocm_version_resp_t var version C.rocm_version_resp_t
C.rocm_get_version(*gpuHandles.rocm, &version) C.rocm_get_version(*gpuHandles.rocm, &version)
...@@ -199,7 +221,9 @@ func CheckVRAM() (int64, error) { ...@@ -199,7 +221,9 @@ func CheckVRAM() (int64, error) {
if overhead < gpus*1024*1024*1024 { if overhead < gpus*1024*1024*1024 {
overhead = gpus * 1024 * 1024 * 1024 overhead = gpus * 1024 * 1024 * 1024
} }
return int64(gpuInfo.FreeMemory - overhead), nil avail := int64(gpuInfo.FreeMemory - overhead)
slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
return avail, nil
} }
return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
......
...@@ -42,6 +42,7 @@ typedef struct mem_info { ...@@ -42,6 +42,7 @@ typedef struct mem_info {
uint64_t total; uint64_t total;
uint64_t free; uint64_t free;
unsigned int count; unsigned int count;
int igpu_index; // If >= 0, we detected an integrated GPU to ignore
char *err; // If non-nill, caller responsible for freeing char *err; // If non-nill, caller responsible for freeing
} mem_info_t; } mem_info_t;
......
...@@ -77,6 +77,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) { ...@@ -77,6 +77,7 @@ void rocm_init(char *rocm_lib_path, rocm_init_resp_t *resp) {
void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
resp->err = NULL; resp->err = NULL;
resp->igpu_index = -1;
uint64_t totalMem = 0; uint64_t totalMem = 0;
uint64_t usedMem = 0; uint64_t usedMem = 0;
rsmi_status_t ret; rsmi_status_t ret;
...@@ -162,8 +163,14 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) { ...@@ -162,8 +163,14 @@ void rocm_check_vram(rocm_handle_t h, mem_info_t *resp) {
} }
LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem); LOG(h.verbose, "[%d] ROCm totalMem %ld\n", i, totalMem);
LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem); LOG(h.verbose, "[%d] ROCm usedMem %ld\n", i, usedMem);
resp->total += totalMem; if (totalMem < 1024 * 1024 * 1024) {
resp->free += totalMem - usedMem; // Do not add up integrated GPU memory capacity, it's a bogus 512M, and actually uses system memory
LOG(h.verbose, "[%d] ROCm integrated GPU\n", i);
resp->igpu_index = i;
} else {
resp->total += totalMem;
resp->free += totalMem - usedMem;
}
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment