"src/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "2f7a417d1fb11bd242ad7f9098bb9fdf77c54422"
Unverified Commit 3f308367 authored by Daniel Hiltgen, committed by GitHub
Browse files

CUDA: filter devices on secondary discovery (#13317)

We now do a deeper probe of CUDA devices to verify that the library version has
the correct compute capability coverage for the device.  Because ROCm also
interprets the CUDA env var to filter AMD devices, setting it leads to problems
in mixed-vendor systems, so we try to avoid setting it.  However, without setting
it for this deeper probe, each CUDA library subprocess discovers all CUDA GPUs,
and on systems with lots of GPUs this can lead to hitting timeouts.  The fix is
to turn on the CUDA visibility env var just for this deeper probe use case.
parent cc9555af
...@@ -147,7 +147,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. ...@@ -147,7 +147,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
wg.Add(1) wg.Add(1)
go func(i int) { go func(i int) {
defer wg.Done() defer wg.Done()
extraEnvs := ml.GetVisibleDevicesEnv(devices[i : i+1]) extraEnvs := ml.GetVisibleDevicesEnv(devices[i:i+1], true)
devices[i].AddInitValidation(extraEnvs) devices[i].AddInitValidation(extraEnvs)
if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 { if len(bootstrapDevices(ctx2ndPass, devices[i].LibraryPath, extraEnvs)) == 0 {
slog.Debug("filtering device which didn't fully initialize", slog.Debug("filtering device which didn't fully initialize",
...@@ -333,7 +333,8 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml. ...@@ -333,7 +333,8 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
defer cancel() defer cancel()
// Apply any dev filters to avoid re-discovering unsupported devices, and get IDs correct // Apply any dev filters to avoid re-discovering unsupported devices, and get IDs correct
devFilter := ml.GetVisibleDevicesEnv(devices) // We avoid CUDA filters here to keep ROCm from failing to discover GPUs in a mixed environment
devFilter := ml.GetVisibleDevicesEnv(devices, false)
for dir := range libDirs { for dir := range libDirs {
updatedDevices := bootstrapDevices(ctx, []string{ml.LibOllamaPath, dir}, devFilter) updatedDevices := bootstrapDevices(ctx, []string{ml.LibOllamaPath, dir}, devFilter)
......
...@@ -227,7 +227,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st ...@@ -227,7 +227,7 @@ func NewLlamaServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, modelPath st
modelPath, modelPath,
gpuLibs, gpuLibs,
status, status,
ml.GetVisibleDevicesEnv(gpus), ml.GetVisibleDevicesEnv(gpus, false),
) )
s := llmServer{ s := llmServer{
......
...@@ -494,13 +494,14 @@ func FlashAttentionSupported(l []DeviceInfo) bool { ...@@ -494,13 +494,14 @@ func FlashAttentionSupported(l []DeviceInfo) bool {
// Given the list of GPUs this instantiation is targeted for, // Given the list of GPUs this instantiation is targeted for,
// figure out the visible devices environment variables // figure out the visible devices environment variables
func GetVisibleDevicesEnv(l []DeviceInfo) map[string]string { // Set mustFilter true to enable filtering of CUDA devices
func GetVisibleDevicesEnv(l []DeviceInfo, mustFilter bool) map[string]string {
if len(l) == 0 { if len(l) == 0 {
return nil return nil
} }
env := map[string]string{} env := map[string]string{}
for _, d := range l { for _, d := range l {
d.updateVisibleDevicesEnv(env) d.updateVisibleDevicesEnv(env, mustFilter)
} }
return env return env
} }
...@@ -532,7 +533,7 @@ func (d DeviceInfo) PreferredLibrary(other DeviceInfo) bool { ...@@ -532,7 +533,7 @@ func (d DeviceInfo) PreferredLibrary(other DeviceInfo) bool {
return false return false
} }
func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) { func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string, mustFilter bool) {
var envVar string var envVar string
switch d.Library { switch d.Library {
case "ROCm": case "ROCm":
...@@ -541,8 +542,15 @@ func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) { ...@@ -541,8 +542,15 @@ func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string) {
if runtime.GOOS != "linux" { if runtime.GOOS != "linux" {
envVar = "HIP_VISIBLE_DEVICES" envVar = "HIP_VISIBLE_DEVICES"
} }
case "CUDA":
if !mustFilter {
// By default we try to avoid filtering CUDA devices because ROCm also
// looks at the CUDA env var, and gets confused in mixed vendor environments.
return
}
envVar = "CUDA_VISIBLE_DEVICES"
default: default:
// CUDA and Vulkan are not filtered via env var, but via scheduling decisions // Vulkan is not filtered via env var, but via scheduling decisions
return return
} }
v, existing := env[envVar] v, existing := env[envVar]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment