Support forced spreading for multi GPU

Our default behavior today is to try to fit the model into a single GPU if possible. Some users would prefer the old behavior of always spreading the model across multiple GPUs, even if it can fit into one. This change exposes that behavior as a tunable, controlled by the OLLAMA_SCHED_SPREAD environment variable.

Support forced spreading for multi GPU
Our default behavior today is to try to fit the model into a single GPU if possible. Some users would prefer the old behavior of always spreading the model across multiple GPUs, even if it can fit into one. This change exposes that behavior as a tunable, controlled by the OLLAMA_SCHED_SPREAD environment variable.
5e8ff556 · Daniel Hiltgen · 6fd04ca9 · 5e8ff556 · 5e8ff556
Commit 5e8ff556 authored May 08, 2024 by Daniel Hiltgen
Show whitespace changes
Inline Side-by-side

Showing with 18 additions and 4 deletions

envconfig/config.go envconfig/config.go +12 -0

server/sched.go server/sched.go +6 -4

No files found.
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -53,6 +53,8 @@ var (
 	NumParallel int
 	// Set via OLLAMA_RUNNERS_DIR in the environment
 	RunnersDir string
+	// Set via OLLAMA_SCHED_SPREAD in the environment
+	SchedSpread bool
 	// Set via OLLAMA_TMPDIR in the environment
 	TmpDir string
 )
@@ -79,6 +81,7 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
 		"OLLAMA_RUNNERS_DIR":       {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
+		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
 		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
 	}
 }
@@ -191,6 +194,15 @@ func LoadConfig() {
 		NoHistory = true
 	}
+	if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
+		s, err := strconv.ParseBool(spread)
+		if err == nil {
+			SchedSpread = s
+		} else {
+			SchedSpread = true
+		}
+	}
 	if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
 		NoPrune = true
 	}

--- a/server/sched.go
+++ b/server/sched.go
@@ -558,12 +558,14 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
 		sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
 		// First attempt to fit the model into a single GPU
+		if !envconfig.SchedSpread {
 			for _, g := range sgl {
 				if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
 					slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
 					return []gpu.GpuInfo{g}
 				}
 			}
+		}
 		// TODO future refinements
 		// - if multiple Libraries, see if any single GPU in any Library will fit