Commit 5e8ff556 authored by Daniel Hiltgen's avatar Daniel Hiltgen
Browse files

Support forced spreading for multi GPU

Our default behavior today is to try to fit into a single GPU if possible.
Some users would prefer the old behavior of always spreading across
multiple GPUs even if the model can fit into one.  This exposes that
tunable behavior.
parent 6fd04ca9
...@@ -53,6 +53,8 @@ var ( ...@@ -53,6 +53,8 @@ var (
NumParallel int NumParallel int
// Set via OLLAMA_RUNNERS_DIR in the environment // Set via OLLAMA_RUNNERS_DIR in the environment
RunnersDir string RunnersDir string
// Set via OLLAMA_SCHED_SPREAD in the environment
SchedSpread bool
// Set via OLLAMA_TMPDIR in the environment // Set via OLLAMA_TMPDIR in the environment
TmpDir string TmpDir string
) )
...@@ -79,6 +81,7 @@ func AsMap() map[string]EnvVar { ...@@ -79,6 +81,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"}, "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests (default 1)"},
"OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"}, "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowOrigins, "A comma separated list of allowed origins"},
"OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"}, "OLLAMA_RUNNERS_DIR": {"OLLAMA_RUNNERS_DIR", RunnersDir, "Location for runners"},
"OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread, "Always schedule model across all GPUs"},
"OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"}, "OLLAMA_TMPDIR": {"OLLAMA_TMPDIR", TmpDir, "Location for temporary files"},
} }
} }
...@@ -191,6 +194,15 @@ func LoadConfig() { ...@@ -191,6 +194,15 @@ func LoadConfig() {
NoHistory = true NoHistory = true
} }
if spread := clean("OLLAMA_SCHED_SPREAD"); spread != "" {
s, err := strconv.ParseBool(spread)
if err == nil {
SchedSpread = s
} else {
SchedSpread = true
}
}
if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" { if noprune := clean("OLLAMA_NOPRUNE"); noprune != "" {
NoPrune = true NoPrune = true
} }
......
...@@ -558,12 +558,14 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu. ...@@ -558,12 +558,14 @@ func pickBestFitGPUs(req *LlmRequest, ggml *llm.GGML, gpus gpu.GpuInfoList) gpu.
sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl))) sort.Sort(sort.Reverse(gpu.ByFreeMemory(sgl)))
// First attempt to fit the model into a single GPU // First attempt to fit the model into a single GPU
if !envconfig.SchedSpread {
for _, g := range sgl { for _, g := range sgl {
if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok { if ok, estimatedVRAM = llm.PredictServerFit([]gpu.GpuInfo{g}, ggml, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts); ok {
slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM)) slog.Debug("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
return []gpu.GpuInfo{g} return []gpu.GpuInfo{g}
} }
} }
}
// TODO future refinements // TODO future refinements
// - if multiple Libraries, see if any single GPU in any Library will fit // - if multiple Libraries, see if any single GPU in any Library will fit
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment