Introduce GPU Overhead env var (#5922)

Provide a mechanism for users to set aside an amount of VRAM on each GPU to make room for other applications they want to start after Ollama, or workaround memory prediction bugs

Introduce GPU Overhead env var (#5922)
Provide a mechanism for users to set aside an amount of VRAM on each GPU to make room for other applications they want to start after Ollama, or workaround memory prediction bugs
b05c9e83 · Daniel Hiltgen · GitHub · a60d9b89 · b05c9e83 · b05c9e83
Unverified Commit b05c9e83 authored Sep 05, 2024 by Daniel Hiltgen Committed by GitHub Sep 05, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 28 additions and 3 deletions

cmd/cmd.go cmd/cmd.go +1 -0

envconfig/config.go envconfig/config.go +20 -0

llm/memory.go llm/memory.go +7 -3

No files found.
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -1421,6 +1421,7 @@ func NewCLI() *cobra.Command {
 				envVars["OLLAMA_TMPDIR"],
 				envVars["OLLAMA_FLASH_ATTENTION"],
 				envVars["OLLAMA_LLM_LIBRARY"],
+				envVars["OLLAMA_GPU_OVERHEAD"],
 			})
 		default:
 			appendEnvDocs(cmd, envs)

--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -231,6 +231,25 @@ var (
 	MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
 )
+func Uint64(key string, defaultValue uint64) func() uint64 {
+	return func() uint64 {
+		if s := Var(key); s != "" {
+			if n, err := strconv.ParseUint(s, 10, 64); err != nil {
+				slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
+			} else {
+				return n
+			}
+		}
+		return defaultValue
+	}
+}
+var (
+	// Set aside VRAM per GPU
+	GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
+)
 type EnvVar struct {
 	Name        string
 	Value       any
@@ -241,6 +260,7 @@ func AsMap() map[string]EnvVar {
 	ret := map[string]EnvVar{
 		"OLLAMA_DEBUG":             {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
 		"OLLAMA_FLASH_ATTENTION":   {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
+		"OLLAMA_GPU_OVERHEAD":      {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
 		"OLLAMA_HOST":              {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
 		"OLLAMA_KEEP_ALIVE":        {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
 		"OLLAMA_LLM_LIBRARY":       {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},

--- a/llm/memory.go
+++ b/llm/memory.go
@@ -7,6 +7,7 @@ import (
 	"strings"
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
@@ -94,6 +95,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// Overflow that didn't fit into the GPU
 	var overflow uint64
+	overhead := envconfig.GpuOverhead()
 	availableList := make([]string, len(gpus))
 	for i, gpu := range gpus {
 		availableList[i] = format.HumanBytes2(gpu.FreeMemory)
@@ -164,7 +166,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			gzo = gpuZeroOverhead
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
-		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+		if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
 			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
 			continue
 		}
@@ -196,7 +198,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[i%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+layerSize {
+			if (g.g.FreeMemory - overhead) > used+layerSize {
 				gpuAllocations[g.i] += layerSize
 				layerCounts[g.i]++
 				layerCount++
@@ -219,7 +221,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[layerCount%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+memoryLayerOutput {
+			if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
 				gpuAllocations[g.i] += memoryLayerOutput
 				layerCounts[g.i]++
 				layerCount++
@@ -306,6 +308,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 }
 func (m MemoryEstimate) log() {
+	overhead := envconfig.GpuOverhead()
 	slog.Info(
 		"offload to "+m.inferenceLibrary,
 		slog.Group(
@@ -323,6 +326,7 @@ func (m MemoryEstimate) log() {
 			"memory",
 			// memory available by GPU for offloading
 			"available", m.availableList,
+			"gpu_overhead", format.HumanBytes2(overhead),
 			slog.Group(
 				"required",
 				// memory required for full offloading