Unverified Commit b05c9e83 authored by Daniel Hiltgen's avatar Daniel Hiltgen Committed by GitHub
Browse files

Introduce GPU Overhead env var (#5922)

Provide a mechanism for users to set aside an amount of VRAM on each GPU
to make room for other applications they want to start after Ollama, or workaround
memory prediction bugs
parent a60d9b89
......@@ -1421,6 +1421,7 @@ func NewCLI() *cobra.Command {
envVars["OLLAMA_TMPDIR"],
envVars["OLLAMA_FLASH_ATTENTION"],
envVars["OLLAMA_LLM_LIBRARY"],
envVars["OLLAMA_GPU_OVERHEAD"],
})
default:
appendEnvDocs(cmd, envs)
......
......@@ -231,6 +231,25 @@ var (
MaxVRAM = Uint("OLLAMA_MAX_VRAM", 0)
)
func Uint64(key string, defaultValue uint64) func() uint64 {
return func() uint64 {
if s := Var(key); s != "" {
if n, err := strconv.ParseUint(s, 10, 64); err != nil {
slog.Warn("invalid environment variable, using default", "key", key, "value", s, "default", defaultValue)
} else {
return n
}
}
return defaultValue
}
}
var (
// Set aside VRAM per GPU
GpuOverhead = Uint64("OLLAMA_GPU_OVERHEAD", 0)
)
type EnvVar struct {
Name string
Value any
......@@ -241,6 +260,7 @@ func AsMap() map[string]EnvVar {
ret := map[string]EnvVar{
"OLLAMA_DEBUG": {"OLLAMA_DEBUG", Debug(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"},
"OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(), "Enabled flash attention"},
"OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"},
"OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"},
"OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"},
"OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"},
......
......@@ -7,6 +7,7 @@ import (
"strings"
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/envconfig"
"github.com/ollama/ollama/format"
"github.com/ollama/ollama/gpu"
)
......@@ -94,6 +95,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
// Overflow that didn't fit into the GPU
var overflow uint64
overhead := envconfig.GpuOverhead()
availableList := make([]string, len(gpus))
for i, gpu := range gpus {
availableList[i] = format.HumanBytes2(gpu.FreeMemory)
......@@ -164,7 +166,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
gzo = gpuZeroOverhead
}
// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
if (gpus[i].FreeMemory - overhead) < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
continue
}
......@@ -196,7 +198,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[i%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > used+layerSize {
if (g.g.FreeMemory - overhead) > used+layerSize {
gpuAllocations[g.i] += layerSize
layerCounts[g.i]++
layerCount++
......@@ -219,7 +221,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
for j := len(gpusWithSpace); j > 0; j-- {
g := gpusWithSpace[layerCount%j]
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
if g.g.FreeMemory > used+memoryLayerOutput {
if (g.g.FreeMemory - overhead) > used+memoryLayerOutput {
gpuAllocations[g.i] += memoryLayerOutput
layerCounts[g.i]++
layerCount++
......@@ -306,6 +308,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
}
func (m MemoryEstimate) log() {
overhead := envconfig.GpuOverhead()
slog.Info(
"offload to "+m.inferenceLibrary,
slog.Group(
......@@ -323,6 +326,7 @@ func (m MemoryEstimate) log() {
"memory",
// memory available by GPU for offloading
"available", m.availableList,
"gpu_overhead", format.HumanBytes2(overhead),
slog.Group(
"required",
// memory required for full offloading
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment