Commit eb10390d authored by Jesse Gross, committed by Jesse Gross

llm: Enable new memory estimates by default

New memory estimates (see #11090 for more information) are now
enabled automatically for all models running on the Ollama engine,
improving both stability and performance through more accurate sizing
and allocation. Models running on the llama engine will continue to
use the original style of memory estimation.
parent feb18cd7
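
In effect, the engine choice alone now selects the estimation path. Below is a minimal, self-contained Go sketch of that dispatch — the stub types and the newServer helper are hypothetical stand-ins for the real code at the tail of NewLlamaServer in the diff that follows, not the actual llm package:

package main

import "fmt"

// Hypothetical stand-ins for the server types in llm/server.go.
type llmServer struct{}
type ollamaServer struct{ llmServer } // Ollama engine: new memory estimates
type llamaServer struct{ llmServer }  // llama engine: original estimates

// newServer mirrors the dispatch this commit makes unconditional: having a
// text processor (i.e. running on the Ollama engine) is now the only thing
// that selects the new memory estimates.
func newServer(textProcessor any) any {
	if textProcessor != nil {
		return &ollamaServer{}
	}
	return &llamaServer{}
}

func main() {
	fmt.Printf("%T\n", newServer(struct{}{})) // *main.ollamaServer
	fmt.Printf("%T\n", newServer(nil))        // *main.llamaServer
}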
@@ -185,8 +185,6 @@ var (
 	ContextLength = Uint("OLLAMA_CONTEXT_LENGTH", 4096)
 	// Auth enables authentication between the Ollama client and server
 	UseAuth = Bool("OLLAMA_AUTH")
-	// Enable the new memory estimation logic
-	NewMemoryEstimates = Bool("OLLAMA_NEW_ESTIMATES")
 )
 
 func String(s string) func() string {
@@ -272,7 +270,6 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
 		"OLLAMA_CONTEXT_LENGTH":  {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4096)"},
 		"OLLAMA_NEW_ENGINE":      {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"},
-		"OLLAMA_NEW_ESTIMATES":   {"OLLAMA_NEW_ESTIMATES", NewMemoryEstimates(), "Enable the new memory estimation logic"},
 
 		// Informational
 		"HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"},
@@ -162,11 +162,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		}
 	}
 
-	newEstimates := textProcessor != nil && envconfig.NewMemoryEstimates()
-	if newEstimates {
-		slog.Info("enabling new memory estimates")
-	}
-
 	// Verify the requested context size is <= the model training size
 	trainCtx := f.KV().ContextLength()
 	if opts.NumCtx > int(trainCtx) && trainCtx > 0 {
@@ -434,7 +429,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, modelPath string, f *ggml.GGML, a
 		}
 	}()
 
-	if newEstimates {
+	if textProcessor != nil {
 		return &ollamaServer{llmServer: s}, nil
 	} else {
 		return &llamaServer{llmServer: s, ggml: f}, nil