Commit 3dcfd5f6 authored by Jesse Gross, committed by Jesse Gross

llm: Perform eviction when num_gpu is set with new estimates

Currently, if you set num_gpu then this forces the model to
load with that number of layers in the current configuration.
This is done regardless of any other information, which means
that no eviction is performed even if another model is loaded.

This behavior is different from the old estimates (and still
happens for models that run on the llama engine). In those
cases, models would be evicted if needed to load at the requested
number of layers. That behavior is more useful and less surprising,
so this changes the new estimates to match.

Fixes #12580
parent 53a969d5
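
As a rough illustration of the intended behavior change, here is a minimal sketch under assumptions: fitLayers, errLoadRequiredFull, and the single-GPU packing below are hypothetical stand-ins, not the actual scheduler code. The point it shows is that when num_gpu requests more layers than currently fit, the new-estimate path now surfaces the shortfall so the scheduler can evict another loaded model and retry, instead of forcing the load anyway. In the real change this corresponds to assignLayers no longer forcing the requested layer count on the last GPU pass when requireFull is set, as the diff below shows.

package main

import (
    "errors"
    "fmt"
)

// errLoadRequiredFull is a stand-in for the scheduler error that signals
// "evict something and retry" (hypothetical; the real code uses ErrLoadRequiredFull).
var errLoadRequiredFull = errors.New("model does not fully fit as requested")

// fitLayers is a simplified stand-in for the layout logic: it packs layer
// sizes onto a single GPU's free memory and, when requireFull is set,
// refuses to force a requested layer count that does not actually fit.
func fitLayers(layerSizes []uint64, free uint64, requestedLayers int, requireFull bool) (int, error) {
    fitted := 0
    var used uint64
    for _, sz := range layerSizes {
        if used+sz > free {
            break
        }
        used += sz
        fitted++
    }

    if requestedLayers >= 0 {
        if fitted >= requestedLayers {
            return requestedLayers, nil
        }
        if requireFull {
            // New behavior: report the shortfall so the scheduler can evict
            // another loaded model and retry, instead of loading anyway.
            return 0, errLoadRequiredFull
        }
        // Old behavior: honor the request regardless of fit.
        return requestedLayers, nil
    }
    return fitted, nil
}

func main() {
    layers := []uint64{100, 100, 100, 100} // MiB, mirroring the new test case
    free := uint64(256)                    // MiB free on the only GPU

    if _, err := fitLayers(layers, free, 4, true); errors.Is(err, errLoadRequiredFull) {
        fmt.Println("num_gpu=4 does not fit: evict and retry")
    }
}
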
@@ -928,7 +928,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
         }
     }
-    libraryGpuLayers := assignLayers(layers, gl, s.options.NumGPU, lastUsedGPU)
+    libraryGpuLayers := assignLayers(layers, gl, requireFull, s.options.NumGPU, lastUsedGPU)
     if libraryGpuLayers.Sum() > gpuLayers.Sum() {
         gpuLayers = libraryGpuLayers
     }
@@ -994,7 +994,7 @@ nextLayer:
 }

 // assignLayers packs the maximum number of layers onto the smallest set of GPUs and comes up with a layer assignment
-func assignLayers(layers []uint64, gpus discover.GpuInfoList, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) {
+func assignLayers(layers []uint64, gpus discover.GpuInfoList, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) {
     // If we can't fit everything then prefer offloading layers other than the output layer
     for range 2 {
         // requestedLayers may be -1 if nothing was requested
@@ -1003,14 +1003,14 @@ func assignLayers(layers []uint64, gpus discover.GpuInfoList, requestedLayers in
         if !envconfig.SchedSpread() {
             for i := lastUsedGPU; i < len(gpus); i++ {
                 // Try to pack things into as few GPUs as possible
-                forceRequest := i == len(gpus)-1
+                forceRequest := i == len(gpus)-1 && !requireFull
                 gpuLayers = findBestFit(layers, gpus[:i+1], requestedLayers, forceRequest)
                 if gpuLayers.Sum() == len(layers) || gpuLayers.Sum() == requestedLayers {
                     break
                 }
             }
         } else {
-            gpuLayers = findBestFit(layers, gpus, requestedLayers, true)
+            gpuLayers = findBestFit(layers, gpus, requestedLayers, !requireFull)
         }
         // We only stop if we've gotten all of the layers - even if we got requestedLayers, we still
@@ -127,6 +127,14 @@ func TestLLMServerFitGPU(t *testing.T) {
             requireFull: true,
             expectedErr: ErrLoadRequiredFull,
         },
+        {
+            name:        "requireFull numGPU",
+            gpus:        []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+            layers:      []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
+            numGPU:      4,
+            requireFull: true,
+            expectedErr: ErrLoadRequiredFull,
+        },
     }
     for _, tt := range tests {