Commit 3dcfd5f6 authored by Jesse Gross, committed by Jesse Gross

llm: Perform eviction when num_gpu is set with new estimates

Currently, if you set num_gpu then this forces the model to
load with that number of layers in the current configuration.
This is done regardless of any other information, which means
that no eviction is performed even if another model is loaded.

This behavior is different from the old estimates (and still
happens for models that run on the llama engine). In those
cases, models would be evicted if needed to load at the requested
number of layers. That behavior is more useful and less surprising,
so this changes the new estimates to match.

Fixes #12580
parent 53a969d5
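
As a rough illustration of the intended behavior change, here is a minimal sketch under assumptions: fitLayers, errLoadRequiredFull, and the single-GPU packing below are hypothetical stand-ins, not the actual scheduler code. The point it shows is that when num_gpu requests more layers than currently fit, the new-estimate path now surfaces the shortfall so the scheduler can evict another loaded model and retry, instead of forcing the load anyway. In the real change this corresponds to assignLayers no longer forcing the requested layer count on the last GPU pass when requireFull is set, as the diff below shows.

package main

import (
    "errors"
    "fmt"
)

// errLoadRequiredFull is a stand-in for the scheduler error that signals
// "evict something and retry" (hypothetical; the real code uses ErrLoadRequiredFull).
var errLoadRequiredFull = errors.New("model does not fully fit as requested")

// fitLayers is a simplified stand-in for the layout logic: it packs layer
// sizes onto a single GPU's free memory and, when requireFull is set,
// refuses to force a requested layer count that does not actually fit.
func fitLayers(layerSizes []uint64, free uint64, requestedLayers int, requireFull bool) (int, error) {
    fitted := 0
    var used uint64
    for _, sz := range layerSizes {
        if used+sz > free {
            break
        }
        used += sz
        fitted++
    }

    if requestedLayers >= 0 {
        if fitted >= requestedLayers {
            return requestedLayers, nil
        }
        if requireFull {
            // New behavior: report the shortfall so the scheduler can evict
            // another loaded model and retry, instead of loading anyway.
            return 0, errLoadRequiredFull
        }
        // Old behavior: honor the request regardless of fit.
        return requestedLayers, nil
    }
    return fitted, nil
}

func main() {
    layers := []uint64{100, 100, 100, 100} // MiB, mirroring the new test case
    free := uint64(256)                    // MiB free on the only GPU

    if _, err := fitLayers(layers, free, 4, true); errors.Is(err, errLoadRequiredFull) {
        fmt.Println("num_gpu=4 does not fit: evict and retry")
    }
}
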
@@ -928,7 +928,7 @@ func (s *ollamaServer) createLayout(systemInfo discover.SystemInfo, systemGPUs d
         }
     }
-    libraryGpuLayers := assignLayers(layers, gl, s.options.NumGPU, lastUsedGPU)
+    libraryGpuLayers := assignLayers(layers, gl, requireFull, s.options.NumGPU, lastUsedGPU)
     if libraryGpuLayers.Sum() > gpuLayers.Sum() {
         gpuLayers = libraryGpuLayers
     }
@@ -994,7 +994,7 @@ nextLayer:
 }

 // assignLayers packs the maximum number of layers onto the smallest set of GPUs and comes up with a layer assignment
-func assignLayers(layers []uint64, gpus discover.GpuInfoList, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) {
+func assignLayers(layers []uint64, gpus discover.GpuInfoList, requireFull bool, requestedLayers int, lastUsedGPU int) (gpuLayers ml.GPULayersList) {
     // If we can't fit everything then prefer offloading layers other than the output layer
     for range 2 {
         // requestedLayers may be -1 if nothing was requested
@@ -1003,14 +1003,14 @@ func assignLayers(layers []uint64, gpus discover.GpuInfoList, requestedLayers in
         if !envconfig.SchedSpread() {
             for i := lastUsedGPU; i < len(gpus); i++ {
                 // Try to pack things into as few GPUs as possible
-                forceRequest := i == len(gpus)-1
+                forceRequest := i == len(gpus)-1 && !requireFull
                 gpuLayers = findBestFit(layers, gpus[:i+1], requestedLayers, forceRequest)
                 if gpuLayers.Sum() == len(layers) || gpuLayers.Sum() == requestedLayers {
                     break
                 }
             }
         } else {
-            gpuLayers = findBestFit(layers, gpus, requestedLayers, true)
+            gpuLayers = findBestFit(layers, gpus, requestedLayers, !requireFull)
         }
         // We only stop if we've gotten all of the layers - even if we got requestedLayers, we still
@@ -127,6 +127,14 @@ func TestLLMServerFitGPU(t *testing.T) {
             requireFull: true,
             expectedErr: ErrLoadRequiredFull,
         },
+        {
+            name:        "requireFull numGPU",
+            gpus:        []gpu{{id: ml.DeviceID{ID: "gpu0"}, free: 256 * format.MebiByte}},
+            layers:      []int{100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte, 100 * format.MebiByte},
+            numGPU:      4,
+            requireFull: true,
+            expectedErr: ErrLoadRequiredFull,
+        },
     }
     for _, tt := range tests {