Commit a2cc8571 authored by Jesse Gross's avatar Jesse Gross Committed by Jesse Gross
Browse files

llm: Consistently track unassigned model data

In some cases, if we fail to assign a piece of the model to a GPU then
we lose track of this data. Although it doesn't change the memory
allocation, it does affect the total size of the model reported by
tools such as ollama ps (and also the percent offloaded).

The bug made it look like setting num_gpu wasn't reflected in ollama ps.
The setting did take effect, but the reported offload percentage could
appear unchanged.

Spreading the model across more GPUs will continue to impact the
reported total size of the model.
parent 7edfdd2f
...@@ -216,6 +216,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -216,6 +216,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
if len(gpusWithSpace) > 0 { if len(gpusWithSpace) > 0 {
gpuZeroID = gpusWithSpace[0].i gpuZeroID = gpusWithSpace[0].i
gpuAllocations[gpuZeroID] += gpuZeroOverhead gpuAllocations[gpuZeroID] += gpuZeroOverhead
} else {
overflow += gpuZeroOverhead
} }
// For all the layers, find where they can fit on the GPU(s) // For all the layers, find where they can fit on the GPU(s)
...@@ -256,15 +258,17 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin ...@@ -256,15 +258,17 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
} }
// Determine if we need to consider output then find where it fits // Determine if we need to consider output then find where it fits
if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) { if memoryLayerOutput > 0 {
for j := len(gpusWithSpace); j > 0; j-- { if opts.NumGPU < 0 || layerCount < opts.NumGPU {
g := gpusWithSpace[layerCount%j] for j := len(gpusWithSpace); j > 0; j-- {
used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload) g := gpusWithSpace[layerCount%j]
if g.g.FreeMemory > overhead+used+memoryLayerOutput { used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
gpuAllocations[g.i] += memoryLayerOutput if g.g.FreeMemory > overhead+used+memoryLayerOutput {
layerCounts[g.i]++ gpuAllocations[g.i] += memoryLayerOutput
layerCount++ layerCounts[g.i]++
break layerCount++
break
}
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment