Commit 172b5924 authored by Jesse Gross's avatar Jesse Gross Committed by Jesse Gross
Browse files

llm: Avoid integer underflow on llama engine memory layout

On the llama engine, when we compute the memory layout, we reserve
a buffer to allow for some flexibility for incorrect estimates.
This is subtracted from GPU free memory and on GPUs with limited
memory, it may underflow.

Fixes #13494
parent 8852220f
......@@ -524,8 +524,13 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
// Use the size of one layer as a buffer
layers := s.ggml.Tensors().GroupLayers()
if blk0, ok := layers["blk.0"]; ok {
buffer := blk0.Size() + kv[0]
for i := range gpus {
gpus[i].FreeMemory -= blk0.Size() + kv[0]
if gpus[i].FreeMemory > buffer {
gpus[i].FreeMemory -= buffer
} else {
gpus[i].FreeMemory = 0
}
}
} else {
slog.Warn("model missing blk.0 layer size")
......@@ -575,7 +580,11 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
projectorGPU = firstIntegrated
}
if gpus[projectorGPU].FreeMemory > projectorWeights {
gpus[projectorGPU].FreeMemory -= projectorWeights
} else {
gpus[projectorGPU].FreeMemory = 0
}
}
var kvTotal uint64
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment