Commit 172b5924 authored by Jesse Gross's avatar Jesse Gross Committed by Jesse Gross
Browse files

llm: Avoid integer underflow on llama engine memory layout

On the llama engine, when we compute the memory layout, we reserve
a buffer to allow for some flexibility in case of incorrect estimates.
This buffer is subtracted from the GPU's free memory, and on GPUs with
limited memory the subtraction may underflow the unsigned value.

Fixes #13494
parent 8852220f
...@@ -524,8 +524,13 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system ...@@ -524,8 +524,13 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
// Use the size of one layer as a buffer // Use the size of one layer as a buffer
layers := s.ggml.Tensors().GroupLayers() layers := s.ggml.Tensors().GroupLayers()
if blk0, ok := layers["blk.0"]; ok { if blk0, ok := layers["blk.0"]; ok {
buffer := blk0.Size() + kv[0]
for i := range gpus { for i := range gpus {
gpus[i].FreeMemory -= blk0.Size() + kv[0] if gpus[i].FreeMemory > buffer {
gpus[i].FreeMemory -= buffer
} else {
gpus[i].FreeMemory = 0
}
} }
} else { } else {
slog.Warn("model missing blk.0 layer size") slog.Warn("model missing blk.0 layer size")
...@@ -575,7 +580,11 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system ...@@ -575,7 +580,11 @@ func (s *llamaServer) Load(ctx context.Context, systemInfo ml.SystemInfo, system
projectorGPU = firstIntegrated projectorGPU = firstIntegrated
} }
if gpus[projectorGPU].FreeMemory > projectorWeights {
gpus[projectorGPU].FreeMemory -= projectorWeights gpus[projectorGPU].FreeMemory -= projectorWeights
} else {
gpus[projectorGPU].FreeMemory = 0
}
} }
var kvTotal uint64 var kvTotal uint64
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment