Handle models with divergent layer sizes

The recent refactoring of the memory prediction assumed that all layers are the same size. For some models (such as deepseek-coder-v2) this is not the case, so our predictions were significantly off.

Handle models with divergent layer sizes
The recent refactoring of the memory prediction assumed that all layers are the same size. For some models (such as deepseek-coder-v2) this is not the case, so our predictions were significantly off.
359b15a5 · Daniel Hiltgen · b55958a5 · 359b15a5
Commit 359b15a5 authored Jun 18, 2024 by Daniel Hiltgen
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 0 deletions

llm/memory.go llm/memory.go +6 -0

No files found.
--- a/llm/memory.go
+++ b/llm/memory.go
 package llm
 import (
+	"fmt"
 	"log/slog"
 	"strconv"
 	"strings"
@@ -179,6 +180,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	// For all the layers, find where they can fit on the GPU(s)
 	for i := range int(ggml.KV().BlockCount()) {
+		// Some models have inconsistent layer sizes
+		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
+			layerSize = blk.size()
+			layerSize += kv / ggml.KV().BlockCount()
+		}
 		memoryWeights += layerSize
 		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {