Commit 3fe74fba authored by Jesse Gross, committed by Jesse Gross

llm: Use first layer as memory buffer in estimation

This is a partial revert of 0478d440 "Fixed over vram allocation due to
small initial layer sizes."

Previously we used the size of the first layer as an extra reserved
amount of space to buffer our memory estimates. The above commit
changed this to use the largest layer. However, that hurt performance
on more models than the original commit fixed.

This is just a heuristic with no ideal solution, so this change goes
back to the historic behavior.

Fixes: #10765, #10756, #10752, #10726
parent 1a0cfd08
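
For context, here is a minimal, self-contained Go sketch contrasting the two
heuristics. It is illustrative only: the layer type and the maxLayerBuffer /
firstLayerBuffer helpers are hypothetical stand-ins for ggml.Layer and for the
inline logic in EstimateGPULayers shown in the diff below.

package main

// Requires Go 1.23+ for maps.Values / slices.Collect.
import (
	"cmp"
	"fmt"
	"maps"
	"slices"
)

// layer is a hypothetical stand-in for ggml.Layer; only Size matters here.
type layer struct{ size uint64 }

func (l layer) Size() uint64 { return l.size }

// maxLayerBuffer reserves the size of the largest layer, as commit 0478d440
// did. It never under-reserves, but on models whose later layers are larger
// than typical it over-reserves, costing offloaded layers and performance.
func maxLayerBuffer(layers map[string]layer) uint64 {
	return slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b layer) int {
		return cmp.Compare(a.Size(), b.Size())
	}).Size()
}

// firstLayerBuffer reserves the size of blk.0, the historic behavior this
// commit restores. It can under-reserve when blk.0 is unusually small.
func firstLayerBuffer(layers map[string]layer) uint64 {
	if blk0, ok := layers["blk.0"]; ok {
		return blk0.Size()
	}
	return 0
}

func main() {
	layers := map[string]layer{
		"blk.0": {size: 300 << 20}, // 300 MiB
		"blk.1": {size: 600 << 20}, // 600 MiB, the largest layer
	}
	// Prints "600 300": the max-layer heuristic reserves twice as much
	// for this (made-up) model as the first-layer heuristic does.
	fmt.Println(maxLayerBuffer(layers)>>20, firstLayerBuffer(layers)>>20)
}

Neither choice is correct for every model; the commit simply picks the variant
that regressed fewer models in practice.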
 package llm

 import (
-	"cmp"
 	"fmt"
 	"log/slog"
-	"maps"
 	"os"
-	"slices"
 	"strconv"
 	"strings"
...
@@ -125,10 +122,12 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}

 	layers := f.Tensors().GroupLayers()
-	// add one layer (chosing the max layer) worth of memory as a buffer
-	layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
-		return cmp.Compare(a.Size(), b.Size())
-	}).Size()
+	// add one layer worth of memory as a buffer
+	if blk0, ok := layers["blk.0"]; ok {
+		layerSize = blk0.Size()
+	} else {
+		slog.Warn("model missing blk.0 layer size")
+	}

 	var kvct string
 	if envconfig.FlashAttention() &&
...