Unverified commit ba049026 authored by Jeffrey Morgan, committed by GitHub

fs/ggml: add multiplier in graph estimates (#11208)

parent 3944602f
@@ -555,7 +555,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 			// vocab graph
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)
-	case "gemma", "gemma2", "gemma3":
+	case "gemma", "gemma2", "gemma3", "gemma3n":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
 			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
@@ -568,6 +568,11 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 			embedding*embeddingHeadsK*heads*9/16,
 		)
+
+		if f.KV().Architecture() == "gemma3n" {
+			fullOffload *= 4
+			partialOffload *= 4
+		}
 		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
 		// engine. Gemma3 always uses the Ollama engine.
 		if f.KV().Architecture() == "gemma3" {
...
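For illustration only, here is a minimal, self-contained sketch of the idea behind this change: compute a baseline graph-memory estimate and then scale both the full- and partial-offload figures by 4 when the architecture is "gemma3n", as the diff does. The function name `graphEstimate`, its parameters, and the baseline formulas below are illustrative assumptions loosely modeled on the gemma terms visible in the hunk; they are not the actual ollama `GraphSize` implementation.

```go
package main

import "fmt"

// graphEstimate is a hypothetical stand-in for ollama's GraphSize logic.
// It returns rough full- and partial-offload graph sizes in bytes and
// applies the per-architecture multiplier this commit introduces.
func graphEstimate(arch string, context, batch, embedding, heads, vocab uint64) (fullOffload, partialOffload uint64) {
	// Baseline estimates; the terms are simplified assumptions, not the real formulas.
	fullOffload = max(
		4*batch*(embedding+vocab),
		4*batch*(2+context+context*heads+2*embedding),
	)
	partialOffload = 4 * batch * embedding

	// gemma3n graphs need a larger allocation, so both estimates are scaled by 4.
	if arch == "gemma3n" {
		fullOffload *= 4
		partialOffload *= 4
	}
	return fullOffload, partialOffload
}

func main() {
	// Example values chosen arbitrarily for demonstration.
	full, partial := graphEstimate("gemma3n", 8192, 512, 2048, 8, 262144)
	fmt.Printf("fullOffload=%d bytes, partialOffload=%d bytes\n", full, partial)
}
```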