Unverified commit ba049026 authored by Jeffrey Morgan, committed by GitHub

fs/ggml: add multiplier in graph estimates (#11208)

parent 3944602f
@@ -555,7 +555,7 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 			// vocab graph
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)
-	case "gemma", "gemma2", "gemma3":
+	case "gemma", "gemma2", "gemma3", "gemma3n":
 		fullOffload = max(
 			4*batch*(embedding+vocab),
 			4*batch*(2+context+context*heads+2*embedding+2*embeddingHeadsK*heads),
@@ -568,6 +568,11 @@ func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType stri
 			embedding*embeddingHeadsK*heads*9/16,
 		)
+
+		if f.KV().Architecture() == "gemma3n" {
+			fullOffload *= 4
+			partialOffload *= 4
+		}
 		// Gemma2 also has sliding window attention but we only have an optimized implementation in the Ollama
 		// engine. Gemma3 always uses the Ollama engine.
 		if f.KV().Architecture() == "gemma3" {
...
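For illustration only, here is a minimal, self-contained sketch of the idea behind this change: compute a baseline graph-memory estimate and then scale both the full- and partial-offload figures by 4 when the architecture is "gemma3n", as the diff does. The function name `graphEstimate`, its parameters, and the baseline formulas below are illustrative assumptions loosely modeled on the gemma terms visible in the hunk; they are not the actual ollama `GraphSize` implementation.

```go
package main

import "fmt"

// graphEstimate is a hypothetical stand-in for ollama's GraphSize logic.
// It returns rough full- and partial-offload graph sizes in bytes and
// applies the per-architecture multiplier this commit introduces.
func graphEstimate(arch string, context, batch, embedding, heads, vocab uint64) (fullOffload, partialOffload uint64) {
	// Baseline estimates; the terms are simplified assumptions, not the real formulas.
	fullOffload = max(
		4*batch*(embedding+vocab),
		4*batch*(2+context+context*heads+2*embedding),
	)
	partialOffload = 4 * batch * embedding

	// gemma3n graphs need a larger allocation, so both estimates are scaled by 4.
	if arch == "gemma3n" {
		fullOffload *= 4
		partialOffload *= 4
	}
	return fullOffload, partialOffload
}

func main() {
	// Example values chosen arbitrarily for demonstration.
	full, partial := graphEstimate("gemma3n", 8192, 512, 2048, 8, 262144)
	fmt.Printf("fullOffload=%d bytes, partialOffload=%d bytes\n", full, partial)
}
```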