Merge pull request #6260 from ollama/mxyng/mem

llama3.1 memory

Merge pull request #6260 from ollama/mxyng/mem
llama3.1 memory
bf612cd6 · Michael Yang · GitHub · ef98e561 · 2003d601 · bf612cd6
Unverified commit bf612cd6, authored Sep 05, 2024 by Michael Yang; committed by GitHub on Sep 05, 2024
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 4 additions and 2 deletions

llm/ggml.go +4 -2

No files found.
--- a/llm/ggml.go
+++ b/llm/ggml.go
@@ -360,11 +360,13 @@ func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload ui

 	switch llm.KV().Architecture() {
 	case "llama":
-		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
+		fullOffload = max(
+			4*batch*(1+4*embedding+context*(1+heads)),
+			4*batch*(embedding+vocab),
+		)

 		partialOffload = 4 * batch * embedding
 		partialOffload += max(
-			// 4*batch*(4+6*embedding+context*(2*heads)+llm.KV().GQA()),
 			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embeddingHeads*headsKV),
 			4*batch*(embedding+vocab)+embedding*vocab*105/128,
 		)