Merge pull request #3684 from ollama/mxyng/scale-graph

scale graph based on gpu count

Merge pull request #3684 from ollama/mxyng/scale-graph
scale graph based on gpu count
fb9580df · Michael Yang · GitHub · 7c9792a6 · 26df6747 · fb9580df
Unverified commit fb9580df, authored Apr 16, 2024 by Michael Yang; committed by GitHub on Apr 16, 2024.
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 4 additions and 1 deletion

gpu/gpu_darwin.go +1 -1

llm/server.go +3 -0

No files found.
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -55,6 +55,6 @@ func getCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
 		FreeMemory:  0,
-		DeviceCount: 0,
+		DeviceCount: 1,
 	}, nil
 }
--- a/llm/server.go
+++ b/llm/server.go
@@ -79,6 +79,9 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 		graphFullOffload = graphPartialOffload
 	}
+	graphFullOffload *= uint64(info.DeviceCount)
+	graphPartialOffload *= uint64(info.DeviceCount)
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
 	memoryRequiredTotal := memoryMinimum + graphFullOffload