gpu: add 512MiB to darwin minimum, metal doesn't have partial offloading overhead (#4068)

f0c454ab · Jeffrey Morgan · GitHub · b9f74ff3 · f0c454ab · f0c454ab
Unverified Commit f0c454ab authored May 01, 2024 by Jeffrey Morgan Committed by GitHub May 01, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 12 additions and 1 deletion

gpu/gpu_darwin.go gpu/gpu_darwin.go +7 -1

llm/memory.go llm/memory.go +5 -0

No files found.
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -10,6 +10,12 @@ package gpu
 import "C"
 import (
 	"runtime"
+
+	"github.com/ollama/ollama/format"
+)
+
+const (
+	metalMinimumMemory = 512 * format.MebiByte
 )

 func GetGPUInfo() GpuInfoList {
@@ -32,7 +38,7 @@ func GetGPUInfo() GpuInfoList {
 	// TODO is there a way to gather actual allocated video memory? (currentAllocatedSize doesn't work)
 	info.FreeMemory = info.TotalMemory

-	info.MinimumMemory = 0
+	info.MinimumMemory = metalMinimumMemory
 	return []GpuInfo{info}
 }


--- a/llm/memory.go
+++ b/llm/memory.go
@@ -88,6 +88,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	graphFullOffload *= uint64(len(gpus))
 	graphPartialOffload *= uint64(len(gpus))

+	// Metal has no extra overhead for partial offloading, so the partial-offload graph size equals the full-offload one
+	if gpus[0].Library == "metal" {
+		graphPartialOffload = graphFullOffload
+	}
+
 	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
 	memoryRequiredTotal := memoryMinimum + graphFullOffload