darwin: no partial offloading if required memory is greater than total system memory

41a272de · Michael Yang · f3357222 · 41a272de · 41a272de · 41a272de
Commit 41a272de authored Apr 16, 2024 by Michael Yang
Showing with 17 additions and 9 deletions

gpu/gpu_darwin.go gpu/gpu_darwin.go +2 -1

gpu/gpu_info_darwin.h gpu/gpu_info_darwin.h +1 -0

gpu/gpu_info_darwin.m gpu/gpu_info_darwin.m +9 -7

llm/server.go llm/server.go +5 -1

No files found.
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -32,6 +32,7 @@ func CheckVRAM() (uint64, error) {
 		// gpu not supported, this may not be metal
 		return 0, nil
 	}
 	return uint64(C.getRecommendedMaxVRAM()), nil
 }
@@ -52,7 +53,7 @@ func GetGPUInfo() GpuInfo {
 func getCPUMem() (memInfo, error) {
 	return memInfo{
-		TotalMemory: 0,
+		TotalMemory: uint64(C.getPhysicalMemory()),
 		FreeMemory:  0,
 		DeviceCount: 0,
 	}, nil

--- a/gpu/gpu_info_darwin.h
+++ b/gpu/gpu_info_darwin.h
 #import <Metal/Metal.h>
 #include <stdint.h>
 uint64_t getRecommendedMaxVRAM();
+uint64_t getPhysicalMemory();
--- a/gpu/gpu_info_darwin.m
+++ b/gpu/gpu_info_darwin.m
-//go:build darwin
+// go:build darwin
 #include "gpu_info_darwin.h"
-uint64_t getRecommendedMaxVRAM()
-{
+// Returns the recommendedMaxWorkingSetSize reported by the system default
+// Metal device, or 0 when no Metal device is available.
+uint64_t getRecommendedMaxVRAM() {
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();
+  if (device == nil) {
+    // No Metal device on this system. CFRelease(NULL) would crash, so
+    // report zero VRAM instead of falling through to the release below.
+    return 0;
+  }
  uint64_t result = device.recommendedMaxWorkingSetSize;
+  // Release the reference obtained from MTLCreateSystemDefaultDevice.
  CFRelease(device);
  return result;
 }
+// Reports the amount of physical memory on this machine, in bytes, as
+// exposed by NSProcessInfo.
+uint64_t getPhysicalMemory() {
+  NSProcessInfo *processInfo = [NSProcessInfo processInfo];
+  return processInfo.physicalMemory;
+}
--- a/llm/server.go
+++ b/llm/server.go
@@ -108,7 +108,11 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Option
 	memoryLayerOutput := layers["output"].size()
 	memoryRequiredTotal += memoryLayerOutput
-	if memoryAvailable > memoryRequiredTotal {
+	if info.Library == "metal" && memoryRequiredTotal > info.TotalMemory {
+		// disable partial offloading when model is greater than total system memory
+		opts.NumGPU = 0
+	} else if memoryAvailable > memoryRequiredTotal {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal
 	}