offload 75% of available vram to improve stability (#921)

2665f3c2 · Bruce MacDonald · GitHub · a79f030e · 2665f3c2
Unverified Commit 2665f3c2 authored Oct 26, 2023 by Bruce MacDonald Committed by GitHub Oct 26, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 4 deletions

llm/llama.go llm/llama.go +7 -4

No files found.
--- a/llm/llama.go
+++ b/llm/llama.go
@@ -243,12 +243,15 @@ func NumGPU(numLayer, fileSizeBytes int64, opts api.Options) int {
 			return 0
 		}

-		// Calculate bytes per layer
-		// TODO: this is a rough heuristic, better would be to calculate this based on number of layers and context size
+		/*
+		 Calculate bytes per layer, this will roughly be the size of the model file divided by the number of layers.
+		 We can store the model weights and the kv cache in vram,
+		 to enable kv cache vram storage, add two additional layers to the number of layers retrieved from the model file.
+		*/
 		bytesPerLayer := fileSizeBytes / numLayer

-		// max number of layers we can fit in VRAM, subtract 8% to prevent consuming all available VRAM and running out of memory
-		layers := int(freeBytes/bytesPerLayer) * 92 / 100
+		// 75% of the absolute max number of layers we can fit in available VRAM; off-loading too many layers to the GPU can cause OOM errors
+		layers := int(freeBytes/bytesPerLayer) * 3 / 4
 		log.Printf("%d MB VRAM available, loading up to %d GPU layers", freeBytes/(1024*1024), layers)

 		return layers