only count output tensors

7bb7cb8a · Michael Yang · 5f73c087 · 7bb7cb8a
Commit 7bb7cb8a authored Apr 25, 2024 by Michael Yang
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 18 additions and 9 deletions

llm/memory.go +18 -9

No files found.
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -5,7 +5,6 @@ import (
 	"log/slog"
 	"os"
 	"strconv"
-	"strings"
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/format"
@@ -100,8 +99,22 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		return 0, 0
 	}
-	var layerCount int
 	layers := ggml.Tensors().Layers()
+	var memoryLayerOutput uint64
+	for k, v := range layers {
+		if k == "output" || k == "output_norm" {
+			memoryLayerOutput += v.size()
+		}
+	}
+	if gpus[0].Library == "metal" && opts.UseMMap {
+		// memory is preallocated for output tensors
+		memoryRequiredTotal += memoryLayerOutput
+		memoryRequiredPartial += memoryLayerOutput
+	}
+	var layerCount int
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
 		memoryLayer := layers[fmt.Sprintf("blk.%d", i)].size()
@@ -115,15 +128,11 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		}
 	}
-	var memoryLayerOutput uint64
+	if gpus[0].Library != "metal" || !opts.UseMMap {
-	for k, v := range layers {
+		// memory was not preallocated for output tensors
-		if !strings.HasPrefix(k, "blk.") {
+		memoryRequiredTotal += memoryLayerOutput
-			memoryLayerOutput += v.size()
-		}
 	}
-	memoryRequiredTotal += memoryLayerOutput
 	if memoryAvailable > memoryRequiredTotal {
 		layerCount = int(ggml.KV().BlockCount()) + 1
 		memoryRequiredPartial = memoryRequiredTotal