OpenDAS / ollama — commit d6dd2ff8 (unverified)
Authored Apr 01, 2024 by Michael Yang; committed via GitHub on Apr 01, 2024
Parents: e57a6ba8, 91b3e4d2

Merge pull request #3241 from ollama/mxyng/mem

    update memory estimations for gpu offloading

Showing 7 changed files with 121 additions and 85 deletions (+121 -85):
  format/bytes.go        +16  -1
  gpu/gpu.go             +11  -14
  gpu/types.go            +3  -0
  llm/dyn_ext_server.go   +2  -2
  llm/ggml.go            +11  -0
  llm/llm.go             +73  -63
  server/routes.go        +5  -5
format/bytes.go

@@ -6,11 +6,15 @@ import (
 )
 
 const (
-	Byte = 1
+	Byte = 1
+
 	KiloByte = Byte * 1000
 	MegaByte = KiloByte * 1000
 	GigaByte = MegaByte * 1000
 	TeraByte = GigaByte * 1000
+
+	KibiByte = Byte * 1024
+	MebiByte = KibiByte * 1024
 )
 
 func HumanBytes(b int64) string {

@@ -45,3 +49,14 @@ func HumanBytes(b int64) string {
 		return fmt.Sprintf("%d %s", int(value), unit)
 	}
 }
+
+func HumanBytes2(b int64) string {
+	switch {
+	case b >= MebiByte:
+		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
+	case b >= KibiByte:
+		return fmt.Sprintf("%.1f KiB", float64(b)/KibiByte)
+	default:
+		return fmt.Sprintf("%d B", b)
+	}
+}
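The new binary-unit constants and HumanBytes2 sit alongside the existing decimal HumanBytes helper. A minimal usage sketch, assuming the import path shown in the gpu/gpu.go hunk below and the pre-existing decimal behaviour of HumanBytes (output comments are approximate):

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/format"
)

func main() {
	// 377 * MebiByte is the same quantity used for the minimum-memory
	// constants introduced in gpu/gpu.go.
	n := int64(377 * format.MebiByte)

	fmt.Println(format.HumanBytes(n))  // decimal units, roughly "395 MB"
	fmt.Println(format.HumanBytes2(n)) // binary units: "377.0 MiB"
}
```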
gpu/gpu.go

@@ -20,6 +20,8 @@ import (
 	"strings"
 	"sync"
 	"unsafe"
+
+	"github.com/ollama/ollama/format"
 )
 
 type handles struct {

@@ -27,6 +29,11 @@ type handles struct {
 	cudart *C.cudart_handle_t
 }
 
+const (
+	cudaMinimumMemory = 377 * format.MebiByte
+	rocmMinimumMemory = 377 * format.MebiByte
+)
+
 var gpuMutex sync.Mutex
 
 var gpuHandles *handles = nil

@@ -168,6 +175,7 @@ func GetGPUInfo() GpuInfo {
 		} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 			slog.Info(fmt.Sprintf("[nvidia-ml] NVML CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 			resp.Library = "cuda"
+			resp.MinimumMemory = cudaMinimumMemory
 		} else {
 			slog.Info(fmt.Sprintf("[nvidia-ml] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 		}

@@ -187,6 +195,7 @@ func GetGPUInfo() GpuInfo {
 		} else if cc.major > CudaComputeMin[0] || (cc.major == CudaComputeMin[0] && cc.minor >= CudaComputeMin[1]) {
 			slog.Info(fmt.Sprintf("[cudart] CUDART CUDA Compute Capability detected: %d.%d", cc.major, cc.minor))
 			resp.Library = "cuda"
+			resp.MinimumMemory = cudaMinimumMemory
 		} else {
 			slog.Info(fmt.Sprintf("[cudart] CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 		}

@@ -194,6 +203,7 @@ func GetGPUInfo() GpuInfo {
 	} else {
 		AMDGetGPUInfo(&resp)
 		if resp.Library != "" {
+			resp.MinimumMemory = rocmMinimumMemory
 			return resp
 		}
 	}

@@ -239,20 +249,7 @@ func CheckVRAM() (int64, error) {
 	}
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// leave 10% or 1024MiB of VRAM free per GPU to handle unaccounted for overhead
-		overhead := gpuInfo.FreeMemory / 10
-		gpus := uint64(gpuInfo.DeviceCount)
-		if overhead < gpus*1024*1024*1024 {
-			overhead = gpus * 1024 * 1024 * 1024
-		}
-		// Assigning full reported free memory for Tegras due to OS controlled caching.
-		if CudaTegra != "" {
-			// Setting overhead for non-Tegra devices
-			overhead = 0
-		}
-		avail := int64(gpuInfo.FreeMemory - overhead)
-		slog.Debug(fmt.Sprintf("%s detected %d devices with %dM available memory", gpuInfo.Library, gpuInfo.DeviceCount, avail/1024/1024))
-		return avail, nil
+		return int64(gpuInfo.FreeMemory), nil
 	}
 
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
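The net effect of this file's changes: CheckVRAM no longer subtracts a heuristic overhead (10% of free VRAM, floored at 1 GiB per device) and instead reports the full free memory, while a fixed 377 MiB floor per supported library is exposed through the new GpuInfo.MinimumMemory field and accounted for by the caller in llm/llm.go. A small sketch of the arithmetic, using a hypothetical single GPU reporting 8 GiB free (numbers are illustrative only):

```go
package main

import "fmt"

const gib = 1024 * 1024 * 1024

func main() {
	free := int64(8 * gib) // hypothetical reported free VRAM

	// Old heuristic removed in this commit: reserve 10% of free VRAM,
	// but at least 1 GiB per device.
	overhead := free / 10
	if overhead < 1*gib {
		overhead = 1 * gib
	}
	fmt.Println("old CheckVRAM result:", free-overhead) // 7 GiB

	// New behaviour: return the full reported free memory; the fixed
	// 377 MiB minimum is surfaced via GpuInfo.MinimumMemory instead.
	minimumMemory := int64(377 * 1024 * 1024)
	fmt.Println("new CheckVRAM result:", free)
	fmt.Println("memory considered used up front:", minimumMemory)
}
```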
gpu/types.go

@@ -14,6 +14,9 @@ type GpuInfo struct {
 	// Optional variant to select (e.g. versions, cpu feature flags)
 	Variant string `json:"variant,omitempty"`
 
+	// MinimumMemory represents the minimum memory required to use the GPU
+	MinimumMemory int64 `json:"-"`
+
 	// TODO add other useful attributes about the card here for discovery information
 }
llm/dyn_ext_server.go

@@ -39,7 +39,7 @@ import (
 
 type dynExtServer struct {
 	s       C.struct_dynamic_llama_server
-	options api.Options
+	options *api.Options
 }
 
 // Note: current implementation does not support concurrent instantiations

@@ -64,7 +64,7 @@ func extServerResponseToErr(resp C.ext_server_resp_t) error {
 	return fmt.Errorf(C.GoString(resp.msg))
 }
 
-func newDynExtServer(library, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newDynExtServer(library, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	if !mutex.TryLock() {
 		slog.Info("concurrent llm servers not yet supported, waiting for prior server to complete")
 		mutex.Lock()
llm/ggml.go

@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	"strings"
 )
 
 type GGML struct {

@@ -12,6 +13,16 @@ type GGML struct {
 	model
 }
 
+func (ggml *GGML) LayerSize(prefix string) (n int64) {
+	for _, t := range ggml.Tensors() {
+		if strings.HasPrefix(t.Name, prefix) {
+			n += int64(t.size())
+		}
+	}
+
+	return
+}
+
 const (
 	fileTypeF32 uint32 = iota
 	fileTypeF16
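LayerSize sums the sizes of all tensors whose names share a prefix, which is what llm/llm.go uses below to cost individual transformer blocks ("blk.N.") and the output layer ("output."). A standalone sketch of the same prefix-sum idea, using a hypothetical in-memory tensor list rather than a decoded GGUF file:

```go
package main

import (
	"fmt"
	"strings"
)

// tensor is a stand-in for the GGML tensor entries with a name and a byte size.
type tensor struct {
	name string
	size int64
}

// layerSize mirrors (*GGML).LayerSize: sum the bytes of tensors matching a prefix.
func layerSize(tensors []tensor, prefix string) (n int64) {
	for _, t := range tensors {
		if strings.HasPrefix(t.name, prefix) {
			n += t.size
		}
	}
	return
}

func main() {
	// Made-up sizes, for illustration only.
	tensors := []tensor{
		{"blk.0.attn_q.weight", 33554432},
		{"blk.0.ffn_up.weight", 90177536},
		{"blk.1.attn_q.weight", 33554432},
		{"output.weight", 262144000},
	}

	fmt.Println(layerSize(tensors, "blk.0."))  // bytes of the first repeating layer
	fmt.Println(layerSize(tensors, "output.")) // the output layer, costed separately in llm/llm.go
}
```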
llm/llm.go

@@ -5,10 +5,11 @@ import (
 	"fmt"
 	"log/slog"
 	"os"
-	"runtime"
 	"slices"
+	"strings"
 
 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
 )
 

@@ -24,7 +25,7 @@ var cpuOnlyFamilies = []string{
 	"mamba",
 }
 
-func New(model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func New(model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	if _, err := os.Stat(model); err != nil {
 		return nil, err
 	}

@@ -35,7 +36,7 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 	}
 	defer f.Close()
 
-	ggml, size, err := DecodeGGML(f)
+	ggml, _, err := DecodeGGML(f)
 	if err != nil {
 		return nil, err
 	}
@@ -49,84 +50,93 @@ func New(model string, adapters, projectors []string, opts api.Options) (LLM, er
 		opts.NumCtx = 4
 	}
 
-	vram, _ := gpu.CheckVRAM()
+	availableMemory, _ := gpu.CheckVRAM()
+	info := gpu.GetGPUInfo()
+
+	usedMemory := info.MinimumMemory
+	for _, projector := range projectors {
+		usedMemory += projectorMemoryRequirements(projector)
+
+		// multimodal models require at least 2048 context
+		opts.NumCtx = max(opts.NumCtx, 2048)
+	}
 
-	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) * int64(ggml.KV().HeadCountKV()) / int64(ggml.KV().HeadCount())
+	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.KV().BlockCount()) * int64(ggml.KV().EmbeddingLength()) / int64(ggml.KV().HeadCount()) * int64(ggml.KV().HeadCountKV())
 
 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calculations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
 	graph := int64(ggml.KV().GQA()) * kv / 6
 
-	if slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
-		opts.NumGPU = 0
+	usedMemory += graph
+	if usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+		info.Library = "cpu"
 	}
 
-	info := gpu.GetGPUInfo()
-	switch runtime.GOOS {
-	case "darwin":
-		if opts.NumGPU == 0 {
-			break
-		}
-
-		if size+kv+graph > vram {
-			slog.Info("not enough vram available, setting num_gpu=0")
-			opts.NumGPU = 0
-			break
-		}
-
-		// TODO: implement layer splitting on macOS
-		opts.NumGPU = 999
-	default:
-		if info.Library == "cpu" {
-			slog.Info("GPU not available, falling back to CPU")
-			opts.NumGPU = 0
-			break
-		}
-
-		// don't use GPU at all if no layers are loaded
-		if opts.NumGPU == 0 {
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
-			break
-		}
-
-		// user-defined GPU count
-		if opts.NumGPU != -1 {
-			break
-		}
-
-		// the "main" GPU needs the most memory and determines the limit
-		// of how many layers can be loaded. It needs to fit:
-		// 1. the full compute graph allocation for all devices (graph)
-		// 2. the proportional kv cache for all devices (kv * % layers)
-		// 3. the proportional model (size * % layers / # devices)
-		// This estimates the number of layers
-		maxlayers := int64(ggml.KV().BlockCount()) + 1
-		devices := int64(info.DeviceCount)
-		avg := vram / devices
-		layers := maxlayers * (avg - graph) / (kv + size/devices)
-		if layers > maxlayers {
-			layers = maxlayers
-		}
-
-		// 1 + 2 must fit on the main gpu
-		min := graph + kv*layers/maxlayers
-		if layers <= 0 || min > avg {
-			slog.Info("not enough vram available, falling back to CPU only")
-			info.Library = "cpu"
-			info.Variant = gpu.GetCPUVariant()
-			opts.NumGPU = 0
-			break
-		}
-
-		opts.NumGPU = int(layers)
-	}
-
-	opts.RopeFrequencyBase = 0.0
-	opts.RopeFrequencyScale = 0.0
+	requiredMemory := usedMemory
+
+	var layers int
+	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
+		layerMemory := ggml.LayerSize(fmt.Sprintf("blk.%d.", i)) + kv/int64(ggml.KV().BlockCount())
+		requiredMemory += layerMemory
+
+		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
+			usedMemory += layerMemory
+			layers++
+		}
+	}
+
+	memOutputLayer := ggml.LayerSize("output.")
+	requiredMemory += memOutputLayer
+
+	// only offload output layer if all repeating layers are offloaded
+	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
+		usedMemory += memOutputLayer
+		layers++
+	}
+
+	slog.Info(
+		"offload to gpu",
+		"layers", layers,
+		"required", format.HumanBytes2(requiredMemory),
+		"used", format.HumanBytes2(usedMemory),
+		"available", format.HumanBytes2(availableMemory),
+		"kv", format.HumanBytes2(kv),
+		"graph", format.HumanBytes2(graph),
+	)
+
+	if opts.NumGPU < 0 && info.Library != "cpu" {
+		opts.NumGPU = layers
+	}
+
 	return newLlmServer(info, model, adapters, projectors, opts)
 }
 
+func projectorMemoryRequirements(filename string) int64 {
+	file, err := os.Open(filename)
+	if err != nil {
+		return 0
+	}
+	defer file.Close()
+
+	ggml, _, err := DecodeGGML(file)
+	if err != nil {
+		return 0
+	}
+
+	prefixes := make(map[string]struct{})
+	for _, layer := range ggml.Tensors() {
+		parts := strings.Split(layer.Name, ".")
+		prefixes[strings.Join(parts[:2], ".")] = struct{}{}
+	}
+
+	var ask int64
+	for prefix := range prefixes {
+		ask += ggml.LayerSize(prefix)
+	}
+
+	return ask
+}
+
 // Give any native cgo implementations an opportunity to initialize
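To make the kv-cache and graph estimates above concrete, here is a worked example with hypothetical Llama-7B-style hyperparameters; in the real code these values come from ggml.KV(), the numbers below are illustrative only:

```go
package main

import "fmt"

func main() {
	var (
		numCtx          int64 = 2048 // opts.NumCtx
		blockCount      int64 = 32   // ggml.KV().BlockCount()
		embeddingLength int64 = 4096 // ggml.KV().EmbeddingLength()
		headCount       int64 = 32   // ggml.KV().HeadCount()
		headCountKV     int64 = 32   // ggml.KV().HeadCountKV()
	)

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := 2 * 2 * numCtx * blockCount * embeddingLength / headCount * headCountKV

	// graph estimate: 1/6 * kv_cache_size * num_gqa, with GQA = n_head / n_head_kv
	gqa := headCount / headCountKV
	graph := gqa * kv / 6

	fmt.Printf("kv cache: %d bytes (~%.1f MiB)\n", kv, float64(kv)/(1024*1024))       // ~1024.0 MiB
	fmt.Printf("graph:    %d bytes (~%.1f MiB)\n", graph, float64(graph)/(1024*1024)) // ~170.7 MiB
}
```

For a grouped-query-attention model, HeadCountKV is smaller than HeadCount, so the kv term shrinks while the GQA multiplier in the graph estimate compensates.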
@@ -134,7 +144,7 @@ func Init() error {
 	return nativeInit()
 }
 
-func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts api.Options) (LLM, error) {
+func newLlmServer(gpuInfo gpu.GpuInfo, model string, adapters, projectors []string, opts *api.Options) (LLM, error) {
 	dynLibs := getDynLibs(gpuInfo)
 
 	// Check to see if the user has requested a specific library instead of auto-detecting
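The offload decision in the new New is a greedy per-layer loop: each repeating block is costed as its tensor bytes plus a proportional share of the kv cache, and layers are assigned to the GPU until the next layer would exceed available memory or a user-requested NumGPU count is reached. A toy simulation of that loop with made-up sizes, to show how the layers count falls out (all inputs are hypothetical):

```go
package main

import "fmt"

func main() {
	const mib = 1024 * 1024

	// Hypothetical inputs: 32 repeating blocks of ~170 MiB each plus a
	// per-layer share of a 1 GiB kv cache, and 6 GiB of available VRAM.
	blockCount := int64(32)
	layerBytes := int64(170 * mib)
	kv := int64(1024 * mib)
	availableMemory := int64(6 * 1024 * mib)
	usedMemory := int64(377 * mib) // stand-in for GpuInfo.MinimumMemory
	numGPU := int64(-1)            // -1 means "let the estimator decide"

	var layers int64
	for i := int64(0); i < blockCount; i++ {
		layerMemory := layerBytes + kv/blockCount
		if availableMemory > usedMemory+layerMemory && (numGPU < 0 || layers < numGPU) {
			usedMemory += layerMemory
			layers++
		}
	}

	fmt.Printf("offloaded %d/%d layers, used %.1f MiB of %.1f MiB\n",
		layers, blockCount, float64(usedMemory)/mib, float64(availableMemory)/mib)
}
```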
server/routes.go

@@ -68,7 +68,7 @@ var loaded struct {
 var defaultSessionDuration = 5 * time.Minute
 
 // load a model into memory if it is not already loaded, it is up to the caller to lock loaded.mu before calling this function
-func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.Duration) error {
+func load(c *gin.Context, model *Model, opts *api.Options, sessionDuration time.Duration) error {
 	needLoad := loaded.runner == nil || // is there a model loaded?
 		loaded.ModelPath != model.ModelPath || // has the base model changed?
 		!reflect.DeepEqual(loaded.AdapterPaths, model.AdapterPaths) || // have the adapters changed?

@@ -97,7 +97,7 @@ func load(c *gin.Context, model *Model, opts api.Options, sessionDuration time.D
 		loaded.Model = model
 		loaded.runner = llmRunner
 
-		loaded.Options = &opts
+		loaded.Options = opts
 	}
 
 	loaded.expireAt = time.Now().Add(sessionDuration)

@@ -214,7 +214,7 @@ func GenerateHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	if err := load(c, model, &opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}

@@ -460,7 +460,7 @@ func EmbeddingsHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	if err := load(c, model, &opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}

@@ -1267,7 +1267,7 @@ func ChatHandler(c *gin.Context) {
 		sessionDuration = req.KeepAlive.Duration
 	}
 
-	if err := load(c, model, opts, sessionDuration); err != nil {
+	if err := load(c, model, &opts, sessionDuration); err != nil {
 		c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()})
 		return
 	}
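The routes.go changes follow from load, New, and newLlmServer now taking *api.Options: the estimator adjusts NumCtx and NumGPU in place, so handlers pass &opts and loaded.Options keeps that same pointer rather than the address of a local copy. A minimal sketch of the value-versus-pointer difference, using a stand-in struct rather than the real api.Options:

```go
package main

import "fmt"

// options is a stand-in for the subset of api.Options the estimator adjusts.
type options struct {
	NumCtx int
	NumGPU int
}

// estimate mimics llm.New adjusting the options it is given.
func estimate(opts *options) {
	if opts.NumCtx < 4 {
		opts.NumCtx = 4
	}
	opts.NumGPU = 28 // e.g. the number of layers that fit
}

func main() {
	opts := options{NumCtx: 2, NumGPU: -1}

	estimate(&opts)                       // pointer: the caller observes the adjusted values
	fmt.Println(opts.NumCtx, opts.NumGPU) // 4 28
}
```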