Commit ab6be852 authored by Jeffrey Morgan

revisit memory allocation to account for full kv cache on main gpu

parent b24e8d17
@@ -2,7 +2,6 @@ package llm
 import (
 	"context"
-	"fmt"
 	"log"
 	"os"
 	"runtime"
@@ -41,94 +40,76 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 		opts.NumCtx = 4
 	}
 
-	fmt.Println("size", ggml.Size)
-	fmt.Println("filetype", ggml.FileType())
-	fmt.Println("architecture", ggml.ModelFamily())
-	fmt.Println("type", ggml.ModelType())
-	fmt.Println("name", ggml.Name())
-	fmt.Println("embd", ggml.NumEmbed())
-	fmt.Println("head", ggml.NumHead())
-	fmt.Println("head_kv", ggml.NumHeadKv())
-	fmt.Println("gqa", ggml.NumGQA())
-	available, _ := gpu.CheckVRAM()
-	// For now assume filesize = model size
-	// TODO: use actual model size
-	requiredModel := ggml.Size
+	vram, _ := gpu.CheckVRAM()
+	size := ggml.Size
 
 	// fp16 k,v matrices require = n_ctx * n_layer * n_embd / n_head * n_head_kv * 2 bytes each * 2 key and value
-	requiredKv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
+	kv := 2 * 2 * int64(opts.NumCtx) * int64(ggml.NumLayers()) * int64(ggml.NumEmbed()) * int64(ggml.NumHeadKv()) / int64(ggml.NumHead())
 
 	// this amount is the overhead + tensors in memory
 	// TODO: get this from the llama.cpp's graph calcluations instead of
 	// estimating it's 1/6 * kv_cache_size * num_gqa
-	requiredAlloc := int64(ggml.NumGQA()) * requiredKv / 6
-	requiredTotal := requiredModel + requiredKv + requiredAlloc
-	log.Println("system memory bytes:", available)
-	log.Println("required model bytes:", requiredModel)
-	log.Println("required kv bytes:", requiredKv)
-	log.Println("required alloc bytes:", requiredAlloc)
-	log.Println("required total bytes:", requiredTotal)
+	graph := int64(ggml.NumGQA()) * kv / 6
 
 	info := gpu.GetGPUInfo()
 	library := info.Library
 
-	if opts.NumGPU == -1 {
-		// default to offloading all layers
-		opts.NumGPU = int(ggml.NumLayers()) + 1
-	}
-	// decide how many layers to put on the GPU
-	if opts.NumGPU > 0 {
 	switch runtime.GOOS {
 	case "darwin":
-		if requiredTotal > available {
+		if opts.NumGPU == 0 {
+			break
+		}
+		if size+kv+graph > vram {
 			log.Println("not enough vram available, falling back to CPU only")
 			opts.NumGPU = 0
+			break
 		}
+		opts.NumGPU = 1
 	default:
 		if library == "cpu" || library == "default" {
-			log.Println("GPU not available, falling back to CPU")
 			opts.NumGPU = 0
 			break
 		}
 
-		// alloc buffer and kv cache is allocated as a fixed amount on the main gpu
-		// TODO: find the largest GPU and only reserve memory there
-		avgAvailable := available / int64(info.DeviceCount)
-		if requiredAlloc > avgAvailable {
-			log.Printf("not enough vram available, falling back to CPU only")
+		// don't use GPU at all if no layers are loaded
+		if opts.NumGPU == 0 {
 			library = "cpu"
-			opts.NumGPU = 0
 			break
 		}
 
-		// we don't know which GPU will be used, so estimate
-		// the scratch buffer space on all of them
-		// TODO: allocate less layers to the GPU with the scratch buffer
-		// and more to the others (based on their available memory)
-		available -= requiredAlloc * int64(info.DeviceCount)
-		// no offloading required
-		if requiredModel+requiredKv <= available {
+		// user-defined GPU count
		if opts.NumGPU != -1 {
 			break
 		}
 
-		// fill remaining vram with layers
-		log.Println("splitting", available, "of available memory bytes into layers")
-		bytesPerLayer := int64((requiredModel + requiredKv) / int64(ggml.NumLayers()))
-		log.Println("bytes per layer:", bytesPerLayer)
-		layers := available / bytesPerLayer
-		log.Println("total required with split:", requiredAlloc+(layers*bytesPerLayer))
-		if layers < int64(opts.NumGPU) {
-			opts.NumGPU = int(layers)
+		// the "main" GPU needs the most memory and determines the limit
+		// of how many layers can be loaded. It needs to fit:
+		// 1. the full compute graph allocation for all devices (graph)
+		// 2. the proportional kv cache for all devices (kv * % layers)
+		// 3. the proportional model (size * % layers / # devices)
+		// This estimates the number of layers
+		maxlayers := int64(ggml.NumLayers()) + 1
+		devices := int64(info.DeviceCount)
+		avg := vram / devices
+		layers := maxlayers * (avg - graph) / (kv + size/devices)
+		if layers > maxlayers {
+			layers = maxlayers
 		}
 
+		// 1 + 2 must fit on the main gpu
+		min := graph + kv*layers/maxlayers
+		if layers <= 0 || min > avg {
+			log.Printf("not enough vram available, falling back to CPU only")
+			library = "cpu"
+			opts.NumGPU = 0
+			break
 		}
+		opts.NumGPU = int(layers)
 	}
+	opts.NumGQA = 0
 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
 	return newLlmServer(library, model, adapters, projectors, opts)
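For reference, here is how the fp16 KV-cache and graph-overhead estimates in this change work out numerically. This is a standalone sketch rather than code from the commit; the model parameters and the 16 GiB memory budget are hypothetical 7B-class values chosen only to make the arithmetic concrete.

```go
package main

import "fmt"

func main() {
	// Hypothetical 7B-class model parameters; not read from any model file.
	var (
		numCtx    int64 = 4096     // context length (opts.NumCtx)
		numLayers int64 = 32       // n_layer
		numEmbed  int64 = 4096     // n_embd
		numHead   int64 = 32       // n_head
		numHeadKv int64 = 32       // n_head_kv (no grouped-query attention)
		numGQA          = numHead / numHeadKv
		size      int64 = 13 << 30 // ~13 GiB of fp16 weights
	)

	// fp16 k,v matrices: n_ctx * n_layer * (n_embd / n_head) * n_head_kv
	// * 2 bytes per element * 2 (one K and one V cache)
	kv := 2 * 2 * numCtx * numLayers * numEmbed * numHeadKv / numHead

	// overhead + in-memory tensors, estimated as 1/6 * kv_cache_size * num_gqa
	graph := numGQA * kv / 6

	fmt.Printf("kv cache:  %d MiB\n", kv>>20)               // 2048 MiB
	fmt.Printf("graph est: %d MiB\n", graph>>20)            // ~341 MiB
	fmt.Printf("total:     %d MiB\n", (size+kv+graph)>>20)  // ~15701 MiB

	// darwin branch: offload only if the whole thing fits in VRAM
	vram := int64(16) << 30 // hypothetical unified-memory budget
	if size+kv+graph > vram {
		fmt.Println("not enough vram available, falling back to CPU only")
	}
}
```

With these figures the KV cache is 2 GiB and the graph estimate about 341 MiB, so roughly 15.3 GiB has to fit before the darwin branch offloads anything.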
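The default (multi-GPU) branch estimates the layer split from the same quantities: the model is assumed to spread evenly across devices, while the full graph allocation plus a proportional share of the KV cache must fit on the main GPU. A sketch of that arithmetic under assumed hardware (two 8 GiB devices; the values are made up, only the variable names mirror the diff):

```go
package main

import "fmt"

func main() {
	// Illustrative numbers only: two 8 GiB GPUs and the same hypothetical
	// 7B-class estimates as above. Nothing here is measured from hardware.
	var (
		vram      int64 = 2 * 8 << 30 // total VRAM across all devices
		devices   int64 = 2           // info.DeviceCount
		size      int64 = 13 << 30    // model bytes
		kv        int64 = 2 << 30     // full fp16 kv cache bytes
		graph     int64 = kv / 6      // compute graph estimate
		maxlayers int64 = 32 + 1      // n_layer + 1
	)

	// Model splits evenly across devices; the graph and a proportional
	// share of the kv cache are charged against the per-device average.
	avg := vram / devices
	layers := maxlayers * (avg - graph) / (kv + size/devices)
	if layers > maxlayers {
		layers = maxlayers
	}

	// the graph plus the proportional kv cache must still fit on the main gpu
	min := graph + kv*layers/maxlayers
	if layers <= 0 || min > avg {
		fmt.Println("not enough vram available, falling back to CPU only")
		return
	}

	fmt.Printf("offloading %d of %d layers (main gpu reserve: %d MiB)\n",
		layers, maxlayers, min>>20) // 29 of 33 layers, ~2141 MiB reserved
}
```

With these figures the estimate lands on 29 of 33 layers, and the final check confirms that the graph plus the proportional KV cache (about 2.1 GiB) still fits within the main GPU's 8 GiB share.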