Commit 7e33a017 authored by Michael Yang

partial offloading

parent 8b2c1006
@@ -50,7 +50,7 @@ func HumanBytes(b int64) string {
 	}
 }
 
-func HumanBytes2(b int64) string {
+func HumanBytes2(b uint64) string {
 	switch {
 	case b >= MebiByte:
 		return fmt.Sprintf("%.1f MiB", float64(b)/MebiByte)
...
@@ -243,7 +243,7 @@ func getCPUMem() (memInfo, error) {
 	return ret, nil
 }
 
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -251,11 +251,11 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return avail, nil
+		return uint64(avail), nil
 	}
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		return int64(gpuInfo.FreeMemory), nil
+		return gpuInfo.FreeMemory, nil
 	}
 
 	return 0, fmt.Errorf("no GPU detected") // TODO - better handling of CPU based memory determiniation
...
@@ -17,7 +17,7 @@ import (
 )
 
 // CheckVRAM returns the free VRAM in bytes on Linux machines with NVIDIA GPUs
-func CheckVRAM() (int64, error) {
+func CheckVRAM() (uint64, error) {
 	userLimit := os.Getenv("OLLAMA_MAX_VRAM")
 	if userLimit != "" {
 		avail, err := strconv.ParseInt(userLimit, 10, 64)
@@ -25,15 +25,14 @@ func CheckVRAM() (int64, error) {
 			return 0, fmt.Errorf("Invalid OLLAMA_MAX_VRAM setting %s: %s", userLimit, err)
 		}
 		slog.Info(fmt.Sprintf("user override OLLAMA_MAX_VRAM=%d", avail))
-		return avail, nil
+		return uint64(avail), nil
 	}
 
 	if runtime.GOARCH == "amd64" {
 		// gpu not supported, this may not be metal
 		return 0, nil
 	}
-	recommendedMaxVRAM := int64(C.getRecommendedMaxVRAM())
-	return recommendedMaxVRAM, nil
+	return uint64(C.getRecommendedMaxVRAM()), nil
 }
 
 func GetGPUInfo() GpuInfo {
...
@@ -15,7 +15,7 @@ type GpuInfo struct {
 	Variant string `json:"variant,omitempty"`
 
 	// MinimumMemory represents the minimum memory required to use the GPU
-	MinimumMemory int64 `json:"-"`
+	MinimumMemory uint64 `json:"-"`
 
 	// TODO add other useful attributes about the card here for discovery information
 }
...
@@ -324,45 +324,52 @@ func DecodeGGML(rs io.ReadSeeker) (*GGML, int64, error) {
 	}, offset, nil
 }
 
-func (llm GGML) GraphSize(context, batch int) (int64, bool) {
-	embeddingLength := llm.KV().EmbeddingLength()
-	headCount := llm.KV().HeadCount()
-	headCountKV := llm.KV().HeadCountKV()
-	vocabLength := len(llm.KV()["tokenizer.ggml.tokens"].([]any))
-
-	layers := llm.Tensors().Layers()
-
-	var attnQKVWeight1 uint64 = 0
-	if t, ok := layers["0"]["attn_qkv.weight"]; ok && len(t.Shape) > 2 {
-		attnQKVWeight1 = t.Shape[1]
-	}
-
-	var ffnGate0Weight1 uint64 = 0
-	if t, ok := layers["0"]["ffn_gate.0.weight"]; ok && len(t.Shape) > 2 {
-		ffnGate0Weight1 = t.Shape[1]
-	}
+func (llm GGML) GraphSize(context, batch uint64) (partialOffload, fullOffload uint64) {
+	embedding := llm.KV().EmbeddingLength()
+	heads := llm.KV().HeadCount()
+	headsKV := llm.KV().HeadCountKV()
+	vocab := uint64(len(llm.KV()["tokenizer.ggml.tokens"].([]any)))
 
 	switch llm.KV().Architecture() {
-	case "gemma", "command-r":
-		return 4 * int64(batch) * int64(embeddingLength+uint64(vocabLength)), true
-	case "phi2":
-		return max(
-			4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-			4*int64(batch)*int64(1+4*embeddingLength+uint64(context)+attnQKVWeight1+uint64(context)*headCount),
-		), true
-	case "qwen2":
-		return max(
-			4*int64(batch)*int64(embeddingLength+uint64(vocabLength)),
-			4*int64(batch)*int64(1+2*embeddingLength+uint64(context)+uint64(context)*headCount),
-		), true
 	case "llama":
-		if ffnGate0Weight1 > 0 {
-			// moe
-			return 4 * int64(batch) * int64(2+3*embeddingLength+uint64(context)+uint64(context)*headCount+2*headCountKV+ffnGate0Weight1), true
-		}
-
-		return 4 * int64(batch) * int64(1+4*embeddingLength+uint64(context)+uint64(context)*headCount), true
+		fullOffload = 4 * batch * (1 + 4*embedding + context*(1+heads))
+
+		partialOffload = 4 * batch * embedding
+		partialOffload += max(
+			4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+		)
+	case "gemma":
+		fullOffload = 4 * batch * (embedding + vocab)
+		partialOffload = 4*batch*(2*embedding+vocab+1) + embedding*vocab*105/128
+	case "command-r":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(2+4*embedding+context*(1+heads)),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*batch*(1+2*embedding+context*(1+heads))+4*embedding*context+embedding*embedding*9/16,
+		)
+	case "qwen2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+2*embedding+context+context*heads),
+		)
+
+		partialOffload = max(
+			4*batch*(embedding+vocab)+embedding*vocab*105/128,
+			4*(batch*(1+2*embedding+context*(1+heads))+embedding*(1+context)),
+		)
+	case "phi2":
+		fullOffload = max(
+			4*batch*(embedding+vocab),
+			4*batch*(1+4*embedding+context+context*heads),
+		)
+
+		partialOffload = 4*batch*(2*embedding+vocab) + embedding*vocab*105/128
 	}
 
-	return 0, false
+	return
 }
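The new return values are heuristic byte estimates of the compute-graph scratch buffer: fullOffload when every layer runs on the GPU, partialOffload when only some do. A standalone sketch of the "llama" branch with hypothetical 7B-style metadata (embedding 4096, 32 heads, 32 KV heads, 32000-token vocab, context 2048, batch 512 — illustrative numbers, not taken from this commit):

package main

import "fmt"

// Hypothetical model metadata; in ollama the real values come from the GGUF KV store.
const (
	embedding uint64 = 4096
	heads     uint64 = 32
	headsKV   uint64 = 32
	vocab     uint64 = 32000
	context   uint64 = 2048
	batch     uint64 = 512
)

// max mirrors the builtin used in the diff, for older Go toolchains.
func max(a, b uint64) uint64 {
	if a > b {
		return a
	}
	return b
}

func main() {
	// Same arithmetic as the "llama" case above.
	fullOffload := 4 * batch * (1 + 4*embedding + context*(1+heads))

	partialOffload := 4 * batch * embedding
	partialOffload += max(
		4*batch*(1+embedding+max(context, embedding))+embedding*embedding*9/16+4*context*(batch*heads+embedding/heads*headsKV),
		4*batch*(embedding+vocab)+embedding*vocab*105/128,
	)

	fmt.Printf("fullOffload    ~ %.1f MiB\n", float64(fullOffload)/(1<<20))
	fmt.Printf("partialOffload ~ %.1f MiB\n", float64(partialOffload)/(1<<20))
}

For these illustrative numbers the full-offload estimate works out to roughly 164 MiB; the partial-offload estimate is larger because it also covers intermediate tensors that must round-trip between CPU and GPU.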
@@ -41,10 +41,6 @@ var cpuOnlyFamilies = []string{
 }
 
 func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
-	if _, err := os.Stat(model); err != nil {
-		return nil, err
-	}
-
 	f, err := os.Open(model)
 	if err != nil {
 		return nil, err
@@ -65,12 +61,12 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
 		opts.NumCtx = 4
 	}
 
-	availableMemory, _ := gpu.CheckVRAM()
+	memoryAvailable, _ := gpu.CheckVRAM()
 	info := gpu.GetGPUInfo()
 
-	usedMemory := info.MinimumMemory
+	memoryMinimum := info.MinimumMemory
 	for _, projector := range projectors {
-		usedMemory += projectorMemoryRequirements(projector)
+		memoryMinimum += projectorMemoryRequirements(projector)
 
 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
@@ -79,55 +75,65 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
 	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
 	var kv uint64 = 2 * 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * ggml.KV().EmbeddingLength() / ggml.KV().HeadCount() * ggml.KV().HeadCountKV()
 
-	graph, ok := ggml.GraphSize(opts.NumCtx, min(opts.NumCtx, opts.NumBatch))
-	if !ok {
-		graph = int64(ggml.KV().GQA()*kv) / 6
+	graphPartialOffload, graphFullOffload := ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
+	if graphPartialOffload == 0 {
+		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}
 
-	usedMemory += graph
-
-	if (usedMemory > availableMemory || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture())) && info.Library != "metal" {
-		info.Library = "cpu"
+	if graphFullOffload == 0 {
+		graphFullOffload = graphPartialOffload
 	}
 
-	requiredMemory := usedMemory
+	// memoryRequiredTotal represents the memory required for full GPU offloading (all layers)
+	memoryRequiredTotal := memoryMinimum + graphFullOffload
+
+	// memoryRequiredPartial represents the memory required for partial GPU offloading (n > 0, n < layers)
+	memoryRequiredPartial := memoryMinimum + graphPartialOffload
 
-	tensorLayers := ggml.Tensors().Layers()
+	if info.Library != "metal" {
+		if memoryRequiredPartial > memoryAvailable || slices.Contains(cpuOnlyFamilies, ggml.KV().Architecture()) {
+			info.Library = "cpu"
+		}
+	}
 
-	var layers int
+	var layerCount int
+	layers := ggml.Tensors().Layers()
 	for i := 0; i < int(ggml.KV().BlockCount()); i++ {
-		layerMemory := int64(tensorLayers[fmt.Sprintf("%d", i)].size() + kv/ggml.KV().BlockCount())
-		requiredMemory += layerMemory
+		memoryLayer := layers[fmt.Sprintf("%d", i)].size()
+
+		// KV is proportional to the number of layers
+		memoryLayer += kv / ggml.KV().BlockCount()
 
-		if availableMemory > usedMemory+layerMemory && (opts.NumGPU < 0 || layers < opts.NumGPU) {
-			usedMemory += layerMemory
-			layers++
+		memoryRequiredTotal += memoryLayer
+		if memoryAvailable > memoryRequiredPartial+memoryLayer {
+			memoryRequiredPartial += memoryLayer
+			layerCount++
 		}
 	}
 
-	memOutputLayer := int64(tensorLayers["output"].size())
-	requiredMemory += memOutputLayer
-
-	// only offload output layer if all repeating layers are offloaded
-	if layers >= int(ggml.KV().BlockCount()) && availableMemory > usedMemory+memOutputLayer {
-		usedMemory += memOutputLayer
-		layers++
+	memoryLayerOutput := layers["output"].size()
+	memoryRequiredTotal += memoryLayerOutput
+	if memoryAvailable > memoryRequiredTotal {
+		layerCount = int(ggml.KV().BlockCount()) + 1
+		memoryRequiredPartial = memoryRequiredTotal
+	}
+
+	if opts.NumGPU < 0 {
+		opts.NumGPU = layerCount
 	}
 
 	slog.Info(
 		"offload to gpu",
-		"layers", layers,
-		"required", format.HumanBytes2(requiredMemory),
-		"used", format.HumanBytes2(usedMemory),
-		"available", format.HumanBytes2(availableMemory),
-		"kv", format.HumanBytes2(int64(kv)),
-		"graph", format.HumanBytes2(graph),
+		"reallayers", opts.NumGPU,
+		"layers", layerCount,
+		"required", format.HumanBytes2(memoryRequiredTotal),
+		"used", format.HumanBytes2(memoryRequiredPartial),
+		"available", format.HumanBytes2(memoryAvailable),
+		"kv", format.HumanBytes2(kv),
+		"fulloffload", format.HumanBytes2(graphFullOffload),
+		"partialoffload", format.HumanBytes2(graphPartialOffload),
 	)
 
-	if opts.NumGPU < 0 && info.Library != "cpu" {
-		opts.NumGPU = layers
-	}
-
 	if len(adapters) > 1 {
 		return nil, errors.New("ollama supports only one lora adapter, but multiple were provided")
 	}
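The kv term that the loop spreads across layers is the fp16 KV-cache size described by the comment in the hunk: 2 (k and v) x 2 bytes x n_ctx x n_layer x n_embd / n_head x n_head_kv. A quick standalone check of that arithmetic with hypothetical 7B-style numbers (n_ctx 2048, n_layer 32, n_embd 4096, n_head 32, n_head_kv 32 — illustrative, not taken from this commit):

package main

import "fmt"

func main() {
	// Hypothetical model metadata; in ollama the real values come from the GGUF KV store.
	var (
		nCtx    uint64 = 2048
		nLayer  uint64 = 32
		nEmbd   uint64 = 4096
		nHead   uint64 = 32
		nHeadKV uint64 = 32
	)

	// fp16 k,v = (1 (k) + 1 (v)) * sizeof(float16) * n_ctx * n_layer * n_embd / n_head * n_head_kv
	kv := 2 * 2 * nCtx * nLayer * nEmbd / nHead * nHeadKV

	fmt.Printf("kv cache ~ %.2f GiB\n", float64(kv)/(1<<30)) // ~1.00 GiB for these numbers
}

Dividing kv by the block count, as the loop does, charges each repeating layer its share of the cache before deciding whether it still fits in the remaining VRAM.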
@@ -284,7 +290,7 @@ func NewLlamaServer(model string, adapters, projectors []string, opts api.Options) (*LlamaServer, error) {
 	return nil, finalErr
 }
 
-func projectorMemoryRequirements(filename string) int64 {
+func projectorMemoryRequirements(filename string) uint64 {
 	file, err := os.Open(filename)
 	if err != nil {
 		return 0
@@ -301,7 +307,7 @@ func projectorMemoryRequirements(filename string) int64 {
 		mem += layer.size()
 	}
 
-	return int64(mem)
+	return mem
 }
 
 type ServerStatus int
...