Commit f387e963 authored by Jeffrey Morgan

use runner if cuda alloc won't fit

parent 6566387a
@@ -100,10 +100,8 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 			break
 		}
 
-		// This handles two cases:
-		// 1. overhead + tensors are always loaded into scratch memory even with num_gpu 0
-		// 2. it seems llama.cpp always tries to allocate the entire kv cache (even if later split into layers) into vram or crashes
-		if requiredAlloc > available || requiredKv > available {
+		// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
+		if requiredAlloc > available {
 			log.Printf("not enough vram available, falling back to CPU only")
 			library = "cpu"
 			opts.NumGPU = 0
@@ -127,8 +125,7 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 	opts.NumGQA = 0
 	opts.RopeFrequencyBase = 0.0
 	opts.RopeFrequencyScale = 0.0
-	gpuInfo := gpu.GetGPUInfo()
-	return newLlmServer(gpuInfo.Library, model, adapters, projectors, opts)
+	return newLlmServer(library, model, adapters, projectors, opts)
 }
 
 	// Give any native cgo implementations an opportunity to initialize
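The control flow is easy to misread in diff form, so here is a minimal, self-contained Go sketch of the fallback this commit implements: if the allocation the CUDA runner always needs (requiredAlloc) exceeds the VRAM reported as available, switch the runner library to "cpu" and zero NumGPU instead of letting the load fail. The Options type and the pickLibrary helper below are hypothetical stand-ins for illustration, not ollama's actual definitions.

package main

import "log"

// Options mirrors only the field the diff touches; a stand-in for
// illustration, not ollama's api.Options.
type Options struct {
	NumGPU int
}

// pickLibrary sketches the check in this commit: fall back to the CPU
// runner when the CUDA runner's mandatory allocation won't fit in VRAM.
func pickLibrary(requiredAlloc, available uint64, library string, opts *Options) string {
	if requiredAlloc > available {
		log.Printf("not enough vram available, falling back to CPU only")
		opts.NumGPU = 0
		return "cpu"
	}
	return library
}

func main() {
	opts := &Options{NumGPU: 999}
	// 8 GiB required, 6 GiB free: expect the "cpu" library with NumGPU = 0.
	lib := pickLibrary(8<<30, 6<<30, "cuda", opts)
	log.Printf("selected runner library: %s (NumGPU=%d)", lib, opts.NumGPU)
}

The second hunk follows the same idea: the already-decided library value is threaded through to newLlmServer rather than re-querying gpu.GetGPUInfo(), so a fresh GPU probe cannot overwrite the fallback decision made above.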