Commit d5a0d8d9 authored by Jesse Gross, committed by Jesse Gross

llm: New memory management

This changes the memory allocation strategy from upfront estimation to
tracking actual allocations done by the engine and reacting to that. The
goal is to avoid issues caused by both under-estimation (crashing) and
over-estimation (low performance due to under-utilized GPUs).

It is currently opt-in and can be enabled for models running on the
Ollama engine by setting OLLAMA_NEW_ESTIMATES=1. Behavior in other
cases is unchanged and will continue to use the existing estimates.
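
For illustration, a minimal and hypothetical sketch of the opt-in gate described above; the actual flag handling lives in Ollama's envconfig/server code and may differ:

package main

import (
	"fmt"
	"os"
)

// useNewEstimates reports whether the new allocation-tracking path is enabled
// via the OLLAMA_NEW_ESTIMATES=1 opt-in described in the commit message.
func useNewEstimates() bool {
	return os.Getenv("OLLAMA_NEW_ESTIMATES") == "1"
}

func main() {
	if useNewEstimates() {
		fmt.Println("memory: tracking actual engine allocations")
	} else {
		fmt.Println("memory: using existing upfront estimates")
	}
}
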
parent ef7d26ba
...@@ -14,6 +14,7 @@ import ( ...@@ -14,6 +14,7 @@ import (
"net" "net"
"net/http" "net/http"
"os" "os"
"reflect"
"regexp" "regexp"
"runtime" "runtime"
"strconv" "strconv"
...@@ -259,6 +260,16 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [ ...@@ -259,6 +260,16 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
} }
type Server struct { type Server struct {
// modelPath is the location of the model to be loaded
modelPath string
// loadMu prevents more than one load attempt from occurring at a time
loadMu sync.Mutex
// lastLoad is the load request from the previous load attempt. Used to
// detect if we can reuse an existing memory allocation.
lastLoad llm.LoadRequest
// is the server ready to process requests? // is the server ready to process requests?
// protects access to model and image // protects access to model and image
ready sync.WaitGroup ready sync.WaitGroup
...@@ -720,17 +731,6 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) { ...@@ -720,17 +731,6 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
} }
} }
type multiLPath []string
func (m *multiLPath) Set(value string) error {
*m = append(*m, value)
return nil
}
func (m *multiLPath) String() string {
return strings.Join(*m, ", ")
}
func (s *Server) reserveWorstCaseGraph() error { func (s *Server) reserveWorstCaseGraph() error {
ctx := s.model.Backend().NewContext() ctx := s.model.Backend().NewContext()
defer ctx.Close() defer ctx.Close()
...@@ -828,15 +828,28 @@ func (s *Server) reserveWorstCaseGraph() error { ...@@ -828,15 +828,28 @@ func (s *Server) reserveWorstCaseGraph() error {
return nil return nil
} }
func (s *Server) initModel( // allocModel pre-allocates the maximum needed memory for a model
// based on the given parameters
func (s *Server) allocModel(
mpath string, mpath string,
params ml.BackendParams, params ml.BackendParams,
lpath multiLPath, loraPath []string,
parallel int, parallel int,
kvCacheType string, kvCacheType string,
kvSize int, kvSize int,
multiUserCache bool, multiUserCache bool,
) error { ) (panicErr error) {
// Convert memory allocation panics to errors
defer func() {
if r := recover(); r != nil {
if err, ok := r.(error); ok {
panicErr = err
} else {
panic(r)
}
}
}()
var err error var err error
s.model, err = model.New(mpath, params) s.model, err = model.New(mpath, params)
if err != nil { if err != nil {
...@@ -844,7 +857,7 @@ func (s *Server) initModel( ...@@ -844,7 +857,7 @@ func (s *Server) initModel(
} }
// TODO(jessegross): LoRA loading // TODO(jessegross): LoRA loading
if lpath.String() != "" { if len(loraPath) > 0 {
return errors.New("loras are not yet implemented") return errors.New("loras are not yet implemented")
} }
...@@ -865,63 +878,122 @@ func (s *Server) initModel( ...@@ -865,63 +878,122 @@ func (s *Server) initModel(
return s.reserveWorstCaseGraph() return s.reserveWorstCaseGraph()
} }
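
As an aside, the recover-to-error pattern that allocModel uses above can be shown in isolation. This is a minimal standalone sketch of the idiom, not Ollama code: a named return value captures panics that carry an error, while any other panic value is re-raised.

package main

import (
	"errors"
	"fmt"
)

// allocate converts a panic carrying an error into a normal error return,
// mirroring the defer/recover block in allocModel above. Panics that do not
// wrap an error are re-raised untouched.
func allocate(fail bool) (err error) {
	defer func() {
		if r := recover(); r != nil {
			if e, ok := r.(error); ok {
				err = e
			} else {
				panic(r)
			}
		}
	}()

	if fail {
		panic(errors.New("out of memory")) // stand-in for a backend allocation failure
	}
	return nil
}

func main() {
	fmt.Println(allocate(true))  // prints the recovered error
	fmt.Println(allocate(false)) // prints <nil>
}
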
func (s *Server) load( // closeModel frees all memory associated with a model
ctx context.Context, func (s *Server) closeModel() {
mpath string,
params ml.BackendParams,
lpath multiLPath,
parallel int,
kvCacheType string,
kvSize int,
multiUserCache bool,
) {
err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
if err != nil {
var noMem ml.ErrNoMem
if errors.As(err, &noMem) {
// We can't yet handle this but in the future we will
s.cache.Close() s.cache.Close()
s.cache = nil
if s.model != nil { if s.model != nil {
s.model.Backend().Close() s.model.Backend().Close()
s.model = nil
} }
} }
panic(err)
}
slog.Debug("memory", "allocated", s.model.Backend().BackendMemory())
err = s.model.Backend().Load(ctx, // loadModel loads the weights for a model. The memory must already
// have been allocated with allocModel
func (s *Server) loadModel() {
err := s.model.Backend().Load(context.TODO(),
func(progress float32) { func(progress float32) {
s.progress = progress s.progress = progress
}) })
if err != nil { if err != nil {
panic(err) panic(fmt.Errorf("failed to load model: %v", err))
} }
s.status = llm.ServerStatusReady s.status = llm.ServerStatusReady
s.ready.Done() s.ready.Done()
} }
// load is the handler called by the Ollama server to process different
// load operations
func (s *Server) load(w http.ResponseWriter, r *http.Request) {
s.loadMu.Lock()
defer s.loadMu.Unlock()
w.Header().Set("Content-Type", "application/json")
if s.status != llm.ServerStatusLaunched {
http.Error(w, "model already loaded", http.StatusInternalServerError)
return
}
var req llm.LoadRequest
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
http.Error(w, "bad request", http.StatusBadRequest)
return
}
slog.Info("load", "request", req)
if req.Operation == llm.LoadOperationClose {
s.closeModel()
if err := json.NewEncoder(w).Encode(&llm.LoadResponse{}); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}
return
}
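// Carry the new operation into lastLoad before comparing, so a request that
// differs only in Operation (e.g. fit followed by commit) can reuse the
// existing allocation rather than forcing a fresh one.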
s.lastLoad.Operation = req.Operation
loadModel := s.model == nil || !reflect.DeepEqual(req, s.lastLoad)
s.lastLoad = req
if loadModel {
s.closeModel()
params := ml.BackendParams{
AllocMemory: req.Operation != llm.LoadOperationFit,
NumThreads: req.NumThreads,
GPULayers: req.GPULayers,
FlashAttention: req.FlashAttention,
}
s.batchSize = req.BatchSize
err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
if err != nil {
s.closeModel()
var noMem ml.ErrNoMem
if errors.As(err, &noMem) {
resp := llm.LoadResponse{Success: false, Memory: noMem.BackendMemory}
if err := json.NewEncoder(w).Encode(&resp); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
}
return
}
http.Error(w, fmt.Sprintf("failed to initialize model: %v", err), http.StatusInternalServerError)
return
}
}
mem := s.model.Backend().BackendMemory()
switch req.Operation {
case llm.LoadOperationFit:
// LoadOperationFit can't be used for anything else, so just close it
s.closeModel()
// LoadOperationAlloc should stay open for future operations
case llm.LoadOperationCommit:
s.status = llm.ServerStatusLoadingModel
go s.loadModel()
}
resp := llm.LoadResponse{Success: true, Memory: mem}
if err := json.NewEncoder(w).Encode(&resp); err != nil {
http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
return
}
}
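
For context, here is a hedged client-side sketch of the /load protocol implemented by the handler above. The request and response shapes are assumptions mirroring only the fields visible in this diff (Operation, KvSize, BatchSize, Parallel, Success); the canonical types and the wire encoding of LoadOperation live in Ollama's llm package and may differ.

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

// loadRequest mirrors a subset of the fields used by the handler above; the
// real llm.LoadRequest has more fields and may encode Operation differently.
type loadRequest struct {
	Operation string
	KvSize    int
	BatchSize int
	Parallel  int
}

// loadResponse mirrors llm.LoadResponse as far as this diff shows it; the
// Memory report is omitted from this sketch.
type loadResponse struct {
	Success bool
}

// probeFit asks a runner whether a proposed configuration fits without
// committing any memory, in the spirit of LoadOperationFit above.
func probeFit(addr string, req loadRequest) (bool, error) {
	body, err := json.Marshal(req)
	if err != nil {
		return false, err
	}
	resp, err := http.Post("http://"+addr+"/load", "application/json", bytes.NewReader(body))
	if err != nil {
		return false, err
	}
	defer resp.Body.Close()
	var lr loadResponse
	if err := json.NewDecoder(resp.Body).Decode(&lr); err != nil {
		return false, err
	}
	return lr.Success, nil
}

func main() {
	ok, err := probeFit("127.0.0.1:8080", loadRequest{Operation: "fit", KvSize: 2048, Parallel: 1})
	fmt.Println(ok, err)
}
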
func Execute(args []string) error { func Execute(args []string) error {
fs := flag.NewFlagSet("runner", flag.ExitOnError) fs := flag.NewFlagSet("runner", flag.ExitOnError)
mpath := fs.String("model", "", "Path to model binary file") mpath := fs.String("model", "", "Path to model binary file")
parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
batchSize := fs.Int("batch-size", 512, "Batch size")
numGPULayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
mainGPU := fs.Int("main-gpu", 0, "Main GPU")
flashAttention := fs.Bool("flash-attn", false, "Enable flash attention")
kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
port := fs.Int("port", 8080, "Port to expose the server on") port := fs.Int("port", 8080, "Port to expose the server on")
threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
_ = fs.Bool("verbose", false, "verbose output (default: disabled)") _ = fs.Bool("verbose", false, "verbose output (default: disabled)")
_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
var lpaths multiLPath
fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
fs.Usage = func() { fs.Usage = func() {
fmt.Fprintf(fs.Output(), "Runner usage\n") fmt.Fprintf(fs.Output(), "Runner usage\n")
...@@ -933,39 +1005,17 @@ func Execute(args []string) error { ...@@ -933,39 +1005,17 @@ func Execute(args []string) error {
slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel())) slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
slog.Info("starting ollama engine") slog.Info("starting ollama engine")
server := &Server{
batchSize: *batchSize,
status: llm.ServerStatusLoadingModel,
}
server.cond = sync.NewCond(&server.mu)
server.ready.Add(1)
ctx, cancel := context.WithCancel(context.Background()) ctx, cancel := context.WithCancel(context.Background())
defer cancel() defer cancel()
// TODO(jessegross): Parameters that need to be implemented: server := &Server{
// no-mmap modelPath: *mpath,
status: llm.ServerStatusLaunched,
var tensorSplitFloats []float32
if *tensorSplit != "" {
splits := strings.Split(*tensorSplit, ",")
tensorSplitFloats = make([]float32, len(splits))
for i, s := range splits {
f, _ := strconv.ParseFloat(s, 32)
tensorSplitFloats[i] = float32(f)
}
} }
params := ml.BackendParams{ server.cond = sync.NewCond(&server.mu)
NumThreads: *threads, server.ready.Add(1)
NumGPULayers: *numGPULayers,
MainGPU: *mainGPU,
TensorSplit: tensorSplitFloats,
FlashAttention: *flashAttention,
}
go server.load(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
go server.run(ctx) go server.run(ctx)
addr := "127.0.0.1:" + strconv.Itoa(*port) addr := "127.0.0.1:" + strconv.Itoa(*port)
...@@ -978,6 +1028,7 @@ func Execute(args []string) error { ...@@ -978,6 +1028,7 @@ func Execute(args []string) error {
mux := http.NewServeMux() mux := http.NewServeMux()
// TODO: support embeddings // TODO: support embeddings
mux.HandleFunc("POST /load", server.load)
mux.HandleFunc("POST /embedding", func(w http.ResponseWriter, r *http.Request) { mux.HandleFunc("POST /embedding", func(w http.ResponseWriter, r *http.Request) {
http.Error(w, "this model does not support embeddings", http.StatusNotImplemented) http.Error(w, "this model does not support embeddings", http.StatusNotImplemented)
}) })
......
...@@ -1477,14 +1477,14 @@ func (s *Server) PsHandler(c *gin.Context) { ...@@ -1477,14 +1477,14 @@ func (s *Server) PsHandler(c *gin.Context) {
mr := api.ProcessModelResponse{ mr := api.ProcessModelResponse{
Model: model.ShortName, Model: model.ShortName,
Name: model.ShortName, Name: model.ShortName,
Size: int64(v.estimatedTotal), Size: int64(v.totalSize),
SizeVRAM: int64(v.estimatedVRAM), SizeVRAM: int64(v.vramSize),
Digest: model.Digest, Digest: model.Digest,
Details: modelDetails, Details: modelDetails,
ExpiresAt: v.expiresAt, ExpiresAt: v.expiresAt,
} }
if v.Options != nil { if v.Options != nil {
mr.ContextLength = v.Options.NumCtx / v.numParallel mr.ContextLength = v.Options.NumCtx
} }
// The scheduler waits to set expiresAt, so if a model is loading it's // The scheduler waits to set expiresAt, so if a model is loading it's
// possible that it will be set to the unix epoch. For those cases, just // possible that it will be set to the unix epoch. For those cases, just
......
...@@ -77,12 +77,13 @@ func TestGenerateChat(t *testing.T) { ...@@ -77,12 +77,13 @@ func TestGenerateChat(t *testing.T) {
getGpuFn: discover.GetGPUInfo, getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo, getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond, reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
// add small delay to simulate loading // add small delay to simulate loading
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
llama: &mock, llama: &mock,
} }
return false
}, },
}, },
} }
...@@ -620,12 +621,13 @@ func TestGenerate(t *testing.T) { ...@@ -620,12 +621,13 @@ func TestGenerate(t *testing.T) {
getGpuFn: discover.GetGPUInfo, getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo, getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond, reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
// add small delay to simulate loading // add small delay to simulate loading
time.Sleep(time.Millisecond) time.Sleep(time.Millisecond)
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
llama: &mock, llama: &mock,
} }
return false
}, },
}, },
} }
......
...@@ -277,10 +277,11 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) { ...@@ -277,10 +277,11 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
getGpuFn: discover.GetGPUInfo, getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo, getCpuFn: discover.GetCPUInfo,
reschedDelay: 100 * time.Millisecond, reschedDelay: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
llama: &mock, llama: &mock,
} }
return false
}, },
}, },
} }
...@@ -427,10 +428,11 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) { ...@@ -427,10 +428,11 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
getGpuFn: discover.GetGPUInfo, getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo, getCpuFn: discover.GetCPUInfo,
reschedDelay: 100 * time.Millisecond, reschedDelay: 100 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
llama: &mock, llama: &mock,
} }
return false
}, },
}, },
} }
...@@ -608,10 +610,11 @@ func TestChatHarmonyParserStreaming(t *testing.T) { ...@@ -608,10 +610,11 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
getGpuFn: discover.GetGPUInfo, getGpuFn: discover.GetGPUInfo,
getCpuFn: discover.GetCPUInfo, getCpuFn: discover.GetCPUInfo,
reschedDelay: 250 * time.Millisecond, reschedDelay: 250 * time.Millisecond,
loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) { loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
req.successCh <- &runnerRef{ req.successCh <- &runnerRef{
llama: &mock, llama: &mock,
} }
return false
}, },
}, },
} }
......
...@@ -28,7 +28,6 @@ type LlmRequest struct { ...@@ -28,7 +28,6 @@ type LlmRequest struct {
ctx context.Context //nolint:containedctx ctx context.Context //nolint:containedctx
model *Model model *Model
opts api.Options opts api.Options
origNumCtx int // Track the initial ctx request
sessionDuration *api.Duration sessionDuration *api.Duration
successCh chan *runnerRef successCh chan *runnerRef
errCh chan error errCh chan error
...@@ -41,10 +40,17 @@ type Scheduler struct { ...@@ -41,10 +40,17 @@ type Scheduler struct {
expiredCh chan *runnerRef expiredCh chan *runnerRef
unloadedCh chan any unloadedCh chan any
loaded map[string]*runnerRef // loadedMu protects loaded and activeLoading
loadedMu sync.Mutex loadedMu sync.Mutex
loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) // activeLoading is the model that we are currently working on loading,
// including by evicting one or more other models. We can only load
// one model at a time but new requests to models that are already loaded can
// happen in parallel
activeLoading llm.LlamaServer
loaded map[string]*runnerRef
loadFn func(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool
newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) newServerFn func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error)
getGpuFn func() discover.GpuInfoList getGpuFn func() discover.GpuInfoList
getCpuFn func() discover.GpuInfoList getCpuFn func() discover.GpuInfoList
...@@ -56,9 +62,6 @@ type Scheduler struct { ...@@ -56,9 +62,6 @@ type Scheduler struct {
// on a large GPU can cause stalling // on a large GPU can cause stalling
var defaultModelsPerGPU = 3 var defaultModelsPerGPU = 3
// Default automatic value for parallel setting
var defaultParallel = 1
var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded") var ErrMaxQueue = errors.New("server busy, please try again. maximum pending requests exceeded")
func InitScheduler(ctx context.Context) *Scheduler { func InitScheduler(ctx context.Context) *Scheduler {
...@@ -79,25 +82,37 @@ func InitScheduler(ctx context.Context) *Scheduler { ...@@ -79,25 +82,37 @@ func InitScheduler(ctx context.Context) *Scheduler {
} }
// context must be canceled to decrement ref count and release the runner // context must be canceled to decrement ref count and release the runner
func (s *Scheduler) GetRunner(c context.Context, model *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) { func (s *Scheduler) GetRunner(c context.Context, m *Model, opts api.Options, sessionDuration *api.Duration) (chan *runnerRef, chan error) {
if opts.NumCtx < 4 { if opts.NumCtx < 4 {
opts.NumCtx = 4 opts.NumCtx = 4
} }
if m.CheckCapabilities(model.CapabilityVision) == nil {
// multimodal models require at least 2048 context
opts.NumCtx = max(opts.NumCtx, 2048)
}
req := &LlmRequest{ req := &LlmRequest{
ctx: c, ctx: c,
model: model, model: m,
opts: opts, opts: opts,
sessionDuration: sessionDuration, sessionDuration: sessionDuration,
successCh: make(chan *runnerRef), successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1), errCh: make(chan error, 1),
} }
s.loadedMu.Lock()
runner := s.loaded[req.model.ModelPath]
s.loadedMu.Unlock()
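// Fast path: if a matching runner is already loaded, reuse it directly
// instead of queueing the request for the scheduler loop.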
if runner != nil && !runner.needsReload(c, req) {
req.useLoadedRunner(runner, s.finishedReqCh)
} else {
select { select {
case s.pendingReqCh <- req: case s.pendingReqCh <- req:
default: default:
req.errCh <- ErrMaxQueue req.errCh <- ErrMaxQueue
} }
}
return req.successCh, req.errCh return req.successCh, req.errCh
} }
...@@ -122,21 +137,11 @@ func (s *Scheduler) processPending(ctx context.Context) { ...@@ -122,21 +137,11 @@ func (s *Scheduler) processPending(ctx context.Context) {
case pending := <-s.pendingReqCh: case pending := <-s.pendingReqCh:
// Block other requests until we get this pending request running // Block other requests until we get this pending request running
pending.schedAttempts++ pending.schedAttempts++
if pending.origNumCtx == 0 {
pending.origNumCtx = pending.opts.NumCtx
}
if pending.ctx.Err() != nil { if pending.ctx.Err() != nil {
slog.Debug("pending request cancelled or timed out, skipping scheduling") slog.Debug("pending request cancelled or timed out, skipping scheduling")
continue continue
} }
numParallel := int(envconfig.NumParallel())
// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
// ref: https://github.com/ollama/ollama/issues/4165
if slices.Contains(pending.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
numParallel = 1
slog.Warn("mllama does not currently support parallel requests")
}
for { for {
var runnerToExpire *runnerRef var runnerToExpire *runnerRef
...@@ -195,85 +200,27 @@ func (s *Scheduler) processPending(ctx context.Context) { ...@@ -195,85 +200,27 @@ func (s *Scheduler) processPending(ctx context.Context) {
break break
} }
// Embedding models should always be loaded with parallel=1 // Update free memory from currently loaded models
if pending.model.CheckCapabilities(model.CapabilityCompletion) != nil { s.updateFreeSpace(gpus)
numParallel = 1
}
// Evaluate if the model will fit in the available system memory, or if we should unload a model first
if len(gpus) == 1 && gpus[0].Library == "cpu" {
// simplifying assumption of defaultParallel when in CPU mode
if numParallel <= 0 {
numParallel = defaultParallel
}
pending.opts.NumCtx = pending.origNumCtx * numParallel
if loadedCount == 0 { if loadedCount == 0 {
slog.Debug("cpu mode with first model, loading")
s.loadFn(pending, ggml, gpus, numParallel)
break
}
runnerToExpire = s.maybeFindCPURunnerToUnload(pending, ggml, gpus)
if runnerToExpire == nil {
slog.Debug("cpu mode with available system memory or first model, loading")
s.loadFn(pending, ggml, gpus, numParallel)
break
}
// else we need to expire a runner
} else if loadedCount == 0 {
// No models loaded. Load the model but prefer the best fit. // No models loaded. Load the model but prefer the best fit.
slog.Debug("loading first model", "model", pending.model.ModelPath) slog.Debug("loading first model", "model", pending.model.ModelPath)
g := pickBestFullFitByLibrary(pending, ggml, gpus, &numParallel) s.loadFn(pending, ggml, gpus, false)
if g != nil {
gpus = g
} else {
// Only allow partial loads when this is the first model
gpus = pickBestPartialFitByLibrary(pending, ggml, gpus, &numParallel)
}
s.loadFn(pending, ggml, gpus, numParallel)
break break
} }
if runnerToExpire == nil {
// More than one loaded model, so we have to see if the // More than one loaded model, so we have to see if the
// new one fits // new one fits
//
// We want to avoid loading on any GPUs that have other
// models still loading on them to avoid potential races
// with VRAM consumption ramping up during load
availGpus := s.filterGPUsWithoutLoadingModels(gpus)
// Update free memory from currently loaded models needEvict := s.loadFn(pending, ggml, gpus, true)
s.updateFreeSpace(availGpus) if !needEvict {
fitGpus := pickBestFullFitByLibrary(pending, ggml, availGpus, &numParallel)
if fitGpus != nil {
slog.Debug("new model fits with existing models, loading") slog.Debug("new model fits with existing models, loading")
s.loadFn(pending, ggml, fitGpus, numParallel)
break break
} }
// We couldn't find a set of GPUs to fully load the new
// model. If no other models are loading (both GPU lists
// are the same) then we need to unload another model to
// make room
if len(availGpus) < len(gpus) {
// There are other requests pending, and this one
// needs more time, so put it on the back of the
// queue so that we might satisfy other pending
// requests that aren't blocked
go func() {
// Process in a go routine to avoid deadlocking
// the scheduler if our queue is full
slog.Debug("delaying scheduling while other models finish loading", "attempts", pending.schedAttempts, "model", pending.model.ModelPath)
time.Sleep(s.reschedDelay)
s.pendingReqCh <- pending
}()
break
}
runnerToExpire = s.findRunnerToUnload() runnerToExpire = s.findRunnerToUnload()
} }
}
if runnerToExpire == nil { if runnerToExpire == nil {
// Shouldn't happen
...@@ -293,8 +240,6 @@ func (s *Scheduler) processPending(ctx context.Context) { ...@@ -293,8 +240,6 @@ func (s *Scheduler) processPending(ctx context.Context) {
} }
runnerToExpire.refMu.Unlock() runnerToExpire.refMu.Unlock()
// Wait for the unload to happen // Wait for the unload to happen
// Note: at this point we're queueing up all incoming requests, even if they were for
// a different model that's loaded and not scheduled to be removed.
slog.Debug("waiting for pending requests to complete and unload to occur", "runner", runnerToExpire) slog.Debug("waiting for pending requests to complete and unload to occur", "runner", runnerToExpire)
select { select {
case <-ctx.Done(): case <-ctx.Done():
...@@ -434,15 +379,37 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm ...@@ -434,15 +379,37 @@ func (pending *LlmRequest) useLoadedRunner(runner *runnerRef, finished chan *Llm
}() }()
} }
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel int) { // load creates a new model based on req and loads it. If requireFull is true then the model must be loaded fully onto GPUs
// (if any). Returns whether the scheduler needs to evict a model to make this one fit.
func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, requireFull bool) bool {
numParallel := int(envconfig.NumParallel())
if numParallel < 1 { if numParallel < 1 {
numParallel = 1 numParallel = 1
} }
// Embedding models should always be loaded with parallel=1
if req.model.CheckCapabilities(model.CapabilityCompletion) != nil {
numParallel = 1
}
// `mllama` is a snowflake and uses an encoder cache which cannot be used with num_parallel > 1
// ref: https://github.com/ollama/ollama/issues/4165
if slices.Contains(req.model.Config.ModelFamilies, "mllama") && numParallel != 1 {
numParallel = 1
slog.Warn("mllama does not currently support parallel requests")
}
sessionDuration := envconfig.KeepAlive() sessionDuration := envconfig.KeepAlive()
if req.sessionDuration != nil { if req.sessionDuration != nil {
sessionDuration = req.sessionDuration.Duration sessionDuration = req.sessionDuration.Duration
} }
llama, err := s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
s.loadedMu.Lock()
llama := s.activeLoading
if llama == nil {
var err error
llama, err = s.newServerFn(gpus, req.model.ModelPath, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, numParallel)
if err != nil { if err != nil {
// some older models are not compatible with newer versions of llama.cpp // some older models are not compatible with newer versions of llama.cpp
// show a generalized compatibility error until there is a better way to // show a generalized compatibility error until there is a better way to
...@@ -452,8 +419,32 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis ...@@ -452,8 +419,32 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
} }
slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err) slog.Info("NewLlamaServer failed", "model", req.model.ModelPath, "error", err)
req.errCh <- err req.errCh <- err
return s.loadedMu.Unlock()
return false
}
s.activeLoading = llama
} else {
if s.activeLoading.ModelPath() != req.model.ModelPath {
panic(fmt.Errorf("attempting to load different model after eviction (original %v new %v)", s.activeLoading.ModelPath(), req.model.ModelPath))
} }
}
s.loadedMu.Unlock()
err := llama.Load(req.ctx, gpus, requireFull)
if err != nil {
if errors.Is(err, llm.ErrLoadRequiredFull) {
return true
}
slog.Info("Load failed", "model", req.model.ModelPath, "error", err)
s.activeLoading.Close()
s.activeLoading = nil
req.errCh <- err
return false
}
runner := &runnerRef{ runner := &runnerRef{
model: req.model, model: req.model,
modelPath: req.model.ModelPath, modelPath: req.model.ModelPath,
...@@ -461,8 +452,8 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis ...@@ -461,8 +452,8 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
Options: &req.opts, Options: &req.opts,
sessionDuration: sessionDuration, sessionDuration: sessionDuration,
gpus: gpus, gpus: gpus,
estimatedVRAM: llama.EstimatedVRAM(), vramSize: llama.VRAMSize(),
estimatedTotal: llama.EstimatedTotal(), totalSize: llama.TotalSize(),
loading: true, loading: true,
pid: llama.Pid(), pid: llama.Pid(),
} }
...@@ -477,6 +468,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis ...@@ -477,6 +468,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
oldRunner.unload() oldRunner.unload()
oldRunner.refMu.Unlock() oldRunner.refMu.Unlock()
} }
s.activeLoading = nil
s.loaded[req.model.ModelPath] = runner s.loaded[req.model.ModelPath] = runner
slog.Info("loaded runners", "count", len(s.loaded)) slog.Info("loaded runners", "count", len(s.loaded))
s.loadedMu.Unlock() s.loadedMu.Unlock()
...@@ -503,6 +495,8 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis ...@@ -503,6 +495,8 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoLis
}() }()
req.successCh <- runner req.successCh <- runner
}() }()
return false
} }
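
To make the contract of that boolean return concrete, here is a small standalone sketch (with invented helper names, not Ollama's) of the fit-then-evict loop that processPending drives with it: try to place the model requiring a full fit, and when that fails, evict an already-loaded runner and retry.

package main

import "fmt"

// needsEviction stands in for Scheduler.load with requireFull=true: it
// reports true when the model cannot be placed fully in the free VRAM.
func needsEviction(freeVRAM, modelVRAM uint64) bool {
	return modelVRAM > freeVRAM
}

func main() {
	free := uint64(8) << 30              // 8 GiB currently free
	loaded := []uint64{6 << 30, 5 << 30} // VRAM held by already-loaded runners
	need := uint64(12) << 30             // new model's requirement

	// Evict loaded runners one at a time until the new model fits or nothing
	// is left to evict, mirroring findRunnerToUnload + unload above.
	for needsEviction(free, need) && len(loaded) > 0 {
		free += loaded[0]
		loaded = loaded[1:]
	}

	fmt.Println("fits now:", !needsEviction(free, need))
}
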
func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) { func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
...@@ -521,7 +515,7 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) { ...@@ -521,7 +515,7 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
r.refMu.Lock() r.refMu.Lock()
if r.llama != nil { if r.llama != nil {
for _, gpu := range allGpus { for _, gpu := range allGpus {
predMap[predKey{gpu.Library, gpu.ID}] += r.llama.EstimatedVRAMByGPU(gpu.ID) predMap[predKey{gpu.Library, gpu.ID}] += r.llama.VRAMByGPU(gpu.ID)
} }
} else { } else {
slog.Warn("unexpected nil runner reference, memory prediction may be incorrect") slog.Warn("unexpected nil runner reference, memory prediction may be incorrect")
...@@ -548,30 +542,6 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) { ...@@ -548,30 +542,6 @@ func (s *Scheduler) updateFreeSpace(allGpus discover.GpuInfoList) {
} }
} }
// While models are loading the VRAM consumption numbers will be indeterminate, so we have
// to avoid scheduling another model on the same GPU(s) that haven't stabilized.
// This routine returns the set of GPUs that do not have an active loading model.
// If all GPUs have loading models, an empty list will be returned (not a single CPU entry)
func (s *Scheduler) filterGPUsWithoutLoadingModels(allGpus discover.GpuInfoList) discover.GpuInfoList {
ret := append(discover.GpuInfoList{}, allGpus...)
s.loadedMu.Lock()
defer s.loadedMu.Unlock()
for _, runner := range s.loaded {
if runner.loading {
slog.Debug("overlapping loads detected", "gpus", runner.gpus, "model", runner.modelPath)
for _, busyGPU := range runner.gpus {
for i := range ret {
if ret[i].ID == busyGPU.ID {
ret = append(ret[:i], ret[i+1:]...)
break
}
}
}
}
}
return ret
}
// TODO consolidate sched_types.go // TODO consolidate sched_types.go
type runnerRef struct { type runnerRef struct {
refMu sync.Mutex refMu sync.Mutex
...@@ -581,8 +551,8 @@ type runnerRef struct { ...@@ -581,8 +551,8 @@ type runnerRef struct {
pid int pid int
loading bool // True only during initial load, then false forever loading bool // True only during initial load, then false forever
gpus discover.GpuInfoList // Recorded at time of provisioning gpus discover.GpuInfoList // Recorded at time of provisioning
estimatedVRAM uint64 vramSize uint64
estimatedTotal uint64 totalSize uint64
sessionDuration time.Duration sessionDuration time.Duration
expireTimer *time.Timer expireTimer *time.Timer
...@@ -631,9 +601,6 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool ...@@ -631,9 +601,6 @@ func (runner *runnerRef) needsReload(ctx context.Context, req *LlmRequest) bool
optsNew.NumGPU = -1 optsNew.NumGPU = -1
} }
// Normalize the NumCtx for parallelism
optsExisting.NumCtx = optsExisting.NumCtx / runner.numParallel
ctx, cancel := context.WithTimeout(ctx, timeout) ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel() defer cancel()
if !reflect.DeepEqual(runner.model.AdapterPaths, req.model.AdapterPaths) || // have the adapters changed? if !reflect.DeepEqual(runner.model.AdapterPaths, req.model.AdapterPaths) || // have the adapters changed?
...@@ -694,7 +661,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any { ...@@ -694,7 +661,7 @@ func (runner *runnerRef) waitForVRAMRecovery() chan any {
freeMemoryNow += gpu.FreeMemory freeMemoryNow += gpu.FreeMemory
} }
// If we're within ~80% of the estimated memory usage recovered, bail out // If we're within ~80% of the estimated memory usage recovered, bail out
if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.estimatedVRAM)*0.8 { if float32(freeMemoryNow-freeMemoryBefore) > float32(runner.vramSize)*0.8 {
slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "runner", runner) slog.Debug(fmt.Sprintf("gpu VRAM free memory converged after %0.2f seconds", time.Since(start).Seconds()), "runner", runner)
finished <- struct{}{} finished <- struct{}{}
return return
...@@ -719,8 +686,8 @@ func (runner *runnerRef) LogValue() slog.Value { ...@@ -719,8 +686,8 @@ func (runner *runnerRef) LogValue() slog.Value {
) )
} }
attrs = append(attrs, attrs = append(attrs,
slog.String("size", format.HumanBytes2(runner.estimatedTotal)), slog.String("size", format.HumanBytes2(runner.totalSize)),
slog.String("vram", format.HumanBytes2(runner.estimatedVRAM)), slog.String("vram", format.HumanBytes2(runner.vramSize)),
slog.Int("parallel", runner.numParallel), slog.Int("parallel", runner.numParallel),
slog.Int("pid", runner.pid), slog.Int("pid", runner.pid),
slog.String("model", runner.modelPath), slog.String("model", runner.modelPath),
...@@ -750,95 +717,7 @@ func (a ByDurationAndName) Less(i, j int) bool { ...@@ -750,95 +717,7 @@ func (a ByDurationAndName) Less(i, j int) bool {
// type BySize []*runnerRef // type BySize []*runnerRef
// func (a BySize) Len() int { return len(a) } // func (a BySize) Len() int { return len(a) }
// func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] } // func (a BySize) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
// func (a BySize) Less(i, j int) bool { return a[i].estimatedVRAM < a[j].estimatedVRAM } // func (a BySize) Less(i, j int) bool { return a[i].vramSize < a[j].vramSize }
// pickBestFullFitByLibrary will try to find the optimal placement of the model in the available GPUs where the model fully fits
// The list of GPUs returned will always be the same brand (library)
// If the model can not be fit fully within the available GPU(s) nil is returned
// If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
// opts.NumCtx accordingly
func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
var numParallelToTry []int
if *numParallel <= 0 {
// If no specific parallel setting was provided, try larger then smaller, always end with 1
numParallelToTry = append(numParallelToTry, defaultParallel, 1)
} else {
numParallelToTry = []int{*numParallel}
}
for _, gl := range gpus.ByLibrary() {
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
// TODO - potentially sort by performance capability, existing models loaded, etc.
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
if !envconfig.SchedSpread() {
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
// Try to pack into as few GPUs as possible, starting from 1 GPU
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
gpuSubset := sgl[:numGPUs]
ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
if ok {
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
"model", req.model.ModelPath,
"library", sgl[0].Library,
"parallel", p,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", numGPUs)
*numParallel = p
return gpuSubset
}
}
}
} else {
// TODO future refinements
// - if multiple Libraries, see if any single GPU in any Library will fit
// - try subsets of GPUs instead of just falling back to 1 or all in a family
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
for _, p := range numParallelToTry {
req.opts.NumCtx = req.origNumCtx * p
if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
slog.Info("new model will fit in available VRAM, loading",
"model", req.model.ModelPath,
"library", sgl[0].Library,
"parallel", p,
"required", format.HumanBytes2(estimatedVRAM),
"gpus", len(sgl))
*numParallel = p
return sgl
}
}
}
}
return nil
}
// If multiple Libraries are detected, pick the Library which loads the most layers for the model
func pickBestPartialFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
if *numParallel <= 0 {
*numParallel = 1
req.opts.NumCtx = req.origNumCtx
}
byLibrary := gpus.ByLibrary()
if len(byLibrary) <= 1 {
return gpus
}
var bestEstimate uint64
var bestFit int
for i, gl := range byLibrary {
_, estimatedVRAM := llm.PredictServerFit(gl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, *numParallel)
if estimatedVRAM > bestEstimate {
bestEstimate = estimatedVRAM
bestFit = i
}
}
return byLibrary[bestFit]
}
// findRunnerToUnload finds a runner to unload to make room for a new model // findRunnerToUnload finds a runner to unload to make room for a new model
func (s *Scheduler) findRunnerToUnload() *runnerRef { func (s *Scheduler) findRunnerToUnload() *runnerRef {
...@@ -875,6 +754,13 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef { ...@@ -875,6 +754,13 @@ func (s *Scheduler) findRunnerToUnload() *runnerRef {
func (s *Scheduler) unloadAllRunners() { func (s *Scheduler) unloadAllRunners() {
s.loadedMu.Lock() s.loadedMu.Lock()
defer s.loadedMu.Unlock() defer s.loadedMu.Unlock()
if s.activeLoading != nil {
slog.Debug("shutting down currently loading runner")
s.activeLoading.Close()
s.activeLoading = nil
}
for model, runner := range s.loaded { for model, runner := range s.loaded {
if runner.llama != nil { if runner.llama != nil {
slog.Debug("shutting down runner", "model", model) slog.Debug("shutting down runner", "model", model)
...@@ -901,18 +787,3 @@ func (s *Scheduler) expireRunner(model *Model) { ...@@ -901,18 +787,3 @@ func (s *Scheduler) expireRunner(model *Model) {
runner.refMu.Unlock() runner.refMu.Unlock()
} }
} }
// If other runners are loaded, make sure the pending request will fit in system memory
// If not, pick a runner to unload, else return nil and the request can be loaded
func (s *Scheduler) maybeFindCPURunnerToUnload(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList) *runnerRef {
slog.Debug("evaluating if CPU model load will fit in available system memory")
estimate := llm.EstimateGPULayers(gpus, f, req.model.ProjectorPaths, req.opts, req.opts.NumCtx/req.origNumCtx)
if estimate.TotalSize <= gpus[0].FreeMemory {
slog.Debug("cpu inference mode, model fits in available system memory", "model", format.HumanBytes2(estimate.TotalSize), "available", format.HumanBytes2(gpus[0].FreeMemory))
return nil
}
// TODO - optimization: try to find CPU only runners first, or partial offloads with enough in system memory to make room
return s.findRunnerToUnload()
}
...@@ -52,7 +52,7 @@ func TestLoad(t *testing.T) { ...@@ -52,7 +52,7 @@ func TestLoad(t *testing.T) {
return nil, errors.New("something failed to load model blah") return nil, errors.New("something failed to load model blah")
} }
gpus := discover.GpuInfoList{} gpus := discover.GpuInfoList{}
s.load(req, f, gpus, 0) s.load(req, f, gpus, false)
require.Empty(t, req.successCh) require.Empty(t, req.successCh)
require.Len(t, req.errCh, 1) require.Len(t, req.errCh, 1)
s.loadedMu.Lock() s.loadedMu.Lock()
...@@ -61,16 +61,17 @@ func TestLoad(t *testing.T) { ...@@ -61,16 +61,17 @@ func TestLoad(t *testing.T) {
err := <-req.errCh err := <-req.errCh
require.Contains(t, err.Error(), "this model may be incompatible") require.Contains(t, err.Error(), "this model may be incompatible")
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}} server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil return server, nil
} }
s.load(req, f, gpus, 0) s.load(req, f, gpus, false)
select { select {
case err := <-req.errCh: case err := <-req.errCh:
require.NoError(t, err) require.NoError(t, err)
case resp := <-req.successCh: case resp := <-req.successCh:
require.Equal(t, uint64(10), resp.estimatedVRAM) require.Equal(t, uint64(10), resp.vramSize)
require.Equal(t, uint(1), resp.refCount) require.Equal(t, uint(1), resp.refCount)
s.loadedMu.Lock() s.loadedMu.Lock()
require.Len(t, s.loaded, 1) require.Len(t, s.loaded, 1)
...@@ -79,7 +80,7 @@ func TestLoad(t *testing.T) { ...@@ -79,7 +80,7 @@ func TestLoad(t *testing.T) {
req.model.ModelPath = "dummy_model_path" req.model.ModelPath = "dummy_model_path"
server.waitResp = errors.New("wait failure") server.waitResp = errors.New("wait failure")
s.load(req, f, gpus, 0) s.load(req, f, gpus, false)
select { select {
case err := <-req.errCh: case err := <-req.errCh:
require.Contains(t, err.Error(), "wait failure") require.Contains(t, err.Error(), "wait failure")
...@@ -104,10 +105,11 @@ type reqBundle struct { ...@@ -104,10 +105,11 @@ type reqBundle struct {
} }
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
scenario.srv.modelPath = model
return scenario.srv, nil return scenario.srv, nil
} }
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle { func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vramSize uint64, duration *api.Duration) *reqBundle {
b := &reqBundle{} b := &reqBundle{}
b.ctx, b.ctxDone = context.WithCancel(ctx) b.ctx, b.ctxDone = context.WithCancel(ctx)
t.Helper() t.Helper()
...@@ -144,7 +146,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est ...@@ -144,7 +146,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
successCh: make(chan *runnerRef, 1), successCh: make(chan *runnerRef, 1),
errCh: make(chan error, 1), errCh: make(chan error, 1),
} }
b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}} b.srv = &mockLlm{vramSize: vramSize, vramByGPU: map[string]uint64{"": vramSize}}
return b return b
} }
...@@ -262,10 +264,10 @@ func TestRequestsMultipleLoadedModels(t *testing.T) { ...@@ -262,10 +264,10 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
// Multiple loaded models // Multiple loaded models
a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil) a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil) b := newScenarioRequest(t, ctx, "ollama-model-3b", 10*format.GigaByte, nil)
c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil) c := newScenarioRequest(t, ctx, "ollama-model-4a", 10*format.GigaByte, nil)
c.req.opts.NumGPU = 0 // CPU load, will be allowed c.req.opts.NumGPU = 0 // CPU load, will be allowed
d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded d := newScenarioRequest(t, ctx, "ollama-model-3c", 10*format.GigaByte, nil) // Needs prior unloaded
t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1") t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
s.newServerFn = a.newServer s.newServerFn = a.newServer
...@@ -418,11 +420,12 @@ func TestExpireRunner(t *testing.T) { ...@@ -418,11 +420,12 @@ func TestExpireRunner(t *testing.T) {
var f *ggml.GGML var f *ggml.GGML
gpus := discover.GpuInfoList{} gpus := discover.GpuInfoList{}
server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}} server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) { s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
server.modelPath = model
return server, nil return server, nil
} }
s.load(req, f, gpus, 0) s.load(req, f, gpus, false)
select { select {
case err := <-req.errCh: case err := <-req.errCh:
...@@ -506,7 +509,7 @@ func TestUseLoadedRunner(t *testing.T) { ...@@ -506,7 +509,7 @@ func TestUseLoadedRunner(t *testing.T) {
sessionDuration: &api.Duration{Duration: 2}, sessionDuration: &api.Duration{Duration: 2},
} }
finished := make(chan *LlmRequest) finished := make(chan *LlmRequest)
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1} r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
req.useLoadedRunner(r1, finished) req.useLoadedRunner(r1, finished)
require.Equal(t, uint(1), r1.refCount) require.Equal(t, uint(1), r1.refCount)
...@@ -541,8 +544,8 @@ func TestUpdateFreeSpace(t *testing.T) { ...@@ -541,8 +544,8 @@ func TestUpdateFreeSpace(t *testing.T) {
gpus[0].FreeMemory = 900 gpus[0].FreeMemory = 900
gpus[1].TotalMemory = 2000 gpus[1].TotalMemory = 2000
gpus[1].FreeMemory = 1900 gpus[1].FreeMemory = 1900
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}} llm1 := &mockLlm{vramByGPU: map[string]uint64{"1": 50, "2": 50}}
llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}} llm2 := &mockLlm{vramByGPU: map[string]uint64{"1": 125, "2": 75}}
r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1} r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1} r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
...@@ -557,40 +560,6 @@ func TestUpdateFreeSpace(t *testing.T) { ...@@ -557,40 +560,6 @@ func TestUpdateFreeSpace(t *testing.T) {
require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory) require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
} }
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
gpus := discover.GpuInfoList{
{
Library: "cuda",
ID: "0",
},
{
Library: "cuda",
ID: "1",
},
}
r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
s := InitScheduler(ctx)
s.loadedMu.Lock()
s.loaded["a"] = r1
s.loadedMu.Unlock()
tmp := s.filterGPUsWithoutLoadingModels(gpus)
require.Len(t, tmp, 1)
require.Equal(t, "1", tmp[0].ID)
r1.gpus = discover.GpuInfoList{gpus[1]}
tmp = s.filterGPUsWithoutLoadingModels(gpus)
require.Len(t, tmp, 1)
require.Equal(t, "0", tmp[0].ID)
r1.gpus = discover.GpuInfoList{}
tmp = s.filterGPUsWithoutLoadingModels(gpus)
require.Len(t, tmp, 2)
}
func TestFindRunnerToUnload(t *testing.T) { func TestFindRunnerToUnload(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done() defer done()
...@@ -615,7 +584,7 @@ func TestNeedsReload(t *testing.T) { ...@@ -615,7 +584,7 @@ func TestNeedsReload(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done() defer done()
llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} llm := &mockLlm{vramByGPU: map[string]uint64{}}
do := api.DefaultOptions() do := api.DefaultOptions()
runner := &runnerRef{ runner := &runnerRef{
model: &Model{ model: &Model{
...@@ -662,8 +631,8 @@ func TestUnloadAllRunners(t *testing.T) { ...@@ -662,8 +631,8 @@ func TestUnloadAllRunners(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond) ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done() defer done()
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} llm2 := &mockLlm{vramByGPU: map[string]uint64{}}
s := InitScheduler(ctx) s := InitScheduler(ctx)
s.unloadAllRunners() s.unloadAllRunners()
...@@ -681,7 +650,7 @@ func TestUnloadAllRunners(t *testing.T) { ...@@ -681,7 +650,7 @@ func TestUnloadAllRunners(t *testing.T) {
} }
func TestUnload(t *testing.T) { func TestUnload(t *testing.T) {
llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}} llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
r1 := &runnerRef{llama: llm1, numParallel: 1} r1 := &runnerRef{llama: llm1, numParallel: 1}
r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1} r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
r1.unload() r1.unload()
...@@ -707,46 +676,8 @@ func TestAlreadyCanceled(t *testing.T) { ...@@ -707,46 +676,8 @@ func TestAlreadyCanceled(t *testing.T) {
require.Empty(t, scenario1a.req.successCh) require.Empty(t, scenario1a.req.successCh)
} }
func TestHomogeneousGPUs(t *testing.T) {
ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
defer done()
s := InitScheduler(ctx)
s.getGpuFn = func() discover.GpuInfoList {
// Set memory values to require the model to be spread
gpus := []discover.GpuInfo{
{Library: "cuda"},
{Library: "rocm"},
}
gpus[0].TotalMemory = 1 * format.GibiByte
gpus[0].FreeMemory = 256 * format.MebiByte
gpus[1].TotalMemory = 1 * format.GibiByte
gpus[1].FreeMemory = 256 * format.MebiByte
return gpus
}
s.getCpuFn = getCpuFn
a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
require.Len(t, gpus, 1)
return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
}
slog.Info("a")
s.pendingReqCh <- a.req
require.Len(t, s.pendingReqCh, 1)
s.Run(ctx)
select {
case resp := <-a.req.successCh:
require.Equal(t, resp.llama, a.srv)
require.Empty(t, s.pendingReqCh)
require.Empty(t, a.req.errCh)
case err := <-a.req.errCh:
t.Fatal(err.Error())
case <-ctx.Done():
t.Fatal("timeout")
}
}
type mockLlm struct { type mockLlm struct {
modelPath string
pingResp error pingResp error
waitResp error waitResp error
completionResp error completionResp error
...@@ -758,11 +689,27 @@ type mockLlm struct { ...@@ -758,11 +689,27 @@ type mockLlm struct {
detonekizeRespErr error detonekizeRespErr error
closeResp error closeResp error
closeCalled bool closeCalled bool
estimatedVRAM uint64 vramSize uint64
estimatedTotal uint64 totalSize uint64
estimatedVRAMByGPU map[string]uint64 vramByGPU map[string]uint64
} }
func (s *mockLlm) ModelPath() string {
return s.modelPath
}
func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
if requireFull {
for _, g := range gpus {
if g.FreeMemory >= s.vramSize {
return nil
}
}
return llm.ErrLoadRequiredFull
}
return nil
}
func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp } func (s *mockLlm) Ping(ctx context.Context) error { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp } func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error { func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
...@@ -785,7 +732,7 @@ func (s *mockLlm) Close() error { ...@@ -785,7 +732,7 @@ func (s *mockLlm) Close() error {
s.closeCalled = true s.closeCalled = true
return s.closeResp return s.closeResp
} }
func (s *mockLlm) EstimatedVRAM() uint64 { return s.estimatedVRAM } func (s *mockLlm) VRAMSize() uint64 { return s.vramSize }
func (s *mockLlm) EstimatedTotal() uint64 { return s.estimatedTotal } func (s *mockLlm) TotalSize() uint64 { return s.totalSize }
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] } func (s *mockLlm) VRAMByGPU(gpuid string) uint64 { return s.vramByGPU[gpuid] }
func (s *mockLlm) Pid() int { return -1 } func (s *mockLlm) Pid() int { return -1 }