Commit d5a0d8d9 authored by Jesse Gross, committed by Jesse Gross

llm: New memory management

This changes the memory allocation strategy from upfront estimation to
tracking actual allocations done by the engine and reacting to that. The
goal is to avoid issues caused by both under-estimation (crashing) and
over-estimation (low performance due to under-utilized GPUs).

It is currently opt-in and can be enabled for models running on the
Ollama engine by setting OLLAMA_NEW_ESTIMATES=1. Behavior in other
cases is unchanged and will continue to use the existing estimates.
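
The flow this enables, visible in the runner changes below, is roughly: the Ollama server asks the runner whether a proposed GPU layout fits (without allocating), allocates memory for a layout that fits, and finally commits and loads the weights, reacting to what the backend actually reports rather than trusting an upfront estimate. Below is a minimal, self-contained Go sketch of that fit/alloc/commit control loop; the types, the runner stub, and the 512 MiB-per-layer cost are hypothetical stand-ins, not the real llm.LoadRequest/LoadResponse API.

package main

import "fmt"

// Hypothetical, simplified stand-ins for the runner's load operations; the
// real llm.LoadRequest/LoadResponse types live in the Ollama code base.
type operation int

const (
	opFit    operation = iota // check whether a layout fits, without allocating
	opAlloc                   // allocate backend memory for the layout
	opCommit                  // begin loading weights into the allocation
	opClose                   // free everything
)

type loadResponse struct {
	Success   bool
	VRAMBytes uint64 // what the backend reports it actually needs/reserved
}

// runner simulates a runner whose backend rejects layouts larger than free VRAM.
type runner struct{ freeVRAM uint64 }

func (r *runner) load(op operation, gpuLayers int) loadResponse {
	need := uint64(gpuLayers) * (512 << 20) // assume ~512 MiB per offloaded layer
	if op != opClose && need > r.freeVRAM {
		return loadResponse{Success: false, VRAMBytes: need}
	}
	return loadResponse{Success: true, VRAMBytes: need}
}

func main() {
	r := &runner{freeVRAM: 8 << 30} // 8 GiB free

	// React to the runner's answers instead of trusting an upfront estimate:
	// propose the largest layout and back off while the fit check fails.
	layers := 32
	for layers > 0 && !r.load(opFit, layers).Success {
		layers--
	}

	r.load(opAlloc, layers)  // reserve memory for the layout that fit
	r.load(opCommit, layers) // start loading weights into that reservation
	fmt.Println("committed with GPU layers:", layers)
}

In the real change, these operations arrive as llm.LoadRequest values on the runner's new POST /load endpoint, and an allocation failure is reported back as ml.ErrNoMem together with the backend's memory breakdown so the server can retry with a smaller layout.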
parent ef7d26ba
@@ -14,6 +14,7 @@ import (
 	"net"
 	"net/http"
 	"os"
+	"reflect"
 	"regexp"
 	"runtime"
 	"strconv"
@@ -259,6 +260,16 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
 }
 
 type Server struct {
+	// modelPath is the location of the model to be loaded
+	modelPath string
+
+	// loadMu prevents more than one load attempt from occurring at a time
+	loadMu sync.Mutex
+
+	// lastLoad is the load request from the previous load attempt. Used to
+	// detect if we can reuse an existing memory allocation.
+	lastLoad llm.LoadRequest
+
 	// is the server ready to process requests?
 	// protects access to model and image
 	ready sync.WaitGroup
@@ -720,17 +731,6 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {
 	}
 }
 
-type multiLPath []string
-
-func (m *multiLPath) Set(value string) error {
-	*m = append(*m, value)
-	return nil
-}
-
-func (m *multiLPath) String() string {
-	return strings.Join(*m, ", ")
-}
-
 func (s *Server) reserveWorstCaseGraph() error {
 	ctx := s.model.Backend().NewContext()
 	defer ctx.Close()
@@ -828,15 +828,28 @@ func (s *Server) reserveWorstCaseGraph() error {
 	return nil
 }
 
-func (s *Server) initModel(
+// allocModel pre-allocates the maximum needed memory for a model
+// based on the given parameters
+func (s *Server) allocModel(
 	mpath string,
 	params ml.BackendParams,
-	lpath multiLPath,
+	loraPath []string,
 	parallel int,
 	kvCacheType string,
 	kvSize int,
 	multiUserCache bool,
-) error {
+) (panicErr error) {
+	// Convert memory allocation panics to errors
+	defer func() {
+		if r := recover(); r != nil {
+			if err, ok := r.(error); ok {
+				panicErr = err
+			} else {
+				panic(r)
+			}
+		}
+	}()
+
 	var err error
 	s.model, err = model.New(mpath, params)
 	if err != nil {
@@ -844,7 +857,7 @@ func (s *Server) initModel(
 	}
 
 	// TODO(jessegross): LoRA loading
-	if lpath.String() != "" {
+	if len(loraPath) > 0 {
 		return errors.New("loras are not yet implemented")
 	}
@@ -865,63 +878,122 @@
 	return s.reserveWorstCaseGraph()
 }
 
-func (s *Server) load(
-	ctx context.Context,
-	mpath string,
-	params ml.BackendParams,
-	lpath multiLPath,
-	parallel int,
-	kvCacheType string,
-	kvSize int,
-	multiUserCache bool,
-) {
-	err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
-	if err != nil {
-		var noMem ml.ErrNoMem
-		if errors.As(err, &noMem) {
-			// We can't yet handle this but in the future we will
-			s.cache.Close()
-			if s.model != nil {
-				s.model.Backend().Close()
-			}
-		}
-		panic(err)
+// closeModel frees all memory associated with a model
+func (s *Server) closeModel() {
+	s.cache.Close()
+	s.cache = nil
+	if s.model != nil {
+		s.model.Backend().Close()
+		s.model = nil
 	}
+}
 
-	slog.Debug("memory", "allocated", s.model.Backend().BackendMemory())
-
-	err = s.model.Backend().Load(ctx,
+// loadModel loads the weights for a model. The memory must already
+// have been allocated with allocModel
+func (s *Server) loadModel() {
+	err := s.model.Backend().Load(context.TODO(),
 		func(progress float32) {
 			s.progress = progress
 		})
 	if err != nil {
-		panic(err)
+		panic(fmt.Errorf("failed to load model: %v", err))
 	}
 
 	s.status = llm.ServerStatusReady
 	s.ready.Done()
 }
+
+// load is the handler called by the Ollama server to process different
+// load operations
+func (s *Server) load(w http.ResponseWriter, r *http.Request) {
+	s.loadMu.Lock()
+	defer s.loadMu.Unlock()
+
+	w.Header().Set("Content-Type", "application/json")
+
+	if s.status != llm.ServerStatusLaunched {
+		http.Error(w, "model already loaded", http.StatusInternalServerError)
+		return
+	}
+
+	var req llm.LoadRequest
+	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+		http.Error(w, "bad request", http.StatusBadRequest)
+		return
+	}
+
+	slog.Info("load", "request", req)
+
+	if req.Operation == llm.LoadOperationClose {
+		s.closeModel()
+		if err := json.NewEncoder(w).Encode(&llm.LoadResponse{}); err != nil {
+			http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
+		}
+		return
+	}
+
+	s.lastLoad.Operation = req.Operation
+	loadModel := s.model == nil || !reflect.DeepEqual(req, s.lastLoad)
+	s.lastLoad = req
+
+	if loadModel {
+		s.closeModel()
+
+		params := ml.BackendParams{
+			AllocMemory:    req.Operation != llm.LoadOperationFit,
+			NumThreads:     req.NumThreads,
+			GPULayers:      req.GPULayers,
+			FlashAttention: req.FlashAttention,
+		}
+
+		s.batchSize = req.BatchSize
+
+		err := s.allocModel(s.modelPath, params, req.LoraPath, req.Parallel, req.KvCacheType, req.KvSize, req.MultiUserCache)
+		if err != nil {
+			s.closeModel()
+
+			var noMem ml.ErrNoMem
+			if errors.As(err, &noMem) {
+				resp := llm.LoadResponse{Success: false, Memory: noMem.BackendMemory}
+				if err := json.NewEncoder(w).Encode(&resp); err != nil {
+					http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
+				}
+				return
+			}
+
+			http.Error(w, fmt.Sprintf("failed to initialize model: %v", err), http.StatusInternalServerError)
+			return
+		}
+	}
+
+	mem := s.model.Backend().BackendMemory()
+
+	switch req.Operation {
+	case llm.LoadOperationFit:
+		// LoadOperationFit can't be used for anything else, so just close it
+		s.closeModel()
+
+	// LoadOperationAlloc should stay open for future operations
+
+	case llm.LoadOperationCommit:
+		s.status = llm.ServerStatusLoadingModel
+		go s.loadModel()
+	}
+
+	resp := llm.LoadResponse{Success: true, Memory: mem}
+	if err := json.NewEncoder(w).Encode(&resp); err != nil {
+		http.Error(w, fmt.Sprintf("failed to encode response: %v", err), http.StatusInternalServerError)
+		return
+	}
+}
 
 func Execute(args []string) error {
 	fs := flag.NewFlagSet("runner", flag.ExitOnError)
 	mpath := fs.String("model", "", "Path to model binary file")
-	parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
-	batchSize := fs.Int("batch-size", 512, "Batch size")
-	numGPULayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
-	mainGPU := fs.Int("main-gpu", 0, "Main GPU")
-	flashAttention := fs.Bool("flash-attn", false, "Enable flash attention")
-	kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
-	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
 	port := fs.Int("port", 8080, "Port to expose the server on")
-	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	_ = fs.Bool("verbose", false, "verbose output (default: disabled)")
-	_ = fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
-	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-	var lpaths multiLPath
-	fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
 	fs.Usage = func() {
 		fmt.Fprintf(fs.Output(), "Runner usage\n")
@@ -933,39 +1005,17 @@ func Execute(args []string) error {
 	slog.SetDefault(logutil.NewLogger(os.Stderr, envconfig.LogLevel()))
 	slog.Info("starting ollama engine")
 
-	server := &Server{
-		batchSize: *batchSize,
-		status:    llm.ServerStatusLoadingModel,
-	}
-
-	server.cond = sync.NewCond(&server.mu)
-	server.ready.Add(1)
-
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 
-	// TODO(jessegross): Parameters that need to be implemented:
-	// no-mmap
-
-	var tensorSplitFloats []float32
-	if *tensorSplit != "" {
-		splits := strings.Split(*tensorSplit, ",")
-		tensorSplitFloats = make([]float32, len(splits))
-		for i, s := range splits {
-			f, _ := strconv.ParseFloat(s, 32)
-			tensorSplitFloats[i] = float32(f)
-		}
-	}
-
-	params := ml.BackendParams{
-		NumThreads:     *threads,
-		NumGPULayers:   *numGPULayers,
-		MainGPU:        *mainGPU,
-		TensorSplit:    tensorSplitFloats,
-		FlashAttention: *flashAttention,
-	}
-
-	go server.load(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
+	server := &Server{
+		modelPath: *mpath,
+		status:    llm.ServerStatusLaunched,
+	}
+
+	server.cond = sync.NewCond(&server.mu)
+	server.ready.Add(1)
+
 	go server.run(ctx)
 
 	addr := "127.0.0.1:" + strconv.Itoa(*port)
@@ -978,6 +1028,7 @@ func Execute(args []string) error {
 	mux := http.NewServeMux()
 
 	// TODO: support embeddings
+	mux.HandleFunc("POST /load", server.load)
 	mux.HandleFunc("POST /embedding", func(w http.ResponseWriter, r *http.Request) {
 		http.Error(w, "this model does not support embeddings", http.StatusNotImplemented)
 	})
......
@@ -1477,14 +1477,14 @@ func (s *Server) PsHandler(c *gin.Context) {
 		mr := api.ProcessModelResponse{
 			Model:     model.ShortName,
 			Name:      model.ShortName,
-			Size:      int64(v.estimatedTotal),
-			SizeVRAM:  int64(v.estimatedVRAM),
+			Size:      int64(v.totalSize),
+			SizeVRAM:  int64(v.vramSize),
 			Digest:    model.Digest,
 			Details:   modelDetails,
 			ExpiresAt: v.expiresAt,
 		}
 		if v.Options != nil {
-			mr.ContextLength = v.Options.NumCtx / v.numParallel
+			mr.ContextLength = v.Options.NumCtx
 		}
 		// The scheduler waits to set expiresAt, so if a model is loading it's
 		// possible that it will be set to the unix epoch. For those cases, just
......
@@ -77,12 +77,13 @@ func TestGenerateChat(t *testing.T) {
 			getGpuFn:     discover.GetGPUInfo,
 			getCpuFn:     discover.GetCPUInfo,
 			reschedDelay: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
 					llama: &mock,
 				}
+				return false
 			},
 		},
 	}
@@ -620,12 +621,13 @@ func TestGenerate(t *testing.T) {
 			getGpuFn:     discover.GetGPUInfo,
 			getCpuFn:     discover.GetCPUInfo,
 			reschedDelay: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				// add small delay to simulate loading
 				time.Sleep(time.Millisecond)
 				req.successCh <- &runnerRef{
 					llama: &mock,
 				}
+				return false
 			},
 		},
 	}
......
@@ -277,10 +277,11 @@ func TestChatHarmonyParserStreamingRealtime(t *testing.T) {
 			getGpuFn:     discover.GetGPUInfo,
 			getCpuFn:     discover.GetCPUInfo,
 			reschedDelay: 100 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				req.successCh <- &runnerRef{
 					llama: &mock,
 				}
+				return false
 			},
 		},
 	}
@@ -427,10 +428,11 @@ func TestChatHarmonyParserStreamingSimple(t *testing.T) {
 			getGpuFn:     discover.GetGPUInfo,
 			getCpuFn:     discover.GetCPUInfo,
 			reschedDelay: 100 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				req.successCh <- &runnerRef{
 					llama: &mock,
 				}
+				return false
 			},
 		},
 	}
@@ -608,10 +610,11 @@ func TestChatHarmonyParserStreaming(t *testing.T) {
 			getGpuFn:     discover.GetGPUInfo,
 			getCpuFn:     discover.GetCPUInfo,
 			reschedDelay: 250 * time.Millisecond,
-			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ int) {
+			loadFn: func(req *LlmRequest, _ *ggml.GGML, _ discover.GpuInfoList, _ bool) bool {
 				req.successCh <- &runnerRef{
 					llama: &mock,
 				}
+				return false
 			},
 		},
 	}
......
This diff is collapsed.
@@ -52,7 +52,7 @@ func TestLoad(t *testing.T) {
 		return nil, errors.New("something failed to load model blah")
 	}
 	gpus := discover.GpuInfoList{}
-	s.load(req, f, gpus, 0)
+	s.load(req, f, gpus, false)
 	require.Empty(t, req.successCh)
 	require.Len(t, req.errCh, 1)
 	s.loadedMu.Lock()
@@ -61,16 +61,17 @@ func TestLoad(t *testing.T) {
 	err := <-req.errCh
 	require.Contains(t, err.Error(), "this model may be incompatible")
 
-	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
+	server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
 	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+		server.modelPath = model
 		return server, nil
 	}
-	s.load(req, f, gpus, 0)
+	s.load(req, f, gpus, false)
 	select {
 	case err := <-req.errCh:
 		require.NoError(t, err)
 	case resp := <-req.successCh:
-		require.Equal(t, uint64(10), resp.estimatedVRAM)
+		require.Equal(t, uint64(10), resp.vramSize)
 		require.Equal(t, uint(1), resp.refCount)
 		s.loadedMu.Lock()
 		require.Len(t, s.loaded, 1)
@@ -79,7 +80,7 @@ func TestLoad(t *testing.T) {
 	req.model.ModelPath = "dummy_model_path"
 	server.waitResp = errors.New("wait failure")
-	s.load(req, f, gpus, 0)
+	s.load(req, f, gpus, false)
 	select {
 	case err := <-req.errCh:
 		require.Contains(t, err.Error(), "wait failure")
@@ -104,10 +105,11 @@ type reqBundle struct {
 }
 
 func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+	scenario.srv.modelPath = model
 	return scenario.srv, nil
 }
 
-func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
+func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vramSize uint64, duration *api.Duration) *reqBundle {
 	b := &reqBundle{}
 	b.ctx, b.ctxDone = context.WithCancel(ctx)
 	t.Helper()
@@ -144,7 +146,7 @@ func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, est
 		successCh: make(chan *runnerRef, 1),
 		errCh:     make(chan error, 1),
 	}
-	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
+	b.srv = &mockLlm{vramSize: vramSize, vramByGPU: map[string]uint64{"": vramSize}}
 	return b
 }
@@ -262,10 +264,10 @@ func TestRequestsMultipleLoadedModels(t *testing.T) {
 	// Multiple loaded models
 	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
-	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
-	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
+	b := newScenarioRequest(t, ctx, "ollama-model-3b", 10*format.GigaByte, nil)
+	c := newScenarioRequest(t, ctx, "ollama-model-4a", 10*format.GigaByte, nil)
 	c.req.opts.NumGPU = 0 // CPU load, will be allowed
-	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
+	d := newScenarioRequest(t, ctx, "ollama-model-3c", 10*format.GigaByte, nil) // Needs prior unloaded
 
 	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
 	s.newServerFn = a.newServer
@@ -418,11 +420,12 @@ func TestExpireRunner(t *testing.T) {
 	var f *ggml.GGML
 	gpus := discover.GpuInfoList{}
-	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
+	server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
 	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
+		server.modelPath = model
 		return server, nil
 	}
-	s.load(req, f, gpus, 0)
+	s.load(req, f, gpus, false)
 	select {
 	case err := <-req.errCh:
@@ -506,7 +509,7 @@ func TestUseLoadedRunner(t *testing.T) {
 		sessionDuration: &api.Duration{Duration: 2},
 	}
 	finished := make(chan *LlmRequest)
-	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+	llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
 	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
 	req.useLoadedRunner(r1, finished)
 	require.Equal(t, uint(1), r1.refCount)
@@ -541,8 +544,8 @@ func TestUpdateFreeSpace(t *testing.T) {
 	gpus[0].FreeMemory = 900
 	gpus[1].TotalMemory = 2000
 	gpus[1].FreeMemory = 1900
-	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
-	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
+	llm1 := &mockLlm{vramByGPU: map[string]uint64{"1": 50, "2": 50}}
+	llm2 := &mockLlm{vramByGPU: map[string]uint64{"1": 125, "2": 75}}
 	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
 	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
@@ -557,40 +560,6 @@ func TestUpdateFreeSpace(t *testing.T) {
 	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
 }
 
-func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
-	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
-	defer done()
-	gpus := discover.GpuInfoList{
-		{
-			Library: "cuda",
-			ID:      "0",
-		},
-		{
-			Library: "cuda",
-			ID:      "1",
-		},
-	}
-	r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
-	s := InitScheduler(ctx)
-	s.loadedMu.Lock()
-	s.loaded["a"] = r1
-	s.loadedMu.Unlock()
-
-	tmp := s.filterGPUsWithoutLoadingModels(gpus)
-	require.Len(t, tmp, 1)
-	require.Equal(t, "1", tmp[0].ID)
-
-	r1.gpus = discover.GpuInfoList{gpus[1]}
-	tmp = s.filterGPUsWithoutLoadingModels(gpus)
-	require.Len(t, tmp, 1)
-	require.Equal(t, "0", tmp[0].ID)
-
-	r1.gpus = discover.GpuInfoList{}
-	tmp = s.filterGPUsWithoutLoadingModels(gpus)
-	require.Len(t, tmp, 2)
-}
-
 func TestFindRunnerToUnload(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
@@ -615,7 +584,7 @@ func TestNeedsReload(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
 
-	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+	llm := &mockLlm{vramByGPU: map[string]uint64{}}
 	do := api.DefaultOptions()
 	runner := &runnerRef{
 		model: &Model{
@@ -662,8 +631,8 @@ func TestUnloadAllRunners(t *testing.T) {
 	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
 	defer done()
 
-	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
-	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+	llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
+	llm2 := &mockLlm{vramByGPU: map[string]uint64{}}
 	s := InitScheduler(ctx)
 	s.unloadAllRunners()
@@ -681,7 +650,7 @@ func TestUnloadAllRunners(t *testing.T) {
 }
 
 func TestUnload(t *testing.T) {
-	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
+	llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
 	r1 := &runnerRef{llama: llm1, numParallel: 1}
 	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
 	r1.unload()
@@ -707,62 +676,40 @@ func TestAlreadyCanceled(t *testing.T) {
 	require.Empty(t, scenario1a.req.successCh)
 }
 
-func TestHomogeneousGPUs(t *testing.T) {
-	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
-	defer done()
-	s := InitScheduler(ctx)
-
-	s.getGpuFn = func() discover.GpuInfoList {
-		// Set memory values to require the model to be spread
-		gpus := []discover.GpuInfo{
-			{Library: "cuda"},
-			{Library: "rocm"},
-		}
-		gpus[0].TotalMemory = 1 * format.GibiByte
-		gpus[0].FreeMemory = 256 * format.MebiByte
-		gpus[1].TotalMemory = 1 * format.GibiByte
-		gpus[1].FreeMemory = 256 * format.MebiByte
-		return gpus
-	}
-	s.getCpuFn = getCpuFn
-	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
-	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
-		require.Len(t, gpus, 1)
-		return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
-	}
-	slog.Info("a")
-	s.pendingReqCh <- a.req
-	require.Len(t, s.pendingReqCh, 1)
-	s.Run(ctx)
-	select {
-	case resp := <-a.req.successCh:
-		require.Equal(t, resp.llama, a.srv)
-		require.Empty(t, s.pendingReqCh)
-		require.Empty(t, a.req.errCh)
-	case err := <-a.req.errCh:
-		t.Fatal(err.Error())
-	case <-ctx.Done():
-		t.Fatal("timeout")
-	}
-}
-
 type mockLlm struct {
+	modelPath         string
 	pingResp          error
 	waitResp          error
 	completionResp    error
 	embeddingResp     []float32
 	embeddingRespErr  error
 	tokenizeResp      []int
 	tokenizeRespErr   error
 	detokenizeResp    string
 	detonekizeRespErr error
 	closeResp         error
 	closeCalled       bool
-	estimatedVRAM      uint64
-	estimatedTotal     uint64
-	estimatedVRAMByGPU map[string]uint64
+	vramSize          uint64
+	totalSize         uint64
+	vramByGPU         map[string]uint64
 }
 
+func (s *mockLlm) ModelPath() string {
+	return s.modelPath
+}
+
+func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
+	if requireFull {
+		for _, g := range gpus {
+			if g.FreeMemory >= s.vramSize {
+				return nil
+			}
+		}
+
+		return llm.ErrLoadRequiredFull
+	}
+
+	return nil
+}
+
 func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
 func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
 func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
@@ -785,7 +732,7 @@ func (s *mockLlm) Close() error {
 	s.closeCalled = true
 	return s.closeResp
 }
-func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
-func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
-func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
+func (s *mockLlm) VRAMSize() uint64                { return s.vramSize }
+func (s *mockLlm) TotalSize() uint64               { return s.totalSize }
+func (s *mockLlm) VRAMByGPU(gpuid string) uint64   { return s.vramByGPU[gpuid] }
 func (s *mockLlm) Pid() int                        { return -1 }