Restructure loading conditional chain

36a6dacc · Bryce Reitano · ceb0e26e · 36a6dacc · 36a6dacc
Commit 36a6dacc authored Apr 24, 2024 by Bryce Reitano
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 19 deletions

server/sched.go server/sched.go +17 -18

server/sched_test.go server/sched_test.go +1 -1

No files found.
--- a/server/sched.go
+++ b/server/sched.go
@@ -123,36 +123,35 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						pending.useLoadedRunner(runner, s.finishedReqCh)
 						break
 					}
-				} else if loadedCount == 0 {
-					slog.Debug("loading first model", "model", pending.model.ModelPath)
-					gpus := s.getGpuFn()
-
-					ggml, err := llm.LoadModel(pending.model.ModelPath)
-					if err != nil {
-						pending.errCh <- err
-						break
-					}
-					g := pickBestFitGPUs(pending, ggml, gpus)
-					if g != nil {
-						gpus = g
-					}
-					s.loadFn(pending, ggml, gpus)
-					break
 				} else if loadedMax > 0 && loadedCount >= loadedMax {
 					slog.Debug("max runners achieved, unloading one to make room", "runner_count", loadedCount)
 					runnerToExpire = s.findRunnerToUnload(pending)
 				} else {
-					// More than one loaded model, so we have to see if the new one fits
+					// Either no models are loaded or below loadedMax
 					// Get a refreshed GPU list
 					gpus := s.getGpuFn()
-					// Update free memory from currently loaded models
-					s.updateFreeSpace(gpus)

+					// Load model for fitting
 					ggml, err := llm.LoadModel(pending.model.ModelPath)
 					if err != nil {
 						pending.errCh <- err
 						break
 					}
+
+					// No models loaded. Load the model but prefer the best fit.
+					if loadedCount == 0 {
+						slog.Debug("loading first model", "model", pending.model.ModelPath)
+						g := pickBestFitGPUs(pending, ggml, gpus)
+						if g != nil {
+							gpus = g
+						}
+						s.loadFn(pending, ggml, gpus)
+						break
+					}
+
+					// At least one model is already loaded, so we have to see if the new one fits
+					// Update free memory from currently loaded models
+					s.updateFreeSpace(gpus)
 					gpus = pickBestFitGPUs(pending, ggml, gpus)
 					if gpus != nil {
 						slog.Debug("new model fits with existing models, loading")

--- a/server/sched_test.go
+++ b/server/sched_test.go
@@ -47,7 +47,7 @@ func TestLoad(t *testing.T) {
 	ctx, done := context.WithTimeout(context.Background(), 5*time.Millisecond)
 	defer done()
 	s := InitScheduler(ctx)
-	ggml := nil // value not used in tests
+	var ggml *llm.GGML // value not used in tests
 	req := &LlmRequest{
 		ctx:             ctx,
 		model:           &Model{ModelPath: "foo"},