sched_test.go 21.5 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
6
7
8
9
package server

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"log/slog"
	"os"
10
	"runtime"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
11
12
13
14
15
	"testing"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
Michael Yang's avatar
lint  
Michael Yang committed
16
	"github.com/ollama/ollama/envconfig"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
	"github.com/stretchr/testify/require"
)

// init sets OLLAMA_DEBUG=1 in the process environment and then calls
// lifecycle.InitLogging, so logging is configured with that variable already
// present (presumably enabling debug-level output for the tests).
func init() {
	const debugEnvKey = "OLLAMA_DEBUG"
	os.Setenv(debugEnvKey, "1")
	lifecycle.InitLogging()
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
32
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
	require.NotNil(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
35
36
37
}

func TestLoad(t *testing.T) {
38
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
39
40
	defer done()
	s := InitScheduler(ctx)
41
	var ggml *llm.GGML // value not used in tests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
42
43
44
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
45
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
47
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
48
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
49
50
	}
	// Fail to load model first
Daniel Hiltgen's avatar
Daniel Hiltgen committed
51
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
52
53
54
		return nil, fmt.Errorf("something failed to load model blah")
	}
	gpus := gpu.GpuInfoList{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
55
	s.load(req, ggml, gpus, 0)
Michael Yang's avatar
lint  
Michael Yang committed
56
	require.Empty(t, req.successCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
57
	require.Len(t, req.errCh, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
58
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
59
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
61
62
63
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

64
	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
65
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
66
67
		return server, nil
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
68
	s.load(req, ggml, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
69
70
71
72
73
74
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
75
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
79
80
81
	}

	req.model.ModelPath = "dummy_model_path"
	server.waitResp = fmt.Errorf("wait failure")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
82
	s.load(req, ggml, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
83
84
85
86
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
87
		t.Fatalf("unexpected success %v", resp)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
88
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
	runner := s.loaded["dummy_model_path"]
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
93
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
94
	time.Sleep(1 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
95
96
97
	require.Len(t, s.expiredCh, 1)
}

98
type reqBundle struct {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
99
100
101
102
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
103
	ggml    *llm.GGML
Daniel Hiltgen's avatar
Daniel Hiltgen committed
104
105
}

106
func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
107
108
109
	return scenario.srv, nil
}

110
111
112
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
113
114
115
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), modelName)
Michael Yang's avatar
lint  
Michael Yang committed
116
	require.NoError(t, err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
	defer f.Close()

	gguf := llm.NewGGUFV3(binary.LittleEndian)
	err = gguf.Encode(f, llm.KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []llm.Tensor{
132
133
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
134
	})
Michael Yang's avatar
lint  
Michael Yang committed
135
	require.NoError(t, err)
136

Daniel Hiltgen's avatar
Daniel Hiltgen committed
137
138
	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
139
	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
140
	require.NoError(t, err)
141

142
143
144
145
146
	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
147
		model:           model,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
148
		opts:            api.DefaultOptions(),
149
		sessionDuration: duration,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
150
151
152
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
153
154
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
Daniel Hiltgen's avatar
Daniel Hiltgen committed
155
156
}

157
158
159
160
161
162
// getGpuFn is a stand-in for the scheduler's GPU discovery hook. It reports
// one "metal" device with 24*format.GigaByte total and 12*format.GigaByte
// free memory.
func getGpuFn() gpu.GpuInfoList {
	metal := gpu.GpuInfo{Library: "metal"}
	metal.FreeMemory = 12 * format.GigaByte
	metal.TotalMemory = 24 * format.GigaByte
	return gpu.GpuInfoList{metal}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
163

164
165
166
167
168
169
// getCpuFn is a stand-in for the scheduler's CPU discovery hook. It reports
// one "cpu" device with 32*format.GigaByte total and 26*format.GigaByte free
// memory.
func getCpuFn() gpu.GpuInfoList {
	cpu := gpu.GpuInfo{Library: "cpu"}
	cpu.FreeMemory = 26 * format.GigaByte
	cpu.TotalMemory = 32 * format.GigaByte
	return gpu.GpuInfoList{cpu}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
170

171
172
173
func TestRequestsSameModelSameRequest(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
174
	s := InitScheduler(ctx)
175
176
177
178
179
180
181
182
183
184
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
	b.req.model = a.req.model
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
185
186
187
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
188
189
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
190
		require.Empty(t, s.pendingReqCh)
191
192
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
193
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
194
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
195
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
196
197
198
	}

	// Same runner as first request due to not needing a reload
199
200
201
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
202
	select {
203
204
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
205
		require.Empty(t, s.pendingReqCh)
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

func TestRequestsSimpleReloadSameModel(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
	tmpModel := *a.req.model
	b.req.model = &tmpModel
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
237
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
238
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
239
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
240
241
242
	}

	// Trigger a reload
243
244
245
246
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
247
248
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
249
	a.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
250
	select {
251
252
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
253
		require.Empty(t, s.pendingReqCh)
254
255
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
256
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
257
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
258
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
259
	}
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
}

func TestRequestsMultipleLoadedModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn

	// Multiple loaded models
	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
Daniel Hiltgen's avatar
Daniel Hiltgen committed
275

276
	envconfig.MaxRunners = 1
277
278
279
280
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
281
	select {
282
283
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
284
		require.Empty(t, s.pendingReqCh)
285
286
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
287
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
288
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
289
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
290
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
291
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
292
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
293
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
294

295
	envconfig.MaxRunners = 0
296
297
298
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
299
	select {
300
301
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
302
		require.Empty(t, s.pendingReqCh)
303
304
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
305
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
306
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
307
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
308
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
309
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
310
	require.Len(t, s.loaded, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
311
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
312

Daniel Hiltgen's avatar
Daniel Hiltgen committed
313
	// This is a CPU load with NumGPU = 0 so it should load
314
315
316
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
317
	select {
318
319
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
Michael Yang's avatar
lint  
Michael Yang committed
320
		require.Empty(t, s.pendingReqCh)
321
322
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
323
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
324
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
325
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
326
327
328
329
330
331
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

	// Try to load a model that wont fit
332
333
	s.newServerFn = d.newServer
	slog.Info("d")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
334
335
336
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
337
	a.ctxDone() // Won't help since this one isn't big enough to make room
Daniel Hiltgen's avatar
Daniel Hiltgen committed
338
	time.Sleep(2 * time.Millisecond)
339
	s.pendingReqCh <- d.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
340
341
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
342
343
344
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
345
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
346
	select {
347
348
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
Michael Yang's avatar
lint  
Michael Yang committed
349
		require.Empty(t, s.pendingReqCh)
350
		require.Empty(t, d.req.errCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
351
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
352
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
353
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
354
355
356
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
357
358
}

359
360
func TestRequestsModelTooBigForSystem(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
361
362
363
364
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
365
366
		g.TotalMemory = 4 * format.MebiByte
		g.FreeMemory = 3 * format.MebiByte
Daniel Hiltgen's avatar
Daniel Hiltgen committed
367
368
		return []gpu.GpuInfo{g}
	}
369
370
371
372
373
374
375
376
377
378
379
380

	s.getCpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "cpu"}
		g.TotalMemory = 4 * format.MebiByte
		g.FreeMemory = 2 * format.MebiByte
		return []gpu.GpuInfo{g}
	}
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
381
	require.Len(t, s.pendingReqCh, 1)
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
	s.Run(ctx)
	select {
	case <-a.req.successCh:
		if runtime.GOOS == "linux" {
			t.Fatal("request should have been rejected with out of space")
		}
		// else - Darwin and Windows don't reject right now
	case err := <-a.req.errCh:
		require.Contains(t, err.Error(), "too large")
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}
func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
	envconfig.MaxQueuedRequests = 1
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
412
	require.Len(t, s.pendingReqCh, 1)
Michael Yang's avatar
lint  
Michael Yang committed
413
	require.Empty(t, successCh1b)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
414
415
416
417
418
419
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
420
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
421
422
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
423
424
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
425
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
426
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
427
	}
428
	a.ctxDone() // Set "a" model to idle so it can unload
Daniel Hiltgen's avatar
Daniel Hiltgen committed
429
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
430
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
431
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
432

433
434
435
	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
436
	// Starts in pending channel, then should be quickly processsed to return an error
437
	time.Sleep(20 * time.Millisecond) // Long enough for the "a" model to expire and unload
Michael Yang's avatar
lint  
Michael Yang committed
438
	require.Empty(t, successCh1c)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
439
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
440
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
441
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
442
443
444
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
445
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
446
447
448
449
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
450
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
451
452
453
	defer done()

	// Same model, same request
454
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
455
456
457
458
459
460
461
462
463
464
465
466
467
468
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
469
470
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
471
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
472
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
473
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
474
475
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
476
477
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
478
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
479
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
480
	}
481
	time.Sleep(scenario1a.req.sessionDuration.Duration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
482
483
484
485
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
Michael Yang's avatar
lint  
Michael Yang committed
486
	require.Empty(t, s.finishedReqCh)
487
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
488
	require.Empty(t, s.loaded)
489
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
490
491
492
493
494
495
496

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
497
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
498
499
	req := &LlmRequest{
		ctx:             ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
500
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
501
		successCh:       make(chan *runnerRef, 1),
502
		sessionDuration: &api.Duration{Duration: 2},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
503
504
	}
	finished := make(chan *LlmRequest)
505
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
506
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
507
508
509
510
511
512
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
513
514
	case err := <-req.errCh:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
515
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
516
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
517
518
519
520
521
522
523
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
524
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
540
541
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
542
543
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
544
545

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
546
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
547
548
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
549
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
550
551

	s.updateFreeSpace(gpus)
552
553
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
554
555
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "cuda",
			ID:      "0",
		},
		{
			Library: "cuda",
			ID:      "1",
		},
	}
	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loadedMu.Unlock()

	tmp := s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "1", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{gpus[1]}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "0", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
588
589
590
}

func TestFindRunnerToUnload(t *testing.T) {
591
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
592
	defer done()
593

Daniel Hiltgen's avatar
Daniel Hiltgen committed
594
595
	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
596
597

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
598
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
599
600
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
601
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
602

603
	resp := s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
604
605
	require.Equal(t, r2, resp)
	r2.refCount = 1
606
	resp = s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
607
608
609
610
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
611
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
612
613
	defer done()

614
	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
615
	do := api.DefaultOptions()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
616
	runner := &runnerRef{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
617
618
619
620
621
622
623
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
624
625
626
627
628
629
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
630
		opts: api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
631
632
633
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
634
	req.model.AdapterPaths = runner.model.AdapterPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
635
636
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
637
	req.model.ProjectorPaths = runner.model.ProjectorPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
638
639
640
641
642
643
644
645
646
647
648
649
650
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
	llm.pingResp = fmt.Errorf("foo")
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
651
652
653
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
654
655
656
657
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
658
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
659
660
	defer done()

661
662
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
663
664
665
	s := InitScheduler(ctx)
	s.unloadAllRunners()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
666
667
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
668

Daniel Hiltgen's avatar
Daniel Hiltgen committed
669
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
670
671
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
672
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
673
674
675
676
677
678
679
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
680
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
681
682
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
683
684
685
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
686
	require.Nil(t, r2.model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
687
688
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
689
690
691
692
693
func TestAlreadyCanceled(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
694
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
Daniel Hiltgen's avatar
Daniel Hiltgen committed
695
696
697
698
699
700
701
702
703
704
705
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
706
type mockLlm struct {
707
708
709
	pingResp           error
	waitResp           error
	completionResp     error
710
711
	embedResp          [][]float32
	embedRespErr       error
712
713
714
715
716
717
718
719
720
	tokenizeResp       []int
	tokenizeRespErr    error
	detokenizeResp     string
	detonekizeRespErr  error
	closeResp          error
	closeCalled        bool
	estimatedVRAM      uint64
	estimatedTotal     uint64
	estimatedVRAMByGPU map[string]uint64
Daniel Hiltgen's avatar
Daniel Hiltgen committed
721
722
723
724
725
726
727
}

func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}
728
729
func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) {
	return s.embedResp, s.embedRespErr
Daniel Hiltgen's avatar
Daniel Hiltgen committed
730
731
732
733
734
735
736
737
738
739
740
}
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detonekizeRespErr
}
func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}
741
742
func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
Daniel Hiltgen's avatar
Daniel Hiltgen committed
743
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }