sched_test.go 21.7 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
package server

import (
	"bytes"
	"context"
Michael Yang's avatar
lint  
Michael Yang committed
6
	"errors"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
7
8
9
10
11
	"log/slog"
	"os"
	"testing"
	"time"

Michael Yang's avatar
lint  
Michael Yang committed
12
13
	"github.com/stretchr/testify/require"

Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
15
16
17
18
19
20
	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
)

Michael Yang's avatar
lint  
Michael Yang committed
21
func TestMain(m *testing.M) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
22
23
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
Michael Yang's avatar
lint  
Michael Yang committed
24
	os.Exit(m.Run())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
25
26
27
28
29
30
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
31
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
32
	require.NotNil(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
35
36
}

func TestLoad(t *testing.T) {
37
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
38
39
	defer done()
	s := InitScheduler(ctx)
40
	var ggml *llm.GGML // value not used in tests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
41
42
43
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
44
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
45
46
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
47
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
48
49
	}
	// Fail to load model first
Daniel Hiltgen's avatar
Daniel Hiltgen committed
50
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Michael Yang's avatar
lint  
Michael Yang committed
51
		return nil, errors.New("something failed to load model blah")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
52
53
	}
	gpus := gpu.GpuInfoList{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
54
	s.load(req, ggml, gpus, 0)
Michael Yang's avatar
lint  
Michael Yang committed
55
	require.Empty(t, req.successCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
56
	require.Len(t, req.errCh, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
57
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
58
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
59
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
61
62
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

63
	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
64
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
65
66
		return server, nil
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
67
	s.load(req, ggml, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
68
69
70
71
72
73
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
74
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
75
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
78
79
	}

	req.model.ModelPath = "dummy_model_path"
Michael Yang's avatar
lint  
Michael Yang committed
80
	server.waitResp = errors.New("wait failure")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
81
	s.load(req, ggml, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
82
83
84
85
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
86
		t.Fatalf("unexpected success %v", resp)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
87
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
88
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
	runner := s.loaded["dummy_model_path"]
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
92
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
	time.Sleep(1 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
94
95
96
	require.Len(t, s.expiredCh, 1)
}

97
type reqBundle struct {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
98
99
100
101
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
102
	ggml    *llm.GGML
Daniel Hiltgen's avatar
Daniel Hiltgen committed
103
104
}

105
func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
106
107
108
	return scenario.srv, nil
}

109
110
111
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
112
113
114
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), modelName)
Michael Yang's avatar
lint  
Michael Yang committed
115
	require.NoError(t, err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
116
117
	defer f.Close()

Michael Yang's avatar
Michael Yang committed
118
	require.NoError(t, llm.WriteGGUF(f, llm.KV{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
119
120
121
122
123
124
125
126
127
128
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
Michael Yang's avatar
Michael Yang committed
129
	}, []llm.Tensor{
130
131
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
Michael Yang's avatar
Michael Yang committed
132
	}))
Michael Yang's avatar
lint  
Michael Yang committed
133
	require.NoError(t, err)
134

Daniel Hiltgen's avatar
Daniel Hiltgen committed
135
136
	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
137
	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
138
	require.NoError(t, err)
139

140
141
142
143
144
	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
145
		model:           model,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
146
		opts:            api.DefaultOptions(),
147
		sessionDuration: duration,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
148
149
150
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
151
152
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
Daniel Hiltgen's avatar
Daniel Hiltgen committed
153
154
}

155
156
157
158
159
160
// getGpuFn is a GPU discovery stub: it reports a single "metal" device with
// 24 GB of total memory, 12 GB of it free.
func getGpuFn() gpu.GpuInfoList {
	info := gpu.GpuInfo{Library: "metal"}
	info.TotalMemory = 24 * format.GigaByte
	info.FreeMemory = 12 * format.GigaByte
	return gpu.GpuInfoList{info}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
161

162
163
164
165
166
167
// getCpuFn is a CPU discovery stub: it reports a single "cpu" entry with
// 32 GB of total memory, 26 GB of it free.
func getCpuFn() gpu.GpuInfoList {
	info := gpu.GpuInfo{Library: "cpu"}
	info.TotalMemory = 32 * format.GigaByte
	info.FreeMemory = 26 * format.GigaByte
	return gpu.GpuInfoList{info}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
168

169
170
171
func TestRequestsSameModelSameRequest(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
172
	s := InitScheduler(ctx)
173
174
175
176
177
178
179
180
181
182
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
	b.req.model = a.req.model
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
183
184
185
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
186
187
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
188
		require.Empty(t, s.pendingReqCh)
189
190
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
191
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
192
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
193
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
194
195
196
	}

	// Same runner as first request due to not needing a reload
197
198
199
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
200
	select {
201
202
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
203
		require.Empty(t, s.pendingReqCh)
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

func TestRequestsSimpleReloadSameModel(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
	tmpModel := *a.req.model
	b.req.model = &tmpModel
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
235
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
236
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
237
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
238
239
240
	}

	// Trigger a reload
241
242
243
244
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
245
246
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
247
	a.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
248
	select {
249
250
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
251
		require.Empty(t, s.pendingReqCh)
252
253
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
254
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
255
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
256
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
257
	}
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
}

func TestRequestsMultipleLoadedModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn

	// Multiple loaded models
	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
Daniel Hiltgen's avatar
Daniel Hiltgen committed
273

Michael Yang's avatar
int  
Michael Yang committed
274
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
275
276
277
278
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
279
	select {
280
281
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
282
		require.Empty(t, s.pendingReqCh)
283
284
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
285
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
286
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
287
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
288
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
289
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
290
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
291
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
292

Michael Yang's avatar
int  
Michael Yang committed
293
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
294
295
296
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
297
	select {
298
299
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
300
		require.Empty(t, s.pendingReqCh)
301
302
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
303
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
304
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
305
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
306
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
307
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
308
	require.Len(t, s.loaded, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
309
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
310

Daniel Hiltgen's avatar
Daniel Hiltgen committed
311
	// This is a CPU load with NumGPU = 0 so it should load
312
313
314
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
315
	select {
316
317
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
Michael Yang's avatar
lint  
Michael Yang committed
318
		require.Empty(t, s.pendingReqCh)
319
320
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
321
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
322
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
323
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
324
325
326
327
328
329
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

	// Try to load a model that wont fit
330
331
	s.newServerFn = d.newServer
	slog.Info("d")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
332
333
334
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
335
	a.ctxDone() // Won't help since this one isn't big enough to make room
Daniel Hiltgen's avatar
Daniel Hiltgen committed
336
	time.Sleep(2 * time.Millisecond)
337
	s.pendingReqCh <- d.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
338
339
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
340
341
342
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
343
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
344
	select {
345
346
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
Michael Yang's avatar
lint  
Michael Yang committed
347
		require.Empty(t, s.pendingReqCh)
348
		require.Empty(t, d.req.errCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
349
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
350
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
351
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
352
353
354
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
355
356
}

357
358
359
360
361
362
363
func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
Michael Yang's avatar
int  
Michael Yang committed
364
	t.Setenv("OLLAMA_MAX_QUEUE", "1")
365
366
367
368
369
370
371
372
373
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
374
	require.Len(t, s.pendingReqCh, 1)
Michael Yang's avatar
lint  
Michael Yang committed
375
	require.Empty(t, successCh1b)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
376
377
378
379
380
381
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
382
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
383
384
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
385
386
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
387
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
388
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
389
	}
390
	a.ctxDone() // Set "a" model to idle so it can unload
Daniel Hiltgen's avatar
Daniel Hiltgen committed
391
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
392
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
393
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
394

395
396
397
	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
398
	// Starts in pending channel, then should be quickly processsed to return an error
399
	time.Sleep(20 * time.Millisecond) // Long enough for the "a" model to expire and unload
Michael Yang's avatar
lint  
Michael Yang committed
400
	require.Empty(t, successCh1c)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
401
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
402
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
403
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
404
405
406
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
407
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
408
409
410
411
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
412
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
413
414
415
	defer done()

	// Same model, same request
416
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
417
418
419
420
421
422
423
424
425
426
427
428
429
430
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
431
432
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
433
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
434
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
435
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
436
437
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
438
439
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
440
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
441
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
442
	}
443
	time.Sleep(scenario1a.req.sessionDuration.Duration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
444
445
446
447
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
Michael Yang's avatar
lint  
Michael Yang committed
448
	require.Empty(t, s.finishedReqCh)
449
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
450
	require.Empty(t, s.loaded)
451
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
452
453
454
455
456
457
458

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
459
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
460
461
	req := &LlmRequest{
		ctx:             ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
462
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
463
		successCh:       make(chan *runnerRef, 1),
464
		sessionDuration: &api.Duration{Duration: 2},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
465
466
	}
	finished := make(chan *LlmRequest)
467
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
468
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
469
470
471
472
473
474
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
475
476
	case err := <-req.errCh:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
477
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
478
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
479
480
481
482
483
484
485
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
486
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
502
503
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
504
505
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
506
507

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
508
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
509
510
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
511
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
512
513

	s.updateFreeSpace(gpus)
514
515
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
516
517
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "cuda",
			ID:      "0",
		},
		{
			Library: "cuda",
			ID:      "1",
		},
	}
	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loadedMu.Unlock()

	tmp := s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "1", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{gpus[1]}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "0", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
550
551
552
}

func TestFindRunnerToUnload(t *testing.T) {
553
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
554
	defer done()
555

Daniel Hiltgen's avatar
Daniel Hiltgen committed
556
557
	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
558
559

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
560
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
561
562
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
563
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
564

565
	resp := s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
566
567
	require.Equal(t, r2, resp)
	r2.refCount = 1
568
	resp = s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
569
570
571
572
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
573
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
574
575
	defer done()

576
	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
577
	do := api.DefaultOptions()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
578
	runner := &runnerRef{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
579
580
581
582
583
584
585
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
586
587
588
589
590
591
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
592
		opts: api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
593
594
595
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
596
	req.model.AdapterPaths = runner.model.AdapterPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
597
598
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
599
	req.model.ProjectorPaths = runner.model.ProjectorPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
600
601
602
603
604
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
Michael Yang's avatar
lint  
Michael Yang committed
605
	llm.pingResp = errors.New("foo")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
606
607
608
609
610
611
612
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
613
614
615
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
616
617
618
619
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
620
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
621
622
	defer done()

623
624
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
625
626
627
	s := InitScheduler(ctx)
	s.unloadAllRunners()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
628
629
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
630

Daniel Hiltgen's avatar
Daniel Hiltgen committed
631
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
632
633
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
634
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
635
636
637
638
639
640
641
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
642
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
643
644
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
645
646
647
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
648
	require.Nil(t, r2.model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
649
650
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
651
652
653
654
655
func TestAlreadyCanceled(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
656
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
Daniel Hiltgen's avatar
Daniel Hiltgen committed
657
658
659
660
661
662
663
664
665
666
667
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
// TestHomogeneousGPUs reports two GPUs from different libraries ("cuda" and
// "rocm"), each with too little free memory for the model on its own, and
// checks that the server is nevertheless created with exactly one GPU.
func TestHomogeneousGPUs(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)

	s.getGpuFn = func() gpu.GpuInfoList {
		// Set memory values to require the model to be spread
		gpus := []gpu.GpuInfo{
			{Library: "cuda"},
			{Library: "rocm"},
		}
		gpus[0].TotalMemory = 1 * format.GibiByte
		gpus[0].FreeMemory = 256 * format.MebiByte
		gpus[1].TotalMemory = 1 * format.GibiByte
		gpus[1].FreeMemory = 256 * format.MebiByte
		return gpus
	}
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		// This callback is invoked by the scheduler, not by the test
		// function, so it may run on another goroutine. FailNow-based
		// assertions are only valid on the test goroutine; Errorf is safe
		// from any goroutine and still fails the test.
		if len(gpus) != 1 {
			t.Errorf("expected the server to be created with exactly 1 GPU, got %d", len(gpus))
		}
		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
	}
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
707
type mockLlm struct {
708
709
710
	pingResp           error
	waitResp           error
	completionResp     error
711
	embedResp          *llm.EmbedResponse
712
	embedRespErr       error
713
714
715
716
717
718
719
720
721
	tokenizeResp       []int
	tokenizeRespErr    error
	detokenizeResp     string
	detonekizeRespErr  error
	closeResp          error
	closeCalled        bool
	estimatedVRAM      uint64
	estimatedTotal     uint64
	estimatedVRAMByGPU map[string]uint64
Daniel Hiltgen's avatar
Daniel Hiltgen committed
722
723
724
725
726
727
728
}

func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}
Michael Yang's avatar
lint  
Michael Yang committed
729

730
func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) {
731
	return s.embedResp, s.embedRespErr
Daniel Hiltgen's avatar
Daniel Hiltgen committed
732
}
Michael Yang's avatar
lint  
Michael Yang committed
733

Daniel Hiltgen's avatar
Daniel Hiltgen committed
734
735
736
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}
Michael Yang's avatar
lint  
Michael Yang committed
737

Daniel Hiltgen's avatar
Daniel Hiltgen committed
738
739
740
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detonekizeRespErr
}
Michael Yang's avatar
lint  
Michael Yang committed
741

Daniel Hiltgen's avatar
Daniel Hiltgen committed
742
743
744
745
func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}
746
747
func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
Daniel Hiltgen's avatar
Daniel Hiltgen committed
748
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }