sched_test.go 22.8 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
package server

import (
	"bytes"
	"context"
Michael Yang's avatar
lint  
Michael Yang committed
6
	"errors"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
7
8
9
10
11
	"log/slog"
	"os"
	"testing"
	"time"

Michael Yang's avatar
lint  
Michael Yang committed
12
13
	"github.com/stretchr/testify/require"

Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
15
	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
16
	"github.com/ollama/ollama/discover"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
17
	"github.com/ollama/ollama/format"
Michael Yang's avatar
Michael Yang committed
18
	"github.com/ollama/ollama/fs/ggml"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
19
20
21
	"github.com/ollama/ollama/llm"
)

Michael Yang's avatar
lint  
Michael Yang committed
22
func TestMain(m *testing.M) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
23
24
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
Michael Yang's avatar
lint  
Michael Yang committed
25
	os.Exit(m.Run())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
26
27
28
}

func TestInitScheduler(t *testing.T) {
29
	ctx, done := context.WithCancel(t.Context())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
30
31
	defer done()
	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
32
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
	require.NotNil(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
35
36
37
}

func TestLoad(t *testing.T) {
38
	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
39
40
	defer done()
	s := InitScheduler(ctx)
Michael Yang's avatar
Michael Yang committed
41
	var f *ggml.GGML // value not used in tests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
42
43
44
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
45
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
47
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
48
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
49
50
	}
	// Fail to load model first
Michael Yang's avatar
Michael Yang committed
51
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Michael Yang's avatar
lint  
Michael Yang committed
52
		return nil, errors.New("something failed to load model blah")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
53
	}
54
	gpus := discover.GpuInfoList{}
Michael Yang's avatar
Michael Yang committed
55
	s.load(req, f, gpus, 0)
Michael Yang's avatar
lint  
Michael Yang committed
56
	require.Empty(t, req.successCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
57
	require.Len(t, req.errCh, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
58
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
59
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
61
62
63
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

64
	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
Michael Yang's avatar
Michael Yang committed
65
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
66
67
		return server, nil
	}
Michael Yang's avatar
Michael Yang committed
68
	s.load(req, f, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
69
70
71
72
73
74
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
75
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
79
80
	}

	req.model.ModelPath = "dummy_model_path"
Michael Yang's avatar
lint  
Michael Yang committed
81
	server.waitResp = errors.New("wait failure")
Michael Yang's avatar
Michael Yang committed
82
	s.load(req, f, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
83
84
85
86
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
87
		t.Fatalf("unexpected success %v", resp)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
88
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
	runner := s.loaded["dummy_model_path"]
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
93
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
94
	time.Sleep(1 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
95
96
97
	require.Len(t, s.expiredCh, 1)
}

98
type reqBundle struct {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
99
100
101
102
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
Michael Yang's avatar
Michael Yang committed
103
	f       *ggml.GGML
Daniel Hiltgen's avatar
Daniel Hiltgen committed
104
105
}

Michael Yang's avatar
Michael Yang committed
106
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
107
108
109
	return scenario.srv, nil
}

110
111
112
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
113
114
	t.Helper()

115
	p, _ := createBinFile(t, ggml.KV{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
116
117
118
119
120
121
122
123
124
		"general.architecture":          "llama",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
125
	}, []*ggml.Tensor{
126
127
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
128
	})
129

130
131
132
133
134
135
	model := &Model{Name: modelName, ModelPath: p}
	f, err := llm.LoadModel(model.ModelPath, 0)
	if err != nil {
		t.Fatal(err)
	}
	b.f = f
136
137
138
139
140
	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
141
		model:           model,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
142
		opts:            api.DefaultOptions(),
143
		sessionDuration: duration,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
144
145
146
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
147
148
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
Daniel Hiltgen's avatar
Daniel Hiltgen committed
149
150
}

151
152
// getGpuFn is a stand-in GPU discovery function for the scheduler tests. It
// reports a single "metal" device with 24GB total and 12GB free memory.
func getGpuFn() discover.GpuInfoList {
	metal := discover.GpuInfo{Library: "metal"}
	metal.FreeMemory = 12 * format.GigaByte
	metal.TotalMemory = 24 * format.GigaByte
	return discover.GpuInfoList{metal}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
157

158
159
// getCpuFn is a stand-in CPU discovery function for the scheduler tests. It
// reports a single "cpu" entry with 32GB total and 26GB free memory.
func getCpuFn() discover.GpuInfoList {
	cpu := discover.GpuInfo{Library: "cpu"}
	cpu.FreeMemory = 26 * format.GigaByte
	cpu.TotalMemory = 32 * format.GigaByte
	return discover.GpuInfoList{cpu}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
164

165
func TestRequestsSameModelSameRequest(t *testing.T) {
166
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
167
	defer done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
168
	s := InitScheduler(ctx)
169
170
171
172
173
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
	b.req.model = a.req.model
Michael Yang's avatar
Michael Yang committed
174
	b.f = a.f
175
176
177
178

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
179
180
181
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
182
183
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
184
		require.Empty(t, s.pendingReqCh)
185
186
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
187
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
188
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
189
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
190
191
192
	}

	// Same runner as first request due to not needing a reload
193
194
195
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
196
	select {
197
198
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
199
		require.Empty(t, s.pendingReqCh)
200
201
202
203
204
205
206
207
208
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

func TestRequestsSimpleReloadSameModel(t *testing.T) {
209
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
210
211
212
213
214
215
216
217
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
	tmpModel := *a.req.model
	b.req.model = &tmpModel
Michael Yang's avatar
Michael Yang committed
218
	b.f = a.f
219
220
221
222
223
224
225
226
227
228
229
230

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
231
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
232
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
233
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
234
235
236
	}

	// Trigger a reload
237
238
239
240
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
241
242
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
243
	a.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
244
	select {
245
246
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
247
		require.Empty(t, s.pendingReqCh)
248
249
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
250
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
251
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
252
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
253
	}
254
255
256
}

func TestRequestsMultipleLoadedModels(t *testing.T) {
257
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
258
259
260
261
262
263
264
265
266
267
268
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn

	// Multiple loaded models
	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
Daniel Hiltgen's avatar
Daniel Hiltgen committed
269

Michael Yang's avatar
int  
Michael Yang committed
270
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
271
272
273
274
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
275
	select {
276
277
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
278
		require.Empty(t, s.pendingReqCh)
279
280
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
281
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
282
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
283
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
284
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
285
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
286
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
287
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
288

Michael Yang's avatar
int  
Michael Yang committed
289
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
290
291
292
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
293
	select {
294
295
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
296
		require.Empty(t, s.pendingReqCh)
297
298
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
299
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
300
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
301
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
302
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
303
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
304
	require.Len(t, s.loaded, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
305
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
306

Daniel Hiltgen's avatar
Daniel Hiltgen committed
307
	// This is a CPU load with NumGPU = 0 so it should load
308
309
310
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
311
	select {
312
313
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
Michael Yang's avatar
lint  
Michael Yang committed
314
		require.Empty(t, s.pendingReqCh)
315
316
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
317
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
318
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
319
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
320
321
322
323
324
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

325
	// Try to load a model that won't fit
326
327
	s.newServerFn = d.newServer
	slog.Info("d")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
328
329
330
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
331
	a.ctxDone() // Won't help since this one isn't big enough to make room
Daniel Hiltgen's avatar
Daniel Hiltgen committed
332
	time.Sleep(2 * time.Millisecond)
333
	s.pendingReqCh <- d.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
334
335
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
336
337
338
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
339
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
340
	select {
341
342
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
Michael Yang's avatar
lint  
Michael Yang committed
343
		require.Empty(t, s.pendingReqCh)
344
		require.Empty(t, d.req.errCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
345
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
346
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
347
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
348
349
350
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
351
352
}

353
func TestGetRunner(t *testing.T) {
354
	ctx, done := context.WithTimeout(t.Context(), 3*time.Second)
355
356
357
358
359
	defer done()

	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
Michael Yang's avatar
int  
Michael Yang committed
360
	t.Setenv("OLLAMA_MAX_QUEUE", "1")
361
362
363
364
365
366
367
368
369
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
370
	require.Len(t, s.pendingReqCh, 1)
Michael Yang's avatar
lint  
Michael Yang committed
371
	require.Empty(t, successCh1b)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
372
373
374
375
376
377
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
378
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
379
380
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
381
382
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
383
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
384
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
385
	}
386
	a.ctxDone() // Set "a" model to idle so it can unload
Daniel Hiltgen's avatar
Daniel Hiltgen committed
387
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
388
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
389
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
390

391
392
393
	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
394
	// Starts in pending channel, then should be quickly processed to return an error
395
	time.Sleep(50 * time.Millisecond) // Long enough for the "a" model to expire and unload
Michael Yang's avatar
lint  
Michael Yang committed
396
	require.Empty(t, successCh1c)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
397
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
398
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
399
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
400
401
402
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
403
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
404
405
}

Patrick Devine's avatar
Patrick Devine committed
406
func TestExpireRunner(t *testing.T) {
407
	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
Patrick Devine's avatar
Patrick Devine committed
408
409
410
411
412
413
414
415
416
417
418
	defer done()
	s := InitScheduler(ctx)
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
	}

Michael Yang's avatar
Michael Yang committed
419
	var f *ggml.GGML
420
	gpus := discover.GpuInfoList{}
Patrick Devine's avatar
Patrick Devine committed
421
	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
Michael Yang's avatar
Michael Yang committed
422
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Patrick Devine's avatar
Patrick Devine committed
423
424
		return server, nil
	}
Michael Yang's avatar
Michael Yang committed
425
	s.load(req, f, gpus, 0)
Patrick Devine's avatar
Patrick Devine committed
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451

	select {
	case err := <-req.errCh:
		if err != nil {
			t.Fatalf("expected no errors when loading, got '%s'", err.Error())
		}
	case resp := <-req.successCh:
		s.loadedMu.Lock()
		if resp.refCount != uint(1) || len(s.loaded) != 1 {
			t.Fatalf("expected a model to be loaded")
		}
		s.loadedMu.Unlock()
	}

	s.expireRunner(&Model{ModelPath: "foo"})

	s.finishedReqCh <- req
	s.processCompleted(ctx)

	s.loadedMu.Lock()
	if len(s.loaded) != 0 {
		t.Fatalf("expected model to be unloaded")
	}
	s.loadedMu.Unlock()
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
452
453
// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
454
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
455
456
457
	defer done()

	// Same model, same request
458
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
459
	s := InitScheduler(ctx)
460
461
	s.getGpuFn = func() discover.GpuInfoList {
		g := discover.GpuInfo{Library: "metal"}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
462
463
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
464
		return []discover.GpuInfo{g}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
465
466
467
468
469
470
471
472
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
473
474
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
475
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
476
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
477
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
478
479
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
480
481
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
482
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
483
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
484
	}
485
	time.Sleep(scenario1a.req.sessionDuration.Duration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
486
487
488
489
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
Michael Yang's avatar
lint  
Michael Yang committed
490
	require.Empty(t, s.finishedReqCh)
491
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
492
	require.Empty(t, s.loaded)
493
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
494
495
496
497
498
499
500

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
501
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
502
503
	req := &LlmRequest{
		ctx:             ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
504
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
505
		successCh:       make(chan *runnerRef, 1),
506
		sessionDuration: &api.Duration{Duration: 2},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
507
508
	}
	finished := make(chan *LlmRequest)
509
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
510
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
511
512
513
514
515
516
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
517
518
	case err := <-req.errCh:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
519
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
520
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
521
522
523
524
525
526
527
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
528
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
529
	defer done()
530
	gpus := discover.GpuInfoList{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
531
532
533
534
535
536
537
538
539
540
541
542
543
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
544
545
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
546
547
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
548
549

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
550
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
551
552
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
553
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
554
555

	s.updateFreeSpace(gpus)
556
557
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
558
559
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
560
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
561
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
562
	defer done()
563
	gpus := discover.GpuInfoList{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
564
565
566
567
568
569
570
571
572
		{
			Library: "cuda",
			ID:      "0",
		},
		{
			Library: "cuda",
			ID:      "1",
		},
	}
573
	r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
574
575
576
577
578
579
580
581
582
583

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loadedMu.Unlock()

	tmp := s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "1", tmp[0].ID)

584
	r1.gpus = discover.GpuInfoList{gpus[1]}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
585
586
587
588
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "0", tmp[0].ID)

589
	r1.gpus = discover.GpuInfoList{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
590
591
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
592
593
594
}

func TestFindRunnerToUnload(t *testing.T) {
595
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
596
	defer done()
597

Daniel Hiltgen's avatar
Daniel Hiltgen committed
598
599
	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
600
601

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
602
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
603
604
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
605
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
606

607
	resp := s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
608
609
	require.Equal(t, r2, resp)
	r2.refCount = 1
610
	resp = s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
611
612
613
614
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
615
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
616
617
	defer done()

618
	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
619
	do := api.DefaultOptions()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
620
	runner := &runnerRef{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
621
622
623
624
625
626
627
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
628
629
630
631
632
633
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
634
		opts: api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
635
636
637
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
638
	req.model.AdapterPaths = runner.model.AdapterPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
639
640
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
641
	req.model.ProjectorPaths = runner.model.ProjectorPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
642
643
644
645
646
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
Michael Yang's avatar
lint  
Michael Yang committed
647
	llm.pingResp = errors.New("foo")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
648
649
650
651
652
653
654
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
655
656
657
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
658
659
660
661
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
662
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
663
664
	defer done()

665
666
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
667
668
669
	s := InitScheduler(ctx)
	s.unloadAllRunners()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
670
671
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
672

Daniel Hiltgen's avatar
Daniel Hiltgen committed
673
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
674
675
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
676
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
677
678
679
680
681
682
683
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
684
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
685
686
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
687
688
689
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
690
	require.Nil(t, r2.model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
691
692
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
693
func TestAlreadyCanceled(t *testing.T) {
694
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
695
696
697
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
698
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
Daniel Hiltgen's avatar
Daniel Hiltgen committed
699
700
701
702
703
704
705
706
707
708
709
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

710
func TestHomogeneousGPUs(t *testing.T) {
711
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
712
713
714
	defer done()
	s := InitScheduler(ctx)

715
	s.getGpuFn = func() discover.GpuInfoList {
716
		// Set memory values to require the model to be spread
717
		gpus := []discover.GpuInfo{
718
719
720
721
722
723
724
725
726
727
728
			{Library: "cuda"},
			{Library: "rocm"},
		}
		gpus[0].TotalMemory = 1 * format.GibiByte
		gpus[0].FreeMemory = 256 * format.MebiByte
		gpus[1].TotalMemory = 1 * format.GibiByte
		gpus[1].FreeMemory = 256 * format.MebiByte
		return gpus
	}
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
Michael Yang's avatar
Michael Yang committed
729
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
730
		require.Len(t, gpus, 1)
Michael Yang's avatar
Michael Yang committed
731
		return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
	}
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
749
// mockLlm is a scriptable test double that the scheduler tests plug in as a
// runner's server. Each method returns the matching canned field below; Close
// additionally records that it was called.
type mockLlm struct {
	// Canned errors for Ping, WaitUntilRunning and Completion.
	pingResp       error
	waitResp       error
	completionResp error

	// Canned result for Embedding.
	embeddingResp    []float32
	embeddingRespErr error

	// Canned results for Tokenize and Detokenize.
	tokenizeResp      []int
	tokenizeRespErr   error
	detokenizeResp    string
	detonekizeRespErr error

	// closeResp is returned by Close; closeCalled is set once Close has run.
	closeResp   error
	closeCalled bool

	// Values reported by the Estimated* accessors.
	estimatedVRAM      uint64
	estimatedTotal     uint64
	estimatedVRAMByGPU map[string]uint64
}

// Ping returns the canned pingResp; ctx is ignored.
func (s *mockLlm) Ping(ctx context.Context) error {
	return s.pingResp
}
// WaitUntilRunning returns the canned waitResp immediately; ctx is ignored.
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error {
	return s.waitResp
}
// Completion returns the canned completionResp. It never invokes fn and
// ignores both ctx and req.
func (s *mockLlm) Completion(
	ctx context.Context,
	req llm.CompletionRequest,
	fn func(llm.CompletionResponse),
) error {
	return s.completionResp
}
Michael Yang's avatar
lint  
Michael Yang committed
771

772
773
func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) {
	return s.embeddingResp, s.embeddingRespErr
Daniel Hiltgen's avatar
Daniel Hiltgen committed
774
}
Michael Yang's avatar
lint  
Michael Yang committed
775

Daniel Hiltgen's avatar
Daniel Hiltgen committed
776
777
778
// Tokenize returns the canned tokenizeResp and tokenizeRespErr; ctx and
// content are ignored.
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	toks, err := s.tokenizeResp, s.tokenizeRespErr
	return toks, err
}
Michael Yang's avatar
lint  
Michael Yang committed
779

Daniel Hiltgen's avatar
Daniel Hiltgen committed
780
781
782
// Detokenize returns the canned detokenizeResp and detonekizeRespErr; ctx and
// tokens are ignored.
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	text, err := s.detokenizeResp, s.detonekizeRespErr
	return text, err
}
Michael Yang's avatar
lint  
Michael Yang committed
783

Daniel Hiltgen's avatar
Daniel Hiltgen committed
784
785
786
787
// Close sets closeCalled so tests can observe the call, then returns the
// canned closeResp.
func (s *mockLlm) Close() error {
	result := s.closeResp
	s.closeCalled = true
	return result
}
788
789
// EstimatedVRAM returns the canned estimatedVRAM value.
func (s *mockLlm) EstimatedVRAM() uint64 {
	return s.estimatedVRAM
}
// EstimatedTotal returns the canned estimatedTotal value.
func (s *mockLlm) EstimatedTotal() uint64 {
	return s.estimatedTotal
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
790
// EstimatedVRAMByGPU looks gpuid up in the canned estimatedVRAMByGPU map. An
// ID that is not in the map (or a nil map) yields 0.
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 {
	vram := s.estimatedVRAMByGPU[gpuid]
	return vram
}
791
// Pid always returns -1.
func (s *mockLlm) Pid() int {
	return -1
}