sched_test.go 21.6 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
package server

import (
	"bytes"
	"context"
Michael Yang's avatar
lint  
Michael Yang committed
6
	"errors"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
7
8
9
10
11
	"log/slog"
	"os"
	"testing"
	"time"

Michael Yang's avatar
lint  
Michael Yang committed
12
13
	"github.com/stretchr/testify/require"

Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
15
16
17
18
19
20
	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
)

Michael Yang's avatar
lint  
Michael Yang committed
21
func TestMain(m *testing.M) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
22
23
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
Michael Yang's avatar
lint  
Michael Yang committed
24
	os.Exit(m.Run())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
25
26
27
28
29
30
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
31
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
32
	require.NotNil(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
35
36
}

func TestLoad(t *testing.T) {
37
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
38
39
	defer done()
	s := InitScheduler(ctx)
40
	var ggml *llm.GGML // value not used in tests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
41
42
43
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
44
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
45
46
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
47
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
48
49
	}
	// Fail to load model first
Daniel Hiltgen's avatar
Daniel Hiltgen committed
50
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Michael Yang's avatar
lint  
Michael Yang committed
51
		return nil, errors.New("something failed to load model blah")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
52
53
	}
	gpus := gpu.GpuInfoList{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
54
	s.load(req, ggml, gpus, 0)
Michael Yang's avatar
lint  
Michael Yang committed
55
	require.Empty(t, req.successCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
56
	require.Len(t, req.errCh, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
57
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
58
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
59
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
61
62
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

63
	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
64
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
65
66
		return server, nil
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
67
	s.load(req, ggml, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
68
69
70
71
72
73
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
74
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
75
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
78
79
	}

	req.model.ModelPath = "dummy_model_path"
Michael Yang's avatar
lint  
Michael Yang committed
80
	server.waitResp = errors.New("wait failure")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
81
	s.load(req, ggml, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
82
83
84
85
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
86
		t.Fatalf("unexpected success %v", resp)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
87
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
88
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
	runner := s.loaded["dummy_model_path"]
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
92
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
	time.Sleep(1 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
94
95
96
	require.Len(t, s.expiredCh, 1)
}

97
type reqBundle struct {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
98
99
100
101
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
102
	ggml    *llm.GGML
Daniel Hiltgen's avatar
Daniel Hiltgen committed
103
104
}

105
func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
106
107
108
	return scenario.srv, nil
}

109
110
111
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
112
113
114
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), modelName)
Michael Yang's avatar
lint  
Michael Yang committed
115
	require.NoError(t, err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
116
117
	defer f.Close()

Michael Yang's avatar
Michael Yang committed
118
	require.NoError(t, llm.WriteGGUF(f, llm.KV{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
119
120
121
122
123
124
125
126
127
		"general.architecture":          "llama",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
Michael Yang's avatar
Michael Yang committed
128
	}, []llm.Tensor{
129
130
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
Michael Yang's avatar
Michael Yang committed
131
	}))
Michael Yang's avatar
lint  
Michael Yang committed
132
	require.NoError(t, err)
133

Daniel Hiltgen's avatar
Daniel Hiltgen committed
134
135
	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
136
	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
137
	require.NoError(t, err)
138

139
140
141
142
143
	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
144
		model:           model,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
145
		opts:            api.DefaultOptions(),
146
		sessionDuration: duration,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
147
148
149
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
150
151
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
Daniel Hiltgen's avatar
Daniel Hiltgen committed
152
153
}

154
155
156
157
158
159
// getGpuFn returns a single fake "metal" GPU with 24 GB total and 12 GB free
// memory, for use as Scheduler.getGpuFn in tests.
func getGpuFn() gpu.GpuInfoList {
	metal := gpu.GpuInfo{Library: "metal"}
	metal.TotalMemory = 24 * format.GigaByte
	metal.FreeMemory = 12 * format.GigaByte
	return gpu.GpuInfoList{metal}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
160

161
162
163
164
165
166
// getCpuFn returns a single fake "cpu" device with 32 GB total and 26 GB free
// memory, for use as Scheduler.getCpuFn in tests.
func getCpuFn() gpu.GpuInfoList {
	cpu := gpu.GpuInfo{Library: "cpu"}
	cpu.TotalMemory = 32 * format.GigaByte
	cpu.FreeMemory = 26 * format.GigaByte
	return gpu.GpuInfoList{cpu}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
167

168
169
170
func TestRequestsSameModelSameRequest(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
171
	s := InitScheduler(ctx)
172
173
174
175
176
177
178
179
180
181
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
	b.req.model = a.req.model
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
182
183
184
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
185
186
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
187
		require.Empty(t, s.pendingReqCh)
188
189
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
190
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
191
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
192
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
193
194
195
	}

	// Same runner as first request due to not needing a reload
196
197
198
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
199
	select {
200
201
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
202
		require.Empty(t, s.pendingReqCh)
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

func TestRequestsSimpleReloadSameModel(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
	tmpModel := *a.req.model
	b.req.model = &tmpModel
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
234
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
235
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
236
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
237
238
239
	}

	// Trigger a reload
240
241
242
243
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
244
245
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
246
	a.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
247
	select {
248
249
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
250
		require.Empty(t, s.pendingReqCh)
251
252
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
253
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
254
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
255
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
256
	}
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
}

func TestRequestsMultipleLoadedModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn

	// Multiple loaded models
	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
Daniel Hiltgen's avatar
Daniel Hiltgen committed
272

Michael Yang's avatar
int  
Michael Yang committed
273
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
274
275
276
277
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
278
	select {
279
280
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
281
		require.Empty(t, s.pendingReqCh)
282
283
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
284
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
285
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
286
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
287
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
288
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
289
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
290
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
291

Michael Yang's avatar
int  
Michael Yang committed
292
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
293
294
295
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
296
	select {
297
298
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
299
		require.Empty(t, s.pendingReqCh)
300
301
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
302
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
303
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
304
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
305
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
306
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
307
	require.Len(t, s.loaded, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
308
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
309

Daniel Hiltgen's avatar
Daniel Hiltgen committed
310
	// This is a CPU load with NumGPU = 0 so it should load
311
312
313
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
314
	select {
315
316
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
Michael Yang's avatar
lint  
Michael Yang committed
317
		require.Empty(t, s.pendingReqCh)
318
319
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
320
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
321
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
322
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
323
324
325
326
327
328
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

	// Try to load a model that wont fit
329
330
	s.newServerFn = d.newServer
	slog.Info("d")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
331
332
333
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
334
	a.ctxDone() // Won't help since this one isn't big enough to make room
Daniel Hiltgen's avatar
Daniel Hiltgen committed
335
	time.Sleep(2 * time.Millisecond)
336
	s.pendingReqCh <- d.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
337
338
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
339
340
341
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
342
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
343
	select {
344
345
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
Michael Yang's avatar
lint  
Michael Yang committed
346
		require.Empty(t, s.pendingReqCh)
347
		require.Empty(t, d.req.errCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
348
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
349
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
350
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
351
352
353
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
354
355
}

356
357
358
359
360
361
362
func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
Michael Yang's avatar
int  
Michael Yang committed
363
	t.Setenv("OLLAMA_MAX_QUEUE", "1")
364
365
366
367
368
369
370
371
372
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
373
	require.Len(t, s.pendingReqCh, 1)
Michael Yang's avatar
lint  
Michael Yang committed
374
	require.Empty(t, successCh1b)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
375
376
377
378
379
380
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
381
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
382
383
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
384
385
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
386
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
387
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
388
	}
389
	a.ctxDone() // Set "a" model to idle so it can unload
Daniel Hiltgen's avatar
Daniel Hiltgen committed
390
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
391
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
392
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
393

394
395
396
	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
397
	// Starts in pending channel, then should be quickly processsed to return an error
398
	time.Sleep(20 * time.Millisecond) // Long enough for the "a" model to expire and unload
Michael Yang's avatar
lint  
Michael Yang committed
399
	require.Empty(t, successCh1c)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
400
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
401
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
402
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
403
404
405
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
406
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
407
408
409
410
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
411
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
412
413
414
	defer done()

	// Same model, same request
415
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
416
417
418
419
420
421
422
423
424
425
426
427
428
429
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
430
431
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
432
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
433
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
434
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
435
436
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
437
438
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
439
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
440
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
441
	}
442
	time.Sleep(scenario1a.req.sessionDuration.Duration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
443
444
445
446
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
Michael Yang's avatar
lint  
Michael Yang committed
447
	require.Empty(t, s.finishedReqCh)
448
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
449
	require.Empty(t, s.loaded)
450
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
451
452
453
454
455
456
457

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
458
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
459
460
	req := &LlmRequest{
		ctx:             ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
461
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
462
		successCh:       make(chan *runnerRef, 1),
463
		sessionDuration: &api.Duration{Duration: 2},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
464
465
	}
	finished := make(chan *LlmRequest)
466
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
467
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
468
469
470
471
472
473
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
474
475
	case err := <-req.errCh:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
476
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
477
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
478
479
480
481
482
483
484
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
485
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
501
502
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
503
504
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
505
506

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
507
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
508
509
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
510
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
511
512

	s.updateFreeSpace(gpus)
513
514
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
515
516
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "cuda",
			ID:      "0",
		},
		{
			Library: "cuda",
			ID:      "1",
		},
	}
	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loadedMu.Unlock()

	tmp := s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "1", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{gpus[1]}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "0", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
549
550
551
}

func TestFindRunnerToUnload(t *testing.T) {
552
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
553
	defer done()
554

Daniel Hiltgen's avatar
Daniel Hiltgen committed
555
556
	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
557
558

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
559
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
560
561
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
562
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
563

564
	resp := s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
565
566
	require.Equal(t, r2, resp)
	r2.refCount = 1
567
	resp = s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
568
569
570
571
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
572
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
573
574
	defer done()

575
	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
576
	do := api.DefaultOptions()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
577
	runner := &runnerRef{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
578
579
580
581
582
583
584
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
585
586
587
588
589
590
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
591
		opts: api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
592
593
594
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
595
	req.model.AdapterPaths = runner.model.AdapterPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
596
597
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
598
	req.model.ProjectorPaths = runner.model.ProjectorPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
599
600
601
602
603
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
Michael Yang's avatar
lint  
Michael Yang committed
604
	llm.pingResp = errors.New("foo")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
605
606
607
608
609
610
611
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
612
613
614
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
615
616
617
618
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
619
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
620
621
	defer done()

622
623
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
624
625
626
	s := InitScheduler(ctx)
	s.unloadAllRunners()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
627
628
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
629

Daniel Hiltgen's avatar
Daniel Hiltgen committed
630
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
631
632
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
633
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
634
635
636
637
638
639
640
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
641
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
642
643
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
644
645
646
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
647
	require.Nil(t, r2.model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
648
649
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
650
651
652
653
654
func TestAlreadyCanceled(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
655
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
Daniel Hiltgen's avatar
Daniel Hiltgen committed
656
657
658
659
660
661
662
663
664
665
666
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
// TestHomogeneousGPUs reports two GPUs from different libraries ("cuda" and
// "rocm") and asserts that the server for a request is created with exactly
// one GPU, i.e. that the load is not spread across the mixed libraries.
func TestHomogeneousGPUs(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)

	s.getGpuFn = func() gpu.GpuInfoList {
		// Set memory values to require the model to be spread
		gpus := []gpu.GpuInfo{
			{Library: "cuda"},
			{Library: "rocm"},
		}
		gpus[0].TotalMemory = 1 * format.GibiByte
		gpus[0].FreeMemory = 256 * format.MebiByte
		gpus[1].TotalMemory = 1 * format.GibiByte
		gpus[1].FreeMemory = 256 * format.MebiByte
		return gpus
	}
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		// This callback is invoked by the scheduler, which may not be on the
		// goroutine running the test. FailNow-based assertions (require.*)
		// are only valid on the test goroutine, so report with t.Errorf.
		if len(gpus) != 1 {
			t.Errorf("expected server to be created with exactly 1 GPU, got %d", len(gpus))
		}
		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
	}
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
706
type mockLlm struct {
707
708
709
	pingResp           error
	waitResp           error
	completionResp     error
710
711
	embeddingResp      []float32
	embeddingRespErr   error
712
713
714
715
716
717
718
719
720
	tokenizeResp       []int
	tokenizeRespErr    error
	detokenizeResp     string
	detonekizeRespErr  error
	closeResp          error
	closeCalled        bool
	estimatedVRAM      uint64
	estimatedTotal     uint64
	estimatedVRAMByGPU map[string]uint64
Daniel Hiltgen's avatar
Daniel Hiltgen committed
721
722
723
724
725
726
727
}

func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}
Michael Yang's avatar
lint  
Michael Yang committed
728

729
730
func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) {
	return s.embeddingResp, s.embeddingRespErr
Daniel Hiltgen's avatar
Daniel Hiltgen committed
731
}
Michael Yang's avatar
lint  
Michael Yang committed
732

Daniel Hiltgen's avatar
Daniel Hiltgen committed
733
734
735
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}
Michael Yang's avatar
lint  
Michael Yang committed
736

Daniel Hiltgen's avatar
Daniel Hiltgen committed
737
738
739
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detonekizeRespErr
}
Michael Yang's avatar
lint  
Michael Yang committed
740

Daniel Hiltgen's avatar
Daniel Hiltgen committed
741
742
743
744
func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}
745
746
func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
Daniel Hiltgen's avatar
Daniel Hiltgen committed
747
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }