package server

import (
	"bytes"
	"context"
	"errors"
	"log/slog"
	"os"
	"testing"
	"time"

	"github.com/stretchr/testify/require"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/discover"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/llm"
)

func TestMain(m *testing.M) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
23
24
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
Michael Yang's avatar
lint  
Michael Yang committed
25
	os.Exit(m.Run())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
26
27
28
}

func TestInitScheduler(t *testing.T) {
29
	ctx, done := context.WithCancel(t.Context())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
30
31
	defer done()
	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
32
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
	require.NotNil(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
35
36
37
}

func TestLoad(t *testing.T) {
38
	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
39
40
	defer done()
	s := InitScheduler(ctx)
Michael Yang's avatar
Michael Yang committed
41
	var f *ggml.GGML // value not used in tests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
42
43
44
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
45
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
47
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
48
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
49
50
	}
	// Fail to load model first
Michael Yang's avatar
Michael Yang committed
51
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Michael Yang's avatar
lint  
Michael Yang committed
52
		return nil, errors.New("something failed to load model blah")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
53
	}
54
	gpus := discover.GpuInfoList{}
Jesse Gross's avatar
Jesse Gross committed
55
	s.load(req, f, gpus, false)
Michael Yang's avatar
lint  
Michael Yang committed
56
	require.Empty(t, req.successCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
57
	require.Len(t, req.errCh, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
58
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
59
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
61
62
63
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

Jesse Gross's avatar
Jesse Gross committed
64
	server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
Michael Yang's avatar
Michael Yang committed
65
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Jesse Gross's avatar
Jesse Gross committed
66
		server.modelPath = model
Daniel Hiltgen's avatar
Daniel Hiltgen committed
67
68
		return server, nil
	}
Jesse Gross's avatar
Jesse Gross committed
69
	s.load(req, f, gpus, false)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
70
71
72
73
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
Jesse Gross's avatar
Jesse Gross committed
74
		require.Equal(t, uint64(10), resp.vramSize)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
75
		require.Equal(t, uint(1), resp.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
79
80
81
	}

	req.model.ModelPath = "dummy_model_path"
Michael Yang's avatar
lint  
Michael Yang committed
82
	server.waitResp = errors.New("wait failure")
Jesse Gross's avatar
Jesse Gross committed
83
	s.load(req, f, gpus, false)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
84
85
86
87
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
88
		t.Fatalf("unexpected success %v", resp)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
	runner := s.loaded["dummy_model_path"]
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
94
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
95
	time.Sleep(1 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
96
97
98
	require.Len(t, s.expiredCh, 1)
}

99
type reqBundle struct {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
100
101
102
103
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
Michael Yang's avatar
Michael Yang committed
104
	f       *ggml.GGML
Daniel Hiltgen's avatar
Daniel Hiltgen committed
105
106
}

Michael Yang's avatar
Michael Yang committed
107
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Jesse Gross's avatar
Jesse Gross committed
108
	scenario.srv.modelPath = model
Daniel Hiltgen's avatar
Daniel Hiltgen committed
109
110
111
	return scenario.srv, nil
}

Jesse Gross's avatar
Jesse Gross committed
112
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vramSize uint64, duration *api.Duration) *reqBundle {
113
114
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
115
116
	t.Helper()

117
	p, _ := createBinFile(t, ggml.KV{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
118
119
120
121
122
123
124
125
126
		"general.architecture":          "llama",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
127
	}, []*ggml.Tensor{
128
129
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
130
	})
131

132
133
134
135
136
137
	model := &Model{Name: modelName, ModelPath: p}
	f, err := llm.LoadModel(model.ModelPath, 0)
	if err != nil {
		t.Fatal(err)
	}
	b.f = f
138
139
140
141
142
	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
143
		model:           model,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
144
		opts:            api.DefaultOptions(),
145
		sessionDuration: duration,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
146
147
148
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
Jesse Gross's avatar
Jesse Gross committed
149
	b.srv = &mockLlm{vramSize: vramSize, vramByGPU: map[string]uint64{"": vramSize}}
150
	return b
Daniel Hiltgen's avatar
Daniel Hiltgen committed
151
152
}

153
154
func getGpuFn() discover.GpuInfoList {
	g := discover.GpuInfo{Library: "metal"}
155
156
	g.TotalMemory = 24 * format.GigaByte
	g.FreeMemory = 12 * format.GigaByte
157
	return []discover.GpuInfo{g}
158
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
159

160
161
func getCpuFn() discover.GpuInfoList {
	g := discover.GpuInfo{Library: "cpu"}
162
163
	g.TotalMemory = 32 * format.GigaByte
	g.FreeMemory = 26 * format.GigaByte
164
	return []discover.GpuInfo{g}
165
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
166

167
func TestRequestsSameModelSameRequest(t *testing.T) {
168
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
169
	defer done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
170
	s := InitScheduler(ctx)
171
172
173
174
175
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
	b.req.model = a.req.model
Michael Yang's avatar
Michael Yang committed
176
	b.f = a.f
177
178
179
180

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
181
182
183
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
184
185
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
186
		require.Empty(t, s.pendingReqCh)
187
188
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
189
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
190
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
191
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
192
193
194
	}

	// Same runner as first request due to not needing a reload
195
196
197
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
198
	select {
199
200
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
201
		require.Empty(t, s.pendingReqCh)
202
203
204
205
206
207
208
209
210
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

func TestRequestsSimpleReloadSameModel(t *testing.T) {
211
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
212
213
214
215
216
217
218
219
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
	tmpModel := *a.req.model
	b.req.model = &tmpModel
Michael Yang's avatar
Michael Yang committed
220
	b.f = a.f
221
222
223
224
225
226
227
228
229
230
231
232

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
233
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
234
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
235
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
236
237
238
	}

	// Trigger a reload
239
240
241
242
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
243
244
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
245
	a.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
246
	select {
247
248
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
249
		require.Empty(t, s.pendingReqCh)
250
251
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
252
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
253
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
254
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
255
	}
256
257
258
}

func TestRequestsMultipleLoadedModels(t *testing.T) {
259
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
260
261
262
263
264
265
266
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn

	// Multiple loaded models
	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
Jesse Gross's avatar
Jesse Gross committed
267
268
269
270
	b := newScenarioRequest(t, ctx, "ollama-model-3b", 10*format.GigaByte, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-4a", 10*format.GigaByte, nil)
	c.req.opts.NumGPU = 0                                                       // CPU load, will be allowed
	d := newScenarioRequest(t, ctx, "ollama-model-3c", 10*format.GigaByte, nil) // Needs prior unloaded
Daniel Hiltgen's avatar
Daniel Hiltgen committed
271

Michael Yang's avatar
int  
Michael Yang committed
272
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
273
274
275
276
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
277
	select {
278
279
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
280
		require.Empty(t, s.pendingReqCh)
281
282
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
283
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
284
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
285
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
286
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
287
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
288
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
289
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
290

Michael Yang's avatar
int  
Michael Yang committed
291
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
292
293
294
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
295
	select {
296
297
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
298
		require.Empty(t, s.pendingReqCh)
299
300
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
301
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
302
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
303
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
304
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
305
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
306
	require.Len(t, s.loaded, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
307
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
308

Daniel Hiltgen's avatar
Daniel Hiltgen committed
309
	// This is a CPU load with NumGPU = 0 so it should load
310
311
312
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
313
	select {
314
315
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
Michael Yang's avatar
lint  
Michael Yang committed
316
		require.Empty(t, s.pendingReqCh)
317
318
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
319
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
320
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
321
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
322
323
324
325
326
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

327
	// Try to load a model that won't fit
328
329
	s.newServerFn = d.newServer
	slog.Info("d")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
330
331
332
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
333
	a.ctxDone() // Won't help since this one isn't big enough to make room
Daniel Hiltgen's avatar
Daniel Hiltgen committed
334
	time.Sleep(2 * time.Millisecond)
335
	s.pendingReqCh <- d.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
336
337
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
338
339
340
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
341
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
342
	select {
343
344
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
Michael Yang's avatar
lint  
Michael Yang committed
345
		require.Empty(t, s.pendingReqCh)
346
		require.Empty(t, d.req.errCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
347
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
348
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
349
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
350
351
352
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
353
354
}

355
func TestGetRunner(t *testing.T) {
356
	ctx, done := context.WithTimeout(t.Context(), 3*time.Second)
357
358
359
360
361
	defer done()

	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
Michael Yang's avatar
int  
Michael Yang committed
362
	t.Setenv("OLLAMA_MAX_QUEUE", "1")
363
364
365
366
367
368
369
370
371
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
372
	require.Len(t, s.pendingReqCh, 1)
Michael Yang's avatar
lint  
Michael Yang committed
373
	require.Empty(t, successCh1b)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
374
375
376
377
378
379
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
380
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
381
382
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
383
384
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
385
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
386
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
387
	}
388
	a.ctxDone() // Set "a" model to idle so it can unload
Daniel Hiltgen's avatar
Daniel Hiltgen committed
389
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
390
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
391
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
392

393
394
395
	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
396
	// Starts in pending channel, then should be quickly processed to return an error
397
	time.Sleep(50 * time.Millisecond) // Long enough for the "a" model to expire and unload
Michael Yang's avatar
lint  
Michael Yang committed
398
	require.Empty(t, successCh1c)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
399
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
400
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
401
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
402
403
404
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
405
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
406
407
}

Patrick Devine's avatar
Patrick Devine committed
408
func TestExpireRunner(t *testing.T) {
409
	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
Patrick Devine's avatar
Patrick Devine committed
410
411
412
413
414
415
416
417
418
419
420
	defer done()
	s := InitScheduler(ctx)
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
	}

Michael Yang's avatar
Michael Yang committed
421
	var f *ggml.GGML
422
	gpus := discover.GpuInfoList{}
Jesse Gross's avatar
Jesse Gross committed
423
	server := &mockLlm{vramSize: 10, vramByGPU: map[string]uint64{}}
Michael Yang's avatar
Michael Yang committed
424
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Jesse Gross's avatar
Jesse Gross committed
425
		server.modelPath = model
Patrick Devine's avatar
Patrick Devine committed
426
427
		return server, nil
	}
Jesse Gross's avatar
Jesse Gross committed
428
	s.load(req, f, gpus, false)
Patrick Devine's avatar
Patrick Devine committed
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454

	select {
	case err := <-req.errCh:
		if err != nil {
			t.Fatalf("expected no errors when loading, got '%s'", err.Error())
		}
	case resp := <-req.successCh:
		s.loadedMu.Lock()
		if resp.refCount != uint(1) || len(s.loaded) != 1 {
			t.Fatalf("expected a model to be loaded")
		}
		s.loadedMu.Unlock()
	}

	s.expireRunner(&Model{ModelPath: "foo"})

	s.finishedReqCh <- req
	s.processCompleted(ctx)

	s.loadedMu.Lock()
	if len(s.loaded) != 0 {
		t.Fatalf("expected model to be unloaded")
	}
	s.loadedMu.Unlock()
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
455
456
// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
457
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
458
459
460
	defer done()

	// Same model, same request
461
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
462
	s := InitScheduler(ctx)
463
464
	s.getGpuFn = func() discover.GpuInfoList {
		g := discover.GpuInfo{Library: "metal"}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
465
466
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
467
		return []discover.GpuInfo{g}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
468
469
470
471
472
473
474
475
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
476
477
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
478
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
479
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
480
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
481
482
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
483
484
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
485
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
486
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
487
	}
488
	time.Sleep(scenario1a.req.sessionDuration.Duration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
489
490
491
492
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
Michael Yang's avatar
lint  
Michael Yang committed
493
	require.Empty(t, s.finishedReqCh)
494
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
495
	require.Empty(t, s.loaded)
496
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
497
498
499
500
501
502
503

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
504
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
505
506
	req := &LlmRequest{
		ctx:             ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
507
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
508
		successCh:       make(chan *runnerRef, 1),
509
		sessionDuration: &api.Duration{Duration: 2},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
510
511
	}
	finished := make(chan *LlmRequest)
Jesse Gross's avatar
Jesse Gross committed
512
	llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
513
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
514
515
516
517
518
519
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
520
521
	case err := <-req.errCh:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
522
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
523
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
524
525
526
527
528
529
530
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
531
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
532
	defer done()
533
	gpus := discover.GpuInfoList{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
534
535
536
537
538
539
540
541
542
543
544
545
546
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
Jesse Gross's avatar
Jesse Gross committed
547
548
	llm1 := &mockLlm{vramByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{vramByGPU: map[string]uint64{"1": 125, "2": 75}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
549
550
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
551
552

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
553
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
554
555
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
556
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
557
558

	s.updateFreeSpace(gpus)
559
560
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
561
562
563
}

func TestFindRunnerToUnload(t *testing.T) {
564
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
565
	defer done()
566

Daniel Hiltgen's avatar
Daniel Hiltgen committed
567
568
	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
569
570

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
571
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
572
573
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
574
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
575

576
	resp := s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
577
578
	require.Equal(t, r2, resp)
	r2.refCount = 1
579
	resp = s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
580
581
582
583
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
584
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
585
586
	defer done()

Jesse Gross's avatar
Jesse Gross committed
587
	llm := &mockLlm{vramByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
588
	do := api.DefaultOptions()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
589
	runner := &runnerRef{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
590
591
592
593
594
595
596
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
597
598
599
600
601
602
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
603
		opts: api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
604
605
606
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
607
	req.model.AdapterPaths = runner.model.AdapterPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
608
609
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
610
	req.model.ProjectorPaths = runner.model.ProjectorPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
611
612
613
614
615
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
Michael Yang's avatar
lint  
Michael Yang committed
616
	llm.pingResp = errors.New("foo")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
617
618
619
620
621
622
623
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
624
625
626
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
627
628
629
630
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
631
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
632
633
	defer done()

Jesse Gross's avatar
Jesse Gross committed
634
635
	llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
	llm2 := &mockLlm{vramByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
636
637
638
	s := InitScheduler(ctx)
	s.unloadAllRunners()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
639
640
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
641

Daniel Hiltgen's avatar
Daniel Hiltgen committed
642
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
643
644
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
645
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
646
647
648
649
650
651
652
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
Jesse Gross's avatar
Jesse Gross committed
653
	llm1 := &mockLlm{vramByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
654
655
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
656
657
658
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
659
	require.Nil(t, r2.model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
660
661
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
662
func TestAlreadyCanceled(t *testing.T) {
663
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
664
665
666
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
667
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
Daniel Hiltgen's avatar
Daniel Hiltgen committed
668
669
670
671
672
673
674
675
676
677
678
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

Jesse Gross's avatar
Jesse Gross committed
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
// mockLlm is a canned-response test double; the tests in this file install it
// as runnerRef.llama. Its methods hand back the field values below unchanged,
// so a test configures behavior simply by setting fields.
type mockLlm struct {
	// modelPath is returned by ModelPath.
	modelPath string

	// Canned errors returned by Ping, WaitUntilRunning and Completion.
	pingResp       error
	waitResp       error
	completionResp error

	// Canned results for Embedding, Tokenize and Detokenize.
	// detonekizeRespErr (sic) is the error Detokenize returns; the spelling is
	// kept because that method refers to the field by this name.
	embeddingResp     []float32
	embeddingRespErr  error
	tokenizeResp      []int
	tokenizeRespErr   error
	detokenizeResp    string
	detonekizeRespErr error

	// closeResp is returned by Close, which also sets closeCalled.
	closeResp   error
	closeCalled bool

	// Sizes reported by VRAMSize, TotalSize and VRAMByGPU; vramSize is also
	// the amount Load compares against each GPU's FreeMemory.
	vramSize  uint64
	totalSize uint64
	vramByGPU map[string]uint64
}

Jesse Gross's avatar
Jesse Gross committed
697
698
func (s *mockLlm) ModelPath() string {
	return s.modelPath
Daniel Hiltgen's avatar
Daniel Hiltgen committed
699
700
}

Jesse Gross's avatar
Jesse Gross committed
701
702
703
704
705
706
707
708
709
710
711
712
// Load is the mock's stand-in for loading a model onto gpus.
//
// When requireFull is false it always succeeds. When requireFull is true it
// succeeds only if at least one entry in gpus reports FreeMemory of at least
// vramSize; otherwise it returns llm.ErrLoadRequiredFull. ctx is not used.
func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) error {
	if !requireFull {
		return nil
	}

	for _, gpu := range gpus {
		if gpu.FreeMemory < s.vramSize {
			continue
		}
		// A single GPU with enough free memory is sufficient.
		return nil
	}

	return llm.ErrLoadRequiredFull
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
713
714
715
716
717
// Ping returns the configured pingResp error; ctx is not used.
func (s *mockLlm) Ping(ctx context.Context) error {
	return s.pingResp
}

// WaitUntilRunning returns the configured waitResp error; ctx is not used.
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error {
	return s.waitResp
}
// Completion returns the configured completionResp error. It never invokes fn
// and ignores ctx and req.
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	err := s.completionResp
	return err
}
Michael Yang's avatar
lint  
Michael Yang committed
718

719
720
func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) {
	return s.embeddingResp, s.embeddingRespErr
Daniel Hiltgen's avatar
Daniel Hiltgen committed
721
}
Michael Yang's avatar
lint  
Michael Yang committed
722

Daniel Hiltgen's avatar
Daniel Hiltgen committed
723
724
725
// Tokenize returns the configured tokenizeResp and tokenizeRespErr; ctx and
// content are ignored.
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	tokens, err := s.tokenizeResp, s.tokenizeRespErr
	return tokens, err
}
Michael Yang's avatar
lint  
Michael Yang committed
726

Daniel Hiltgen's avatar
Daniel Hiltgen committed
727
728
729
// Detokenize returns the configured detokenizeResp together with the error
// stored in detonekizeRespErr; ctx and tokens are ignored.
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	text, err := s.detokenizeResp, s.detonekizeRespErr
	return text, err
}
Michael Yang's avatar
lint  
Michael Yang committed
730

Daniel Hiltgen's avatar
Daniel Hiltgen committed
731
732
733
734
// Close records that it was called by setting closeCalled, then returns the
// configured closeResp error.
func (s *mockLlm) Close() error {
	err := s.closeResp
	s.closeCalled = true
	return err
}
Jesse Gross's avatar
Jesse Gross committed
735
736
737
738
// VRAMSize returns the configured vramSize.
func (s *mockLlm) VRAMSize() uint64 {
	return s.vramSize
}

// TotalSize returns the configured totalSize.
func (s *mockLlm) TotalSize() uint64 {
	return s.totalSize
}

// VRAMByGPU returns the vramByGPU entry for gpuid, or 0 when there is none.
func (s *mockLlm) VRAMByGPU(gpuid string) uint64 {
	return s.vramByGPU[gpuid]
}

// Pid always returns -1.
func (s *mockLlm) Pid() int {
	return -1
}