sched_test.go 21.7 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
package server

import (
	"bytes"
	"context"
	"fmt"
	"log/slog"
	"os"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
	"github.com/stretchr/testify/require"
)

// init runs once when the test binary starts. It sets OLLAMA_DEBUG=1 in the
// process environment and then calls lifecycle.InitLogging, so the logging
// setup sees the variable (presumably selecting debug-level output — confirm
// against the lifecycle package).
func init() {
	const debugEnv, enabled = "OLLAMA_DEBUG", "1"
	os.Setenv(debugEnv, enabled)
	lifecycle.InitLogging()
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
29
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
30
	require.NotNil(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
31
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
32
33
34
}

func TestLoad(t *testing.T) {
35
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
36
37
	defer done()
	s := InitScheduler(ctx)
38
	var ggml *llm.GGML // value not used in tests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
39
40
41
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
42
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
43
44
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
45
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
47
	}
	// Fail to load model first
Daniel Hiltgen's avatar
Daniel Hiltgen committed
48
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
49
50
51
		return nil, fmt.Errorf("something failed to load model blah")
	}
	gpus := gpu.GpuInfoList{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
52
	s.load(req, ggml, gpus, 0)
Michael Yang's avatar
lint  
Michael Yang committed
53
	require.Empty(t, req.successCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
54
	require.Len(t, req.errCh, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
55
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
56
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
57
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
58
59
60
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

61
	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
62
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
63
64
		return server, nil
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
65
	s.load(req, ggml, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
66
67
68
69
70
71
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
72
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
73
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
74
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
75
76
77
78
	}

	req.model.ModelPath = "dummy_model_path"
	server.waitResp = fmt.Errorf("wait failure")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
79
	s.load(req, ggml, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
80
81
82
83
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
84
		t.Fatalf("unexpected success %v", resp)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
85
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
86
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
87
	runner := s.loaded["dummy_model_path"]
Daniel Hiltgen's avatar
Daniel Hiltgen committed
88
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
90
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
	time.Sleep(1 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
93
94
	require.Len(t, s.expiredCh, 1)
}

95
type reqBundle struct {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
96
97
98
99
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
100
	ggml    *llm.GGML
Daniel Hiltgen's avatar
Daniel Hiltgen committed
101
102
}

103
func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
104
105
106
	return scenario.srv, nil
}

107
108
109
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
110
111
112
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), modelName)
Michael Yang's avatar
lint  
Michael Yang committed
113
	require.NoError(t, err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
114
115
	defer f.Close()

Michael Yang's avatar
Michael Yang committed
116
	require.NoError(t, llm.WriteGGUF(f, llm.KV{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
117
118
119
120
121
122
123
124
125
126
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
Michael Yang's avatar
Michael Yang committed
127
	}, []*llm.Tensor{
128
129
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
Michael Yang's avatar
Michael Yang committed
130
	}))
Michael Yang's avatar
lint  
Michael Yang committed
131
	require.NoError(t, err)
132

Daniel Hiltgen's avatar
Daniel Hiltgen committed
133
134
	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
135
	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
136
	require.NoError(t, err)
137

138
139
140
141
142
	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
143
		model:           model,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
144
		opts:            api.DefaultOptions(),
145
		sessionDuration: duration,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
146
147
148
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
149
150
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
Daniel Hiltgen's avatar
Daniel Hiltgen committed
151
152
}

153
154
155
156
157
158
// getGpuFn is the fake GPU discovery hook used by the scheduler tests. It
// reports exactly one "metal" device with 24*format.GigaByte of total memory,
// 12*format.GigaByte of which is free.
func getGpuFn() gpu.GpuInfoList {
	var dev gpu.GpuInfo
	dev.Library = "metal"
	dev.TotalMemory = 24 * format.GigaByte
	dev.FreeMemory = 12 * format.GigaByte
	return gpu.GpuInfoList{dev}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
159

160
161
162
163
164
165
// getCpuFn is the fake CPU discovery hook used by the scheduler tests. It
// reports exactly one "cpu" entry with 32*format.GigaByte of total memory,
// 26*format.GigaByte of which is free.
func getCpuFn() gpu.GpuInfoList {
	var dev gpu.GpuInfo
	dev.Library = "cpu"
	dev.TotalMemory = 32 * format.GigaByte
	dev.FreeMemory = 26 * format.GigaByte
	return gpu.GpuInfoList{dev}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
166

167
168
169
func TestRequestsSameModelSameRequest(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
170
	s := InitScheduler(ctx)
171
172
173
174
175
176
177
178
179
180
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
	b.req.model = a.req.model
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
181
182
183
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
184
185
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
186
		require.Empty(t, s.pendingReqCh)
187
188
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
189
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
190
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
191
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
192
193
194
	}

	// Same runner as first request due to not needing a reload
195
196
197
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
198
	select {
199
200
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
201
		require.Empty(t, s.pendingReqCh)
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

func TestRequestsSimpleReloadSameModel(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
	tmpModel := *a.req.model
	b.req.model = &tmpModel
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
233
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
234
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
235
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
236
237
238
	}

	// Trigger a reload
239
240
241
242
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
243
244
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
245
	a.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
246
	select {
247
248
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
249
		require.Empty(t, s.pendingReqCh)
250
251
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
252
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
253
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
254
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
255
	}
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
}

func TestRequestsMultipleLoadedModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn

	// Multiple loaded models
	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
Daniel Hiltgen's avatar
Daniel Hiltgen committed
271

Michael Yang's avatar
int  
Michael Yang committed
272
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
273
274
275
276
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
277
	select {
278
279
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
280
		require.Empty(t, s.pendingReqCh)
281
282
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
283
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
284
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
285
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
286
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
287
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
288
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
289
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
290

Michael Yang's avatar
int  
Michael Yang committed
291
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
292
293
294
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
295
	select {
296
297
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
298
		require.Empty(t, s.pendingReqCh)
299
300
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
301
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
302
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
303
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
304
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
305
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
306
	require.Len(t, s.loaded, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
307
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
308

Daniel Hiltgen's avatar
Daniel Hiltgen committed
309
	// This is a CPU load with NumGPU = 0 so it should load
310
311
312
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
313
	select {
314
315
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
Michael Yang's avatar
lint  
Michael Yang committed
316
		require.Empty(t, s.pendingReqCh)
317
318
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
319
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
320
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
321
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
322
323
324
325
326
327
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

	// Try to load a model that wont fit
328
329
	s.newServerFn = d.newServer
	slog.Info("d")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
330
331
332
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
333
	a.ctxDone() // Won't help since this one isn't big enough to make room
Daniel Hiltgen's avatar
Daniel Hiltgen committed
334
	time.Sleep(2 * time.Millisecond)
335
	s.pendingReqCh <- d.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
336
337
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
338
339
340
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
341
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
342
	select {
343
344
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
Michael Yang's avatar
lint  
Michael Yang committed
345
		require.Empty(t, s.pendingReqCh)
346
		require.Empty(t, d.req.errCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
347
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
348
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
349
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
350
351
352
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
353
354
}

355
356
357
358
359
360
361
func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
Michael Yang's avatar
int  
Michael Yang committed
362
	t.Setenv("OLLAMA_MAX_QUEUE", "1")
363
364
365
366
367
368
369
370
371
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
372
	require.Len(t, s.pendingReqCh, 1)
Michael Yang's avatar
lint  
Michael Yang committed
373
	require.Empty(t, successCh1b)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
374
375
376
377
378
379
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
380
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
381
382
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
383
384
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
385
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
386
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
387
	}
388
	a.ctxDone() // Set "a" model to idle so it can unload
Daniel Hiltgen's avatar
Daniel Hiltgen committed
389
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
390
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
391
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
392

393
394
395
	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
396
	// Starts in pending channel, then should be quickly processsed to return an error
397
	time.Sleep(20 * time.Millisecond) // Long enough for the "a" model to expire and unload
Michael Yang's avatar
lint  
Michael Yang committed
398
	require.Empty(t, successCh1c)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
399
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
400
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
401
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
402
403
404
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
405
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
406
407
408
409
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
410
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
411
412
413
	defer done()

	// Same model, same request
414
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
415
416
417
418
419
420
421
422
423
424
425
426
427
428
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
429
430
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
431
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
432
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
433
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
434
435
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
436
437
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
438
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
439
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
440
	}
441
	time.Sleep(scenario1a.req.sessionDuration.Duration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
442
443
444
445
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
Michael Yang's avatar
lint  
Michael Yang committed
446
	require.Empty(t, s.finishedReqCh)
447
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
448
	require.Empty(t, s.loaded)
449
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
450
451
452
453
454
455
456

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
457
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
458
459
	req := &LlmRequest{
		ctx:             ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
460
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
461
		successCh:       make(chan *runnerRef, 1),
462
		sessionDuration: &api.Duration{Duration: 2},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
463
464
	}
	finished := make(chan *LlmRequest)
465
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
466
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
467
468
469
470
471
472
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
473
474
	case err := <-req.errCh:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
475
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
476
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
477
478
479
480
481
482
483
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
484
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
500
501
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
502
503
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
504
505

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
506
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
507
508
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
509
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
510
511

	s.updateFreeSpace(gpus)
512
513
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
514
515
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "cuda",
			ID:      "0",
		},
		{
			Library: "cuda",
			ID:      "1",
		},
	}
	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loadedMu.Unlock()

	tmp := s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "1", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{gpus[1]}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "0", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
548
549
550
}

func TestFindRunnerToUnload(t *testing.T) {
551
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
552
	defer done()
553

Daniel Hiltgen's avatar
Daniel Hiltgen committed
554
555
	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
556
557

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
558
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
559
560
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
561
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
562

563
	resp := s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
564
565
	require.Equal(t, r2, resp)
	r2.refCount = 1
566
	resp = s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
567
568
569
570
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
571
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
572
573
	defer done()

574
	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
575
	do := api.DefaultOptions()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
576
	runner := &runnerRef{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
577
578
579
580
581
582
583
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
584
585
586
587
588
589
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
590
		opts: api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
591
592
593
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
594
	req.model.AdapterPaths = runner.model.AdapterPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
595
596
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
597
	req.model.ProjectorPaths = runner.model.ProjectorPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
598
599
600
601
602
603
604
605
606
607
608
609
610
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
	llm.pingResp = fmt.Errorf("foo")
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
611
612
613
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
614
615
616
617
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
618
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
619
620
	defer done()

621
622
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
623
624
625
	s := InitScheduler(ctx)
	s.unloadAllRunners()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
626
627
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
628

Daniel Hiltgen's avatar
Daniel Hiltgen committed
629
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
630
631
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
632
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
633
634
635
636
637
638
639
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
640
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
641
642
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
643
644
645
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
646
	require.Nil(t, r2.model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
647
648
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
649
650
651
652
653
func TestAlreadyCanceled(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
654
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
Daniel Hiltgen's avatar
Daniel Hiltgen committed
655
656
657
658
659
660
661
662
663
664
665
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
// TestHomogeneousGPUs reports two GPUs from different libraries ("cuda" and
// "rocm"), each with 256 MiB free, and checks that the scheduler hands exactly
// one GPU to newServerFn rather than spreading the model across both.
func TestHomogeneousGPUs(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)

	s.getGpuFn = func() gpu.GpuInfoList {
		// Set memory values to require the model to be spread
		gpus := []gpu.GpuInfo{
			{Library: "cuda"},
			{Library: "rocm"},
		}
		gpus[0].TotalMemory = 1 * format.GibiByte
		gpus[0].FreeMemory = 256 * format.MebiByte
		gpus[1].TotalMemory = 1 * format.GibiByte
		gpus[1].FreeMemory = 256 * format.MebiByte
		return gpus
	}
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
		// This hook is invoked by the scheduler started via s.Run, not by the
		// test goroutine, so it must not call require/FailNow. Errorf is safe
		// from any goroutine and still marks the test as failed.
		if len(gpus) != 1 {
			t.Errorf("expected the model to be placed on exactly 1 GPU, got %d", len(gpus))
		}
		return a.newServer(gpus, model, ggml, adapters, projectors, opts, numParallel)
	}
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, a.srv, resp.llama)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
705
type mockLlm struct {
706
707
708
	pingResp           error
	waitResp           error
	completionResp     error
709
	embedResp          *llm.EmbedResponse
710
	embedRespErr       error
711
712
713
714
715
716
717
718
719
	tokenizeResp       []int
	tokenizeRespErr    error
	detokenizeResp     string
	detonekizeRespErr  error
	closeResp          error
	closeCalled        bool
	estimatedVRAM      uint64
	estimatedTotal     uint64
	estimatedVRAMByGPU map[string]uint64
Daniel Hiltgen's avatar
Daniel Hiltgen committed
720
721
722
723
724
725
726
}

func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}
727
func (s *mockLlm) Embed(ctx context.Context, input []string) (*llm.EmbedResponse, error) {
728
	return s.embedResp, s.embedRespErr
Daniel Hiltgen's avatar
Daniel Hiltgen committed
729
730
731
732
733
734
735
736
737
738
739
}
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detonekizeRespErr
}
func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}
740
741
func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
Daniel Hiltgen's avatar
Daniel Hiltgen committed
742
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }