sched_test.go 20.5 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
package server

import (
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"log/slog"
	"os"
	"testing"
	"time"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
Michael Yang's avatar
lint  
Michael Yang committed
15
	"github.com/ollama/ollama/envconfig"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
	"github.com/ollama/ollama/format"
	"github.com/ollama/ollama/gpu"
	"github.com/ollama/ollama/llm"
	"github.com/stretchr/testify/require"
)

// init sets OLLAMA_DEBUG=1 in the process environment and then initializes
// logging through lifecycle.InitLogging before any test in this package runs.
// NOTE(review): presumably InitLogging reads OLLAMA_DEBUG to choose the log
// level — confirm in the lifecycle package.
func init() {
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
31
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
32
	require.NotNil(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
35
36
}

func TestLoad(t *testing.T) {
37
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
38
39
	defer done()
	s := InitScheduler(ctx)
40
	var ggml *llm.GGML // value not used in tests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
41
42
43
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
44
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
45
46
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
47
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
48
49
	}
	// Fail to load model first
Daniel Hiltgen's avatar
Daniel Hiltgen committed
50
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
51
52
53
		return nil, fmt.Errorf("something failed to load model blah")
	}
	gpus := gpu.GpuInfoList{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
54
	s.load(req, ggml, gpus, 0)
Michael Yang's avatar
lint  
Michael Yang committed
55
	require.Empty(t, req.successCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
56
	require.Len(t, req.errCh, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
57
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
58
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
59
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
61
62
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

63
	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
64
	s.newServerFn = func(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
65
66
		return server, nil
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
67
	s.load(req, ggml, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
68
69
70
71
72
73
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
74
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
75
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
78
79
80
	}

	req.model.ModelPath = "dummy_model_path"
	server.waitResp = fmt.Errorf("wait failure")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
81
	s.load(req, ggml, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
82
83
84
85
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
86
		t.Fatalf("unexpected success %v", resp)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
87
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
88
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
	runner := s.loaded["dummy_model_path"]
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
92
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
	time.Sleep(1 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
94
95
96
	require.Len(t, s.expiredCh, 1)
}

97
type reqBundle struct {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
98
99
100
101
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
102
	ggml    *llm.GGML
Daniel Hiltgen's avatar
Daniel Hiltgen committed
103
104
}

105
func (scenario *reqBundle) newServer(gpus gpu.GpuInfoList, model string, ggml *llm.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
106
107
108
	return scenario.srv, nil
}

109
110
111
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
112
113
114
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), modelName)
Michael Yang's avatar
lint  
Michael Yang committed
115
	require.NoError(t, err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
	defer f.Close()

	gguf := llm.NewGGUFV3(binary.LittleEndian)
	err = gguf.Encode(f, llm.KV{
		"general.architecture":          "llama",
		"general.name":                  "name",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
	}, []llm.Tensor{
131
132
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
133
	})
Michael Yang's avatar
lint  
Michael Yang committed
134
	require.NoError(t, err)
135

Daniel Hiltgen's avatar
Daniel Hiltgen committed
136
137
	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
138
	b.ggml, err = llm.LoadModel(model.ModelPath, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
139
	require.NoError(t, err)
140

141
142
143
144
145
	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
146
		model:           model,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
147
		opts:            api.DefaultOptions(),
148
		sessionDuration: duration,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
149
150
151
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
152
153
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
Daniel Hiltgen's avatar
Daniel Hiltgen committed
154
155
}

156
157
158
159
160
161
// getGpuFn is the GPU discovery stub used by the scheduler tests. It reports
// a single "metal" device with 24 GiB of total memory, 12 GiB of it free.
func getGpuFn() gpu.GpuInfoList {
	info := gpu.GpuInfo{Library: "metal"}
	info.TotalMemory = 24 * format.GigaByte
	info.FreeMemory = 12 * format.GigaByte
	return gpu.GpuInfoList{info}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
162

163
164
165
166
167
168
// getCpuFn is the CPU discovery stub used by the scheduler tests. It reports
// a single "cpu" entry with 32 GiB of total memory, 26 GiB of it free.
func getCpuFn() gpu.GpuInfoList {
	info := gpu.GpuInfo{Library: "cpu"}
	info.TotalMemory = 32 * format.GigaByte
	info.FreeMemory = 26 * format.GigaByte
	return gpu.GpuInfoList{info}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
169

170
171
172
func TestRequestsSameModelSameRequest(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
173
	s := InitScheduler(ctx)
174
175
176
177
178
179
180
181
182
183
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
	b.req.model = a.req.model
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
184
185
186
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
187
188
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
189
		require.Empty(t, s.pendingReqCh)
190
191
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
192
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
193
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
194
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
195
196
197
	}

	// Same runner as first request due to not needing a reload
198
199
200
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
201
	select {
202
203
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
204
		require.Empty(t, s.pendingReqCh)
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

func TestRequestsSimpleReloadSameModel(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
	tmpModel := *a.req.model
	b.req.model = &tmpModel
	b.ggml = a.ggml

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
236
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
237
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
238
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
239
240
241
	}

	// Trigger a reload
242
243
244
245
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
246
247
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
248
	a.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
249
	select {
250
251
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
252
		require.Empty(t, s.pendingReqCh)
253
254
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
255
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
256
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
257
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
258
	}
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
}

func TestRequestsMultipleLoadedModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn

	// Multiple loaded models
	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
Daniel Hiltgen's avatar
Daniel Hiltgen committed
274

275
	envconfig.MaxRunners = 1
276
277
278
279
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
280
	select {
281
282
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
283
		require.Empty(t, s.pendingReqCh)
284
285
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
286
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
287
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
288
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
289
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
290
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
291
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
292
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
293

294
	envconfig.MaxRunners = 0
295
296
297
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
298
	select {
299
300
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
301
		require.Empty(t, s.pendingReqCh)
302
303
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
304
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
305
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
306
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
307
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
308
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
309
	require.Len(t, s.loaded, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
310
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
311

Daniel Hiltgen's avatar
Daniel Hiltgen committed
312
	// This is a CPU load with NumGPU = 0 so it should load
313
314
315
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
316
	select {
317
318
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
Michael Yang's avatar
lint  
Michael Yang committed
319
		require.Empty(t, s.pendingReqCh)
320
321
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
322
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
323
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
324
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
325
326
327
328
329
330
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

	// Try to load a model that wont fit
331
332
	s.newServerFn = d.newServer
	slog.Info("d")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
333
334
335
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
336
	a.ctxDone() // Won't help since this one isn't big enough to make room
Daniel Hiltgen's avatar
Daniel Hiltgen committed
337
	time.Sleep(2 * time.Millisecond)
338
	s.pendingReqCh <- d.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
339
340
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
341
342
343
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
344
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
345
	select {
346
347
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
Michael Yang's avatar
lint  
Michael Yang committed
348
		require.Empty(t, s.pendingReqCh)
349
		require.Empty(t, d.req.errCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
350
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
351
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
352
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
353
354
355
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
356
357
}

358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
func TestGetRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()

	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
	envconfig.MaxQueuedRequests = 1
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
375
	require.Len(t, s.pendingReqCh, 1)
Michael Yang's avatar
lint  
Michael Yang committed
376
	require.Empty(t, successCh1b)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
377
378
379
380
381
382
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
383
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
384
385
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
386
387
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
388
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
389
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
390
	}
391
	a.ctxDone() // Set "a" model to idle so it can unload
Daniel Hiltgen's avatar
Daniel Hiltgen committed
392
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
393
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
394
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
395

396
397
398
	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
399
	// Starts in pending channel, then should be quickly processsed to return an error
400
	time.Sleep(20 * time.Millisecond) // Long enough for the "a" model to expire and unload
Michael Yang's avatar
lint  
Michael Yang committed
401
	require.Empty(t, successCh1c)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
402
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
403
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
404
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
405
406
407
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
408
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
409
410
411
412
}

// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
413
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
414
415
416
	defer done()

	// Same model, same request
417
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
418
419
420
421
422
423
424
425
426
427
428
429
430
431
	s := InitScheduler(ctx)
	s.getGpuFn = func() gpu.GpuInfoList {
		g := gpu.GpuInfo{Library: "metal"}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
		return []gpu.GpuInfo{g}
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
432
433
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
434
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
435
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
436
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
437
438
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
439
440
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
441
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
442
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
443
	}
444
	time.Sleep(scenario1a.req.sessionDuration.Duration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
445
446
447
448
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
Michael Yang's avatar
lint  
Michael Yang committed
449
	require.Empty(t, s.finishedReqCh)
450
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
451
	require.Empty(t, s.loaded)
452
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
453
454
455
456
457
458
459

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
460
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
461
462
	req := &LlmRequest{
		ctx:             ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
463
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
464
		successCh:       make(chan *runnerRef, 1),
465
		sessionDuration: &api.Duration{Duration: 2},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
466
467
	}
	finished := make(chan *LlmRequest)
468
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
469
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
470
471
472
473
474
475
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
476
477
	case err := <-req.errCh:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
478
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
479
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
480
481
482
483
484
485
486
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
487
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
503
504
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
505
506
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
507
508

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
509
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
510
511
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
512
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
513
514

	s.updateFreeSpace(gpus)
515
516
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
517
518
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	gpus := gpu.GpuInfoList{
		{
			Library: "cuda",
			ID:      "0",
		},
		{
			Library: "cuda",
			ID:      "1",
		},
	}
	r1 := &runnerRef{gpus: gpu.GpuInfoList{gpus[0]}, loading: true}

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loadedMu.Unlock()

	tmp := s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "1", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{gpus[1]}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "0", tmp[0].ID)

	r1.gpus = gpu.GpuInfoList{}
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
551
552
553
}

func TestFindRunnerToUnload(t *testing.T) {
554
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
555
	defer done()
556

Daniel Hiltgen's avatar
Daniel Hiltgen committed
557
558
	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
559
560

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
561
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
562
563
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
564
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
565

566
	resp := s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
567
568
	require.Equal(t, r2, resp)
	r2.refCount = 1
569
	resp = s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
570
571
572
573
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
574
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
575
576
	defer done()

577
	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
578
	do := api.DefaultOptions()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
579
	runner := &runnerRef{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
580
581
582
583
584
585
586
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
587
588
589
590
591
592
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
593
		opts: api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
594
595
596
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
597
	req.model.AdapterPaths = runner.model.AdapterPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
598
599
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
600
	req.model.ProjectorPaths = runner.model.ProjectorPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
601
602
603
604
605
606
607
608
609
610
611
612
613
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
	llm.pingResp = fmt.Errorf("foo")
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
614
615
616
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
617
618
619
620
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
621
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
622
623
	defer done()

624
625
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
626
627
628
	s := InitScheduler(ctx)
	s.unloadAllRunners()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
629
630
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
631

Daniel Hiltgen's avatar
Daniel Hiltgen committed
632
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
633
634
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
635
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
636
637
638
639
640
641
642
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
643
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
644
645
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
646
647
648
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
649
	require.Nil(t, r2.model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
650
651
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
652
653
654
655
656
func TestAlreadyCanceled(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
657
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
Daniel Hiltgen's avatar
Daniel Hiltgen committed
658
659
660
661
662
663
664
665
666
667
668
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
669
type mockLlm struct {
670
671
672
	pingResp           error
	waitResp           error
	completionResp     error
673
674
	embedResp          [][]float32
	embedRespErr       error
675
676
677
678
679
680
681
682
683
	tokenizeResp       []int
	tokenizeRespErr    error
	detokenizeResp     string
	detonekizeRespErr  error
	closeResp          error
	closeCalled        bool
	estimatedVRAM      uint64
	estimatedTotal     uint64
	estimatedVRAMByGPU map[string]uint64
Daniel Hiltgen's avatar
Daniel Hiltgen committed
684
685
686
687
688
689
690
}

func (s *mockLlm) Ping(ctx context.Context) error             { return s.pingResp }
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error { return s.waitResp }
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}
691
692
func (s *mockLlm) Embed(ctx context.Context, input []string) ([][]float32, error) {
	return s.embedResp, s.embedRespErr
Daniel Hiltgen's avatar
Daniel Hiltgen committed
693
694
695
696
697
698
699
700
701
702
703
}
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detonekizeRespErr
}
func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}
704
705
func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
Daniel Hiltgen's avatar
Daniel Hiltgen committed
706
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }