sched_test.go 24.5 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
package server

import (
	"bytes"
	"context"
Michael Yang's avatar
lint  
Michael Yang committed
6
	"errors"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
7
8
9
10
11
	"log/slog"
	"os"
	"testing"
	"time"

Michael Yang's avatar
lint  
Michael Yang committed
12
13
	"github.com/stretchr/testify/require"

Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
15
16
	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
	"github.com/ollama/ollama/format"
Michael Yang's avatar
Michael Yang committed
17
	"github.com/ollama/ollama/fs/ggml"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
18
	"github.com/ollama/ollama/llm"
19
	"github.com/ollama/ollama/ml"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
20
21
)

Michael Yang's avatar
lint  
Michael Yang committed
22
func TestMain(m *testing.M) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
23
24
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
Michael Yang's avatar
lint  
Michael Yang committed
25
	os.Exit(m.Run())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
26
27
}

28
func TestSchedInit(t *testing.T) {
29
	ctx, done := context.WithCancel(t.Context())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
30
31
	defer done()
	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
32
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
	require.NotNil(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
35
36
}

37
func TestSchedLoad(t *testing.T) {
38
	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
39
40
	defer done()
	s := InitScheduler(ctx)
41
	s.waitForRecovery = 10 * time.Millisecond
Michael Yang's avatar
Michael Yang committed
42
	var f *ggml.GGML // value not used in tests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
43
44
45
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
47
48
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
49
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
50
51
	}
	// Fail to load model first
52
	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Michael Yang's avatar
lint  
Michael Yang committed
53
		return nil, errors.New("something failed to load model blah")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
54
	}
55
56
57
	gpus := []ml.DeviceInfo{}
	systemInfo := ml.SystemInfo{}
	s.load(req, f, systemInfo, gpus, false)
Michael Yang's avatar
lint  
Michael Yang committed
58
	require.Empty(t, req.successCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
59
	require.Len(t, req.errCh, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
61
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
62
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
63
64
65
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

66
	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
67
	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Jesse Gross's avatar
Jesse Gross committed
68
		server.modelPath = model
Daniel Hiltgen's avatar
Daniel Hiltgen committed
69
70
		return server, nil
	}
71
	s.load(req, f, systemInfo, gpus, false)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
72
73
74
75
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
Jesse Gross's avatar
Jesse Gross committed
76
		require.Equal(t, uint64(10), resp.vramSize)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
		require.Equal(t, uint(1), resp.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
79
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
80
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
81
82
83
	}

	req.model.ModelPath = "dummy_model_path"
Michael Yang's avatar
lint  
Michael Yang committed
84
	server.waitResp = errors.New("wait failure")
85
	s.load(req, f, systemInfo, gpus, false)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
86
87
88
89
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
		t.Fatalf("unexpected success %v", resp)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
	runner := s.loaded["dummy_model_path"]
Daniel Hiltgen's avatar
Daniel Hiltgen committed
94
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
95
96
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
97
	time.Sleep(1 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
98
99
100
	require.Len(t, s.expiredCh, 1)
}

101
type reqBundle struct {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
102
103
104
105
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
Michael Yang's avatar
Michael Yang committed
106
	f       *ggml.GGML
Daniel Hiltgen's avatar
Daniel Hiltgen committed
107
108
}

109
func (scenario *reqBundle) newServer(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Jesse Gross's avatar
Jesse Gross committed
110
	scenario.srv.modelPath = model
Daniel Hiltgen's avatar
Daniel Hiltgen committed
111
112
113
	return scenario.srv, nil
}

114
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vramSize uint64, duration *api.Duration, vramByGPU map[ml.DeviceID]uint64) *reqBundle {
115
116
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
117
118
	t.Helper()

119
	p, _ := createBinFile(t, ggml.KV{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
120
121
122
123
124
125
126
127
128
		"general.architecture":          "llama",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
129
	}, []*ggml.Tensor{
130
131
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
132
	})
133

134
135
136
137
138
139
	model := &Model{Name: modelName, ModelPath: p}
	f, err := llm.LoadModel(model.ModelPath, 0)
	if err != nil {
		t.Fatal(err)
	}
	b.f = f
140
141
142
143
144
	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
145
		model:           model,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
146
		opts:            api.DefaultOptions(),
147
		sessionDuration: duration,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
148
149
150
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
151
	b.srv = &mockLlm{vramSize: vramSize, vramByGPU: vramByGPU}
152
	return b
Daniel Hiltgen's avatar
Daniel Hiltgen committed
153
154
}

155
// getGpuFn is the default GPU discovery stub for these tests. It logs the call
// and reports a single "Metal" device with 24GB total and 12GB free memory.
func getGpuFn(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
	slog.Info("test getGpuFn called", "runners", runners)
	g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
	g.TotalMemory = 24 * format.GigaByte
	g.FreeMemory = 12 * format.GigaByte
	return []ml.DeviceInfo{g}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
162

163
164
165
166
167
168
// getSystemInfoFn is the system-info stub for these tests. It logs the call and
// reports 32GB total and 26GB free system memory.
func getSystemInfoFn() ml.SystemInfo {
	slog.Info("test getSystemInfoFn called")
	return ml.SystemInfo{
		TotalMemory: 32 * format.GigaByte,
		FreeMemory:  26 * format.GigaByte,
	}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
170

171
func TestSchedRequestsSameModelSameRequest(t *testing.T) {
172
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
173
	defer done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
174
	s := InitScheduler(ctx)
175
	s.waitForRecovery = 10 * time.Millisecond
176
	s.getGpuFn = getGpuFn
177
	s.getSystemInfoFn = getSystemInfoFn
178
179
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0}, nil)
180
	b.req.model = a.req.model
Michael Yang's avatar
Michael Yang committed
181
	b.f = a.f
182
183
184
185

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
186
187
188
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
189
190
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
191
		require.Empty(t, s.pendingReqCh)
192
193
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
194
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
195
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
196
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
197
198
199
	}

	// Same runner as first request due to not needing a reload
200
201
202
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
203
	select {
204
205
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
206
		require.Empty(t, s.pendingReqCh)
207
208
209
210
211
212
213
214
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

215
func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
216
	ctx, done := context.WithTimeout(t.Context(), 5000*time.Millisecond)
217
218
	defer done()
	s := InitScheduler(ctx)
219
	s.waitForRecovery = 10 * time.Millisecond
220
	s.getGpuFn = getGpuFn
221
	s.getSystemInfoFn = getSystemInfoFn
222
223
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond}, nil)
224
225
	tmpModel := *a.req.model
	b.req.model = &tmpModel
Michael Yang's avatar
Michael Yang committed
226
	b.f = a.f
227
228
229
230
231
232
233
234
235
236
237
238

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
239
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
240
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
241
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
242
243
244
	}

	// Trigger a reload
245
246
247
248
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
249
250
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
251
	a.ctxDone()
252
253
	// Report recovered VRAM usage
	time.Sleep(1 * time.Millisecond)
254
255
256
	s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
		slog.Info("altered getGpuFn called")
		g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
257
258
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 24 * format.GigaByte
259
		return []ml.DeviceInfo{g}
260
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
261
	select {
262
263
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
264
		require.Empty(t, s.pendingReqCh)
265
266
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
267
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
268
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
269
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
270
	}
271
272
}

273
func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
274
275
	slog.Info("TestRequestsMultipleLoadedModels")
	ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
276
277
	defer done()
	s := InitScheduler(ctx)
278
	s.waitForRecovery = 10 * time.Millisecond
279
280
	s.getGpuFn = getGpuFn // 1 Metal GPU
	s.getSystemInfoFn = getSystemInfoFn
281
282

	// Multiple loaded models
283
	a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 1 * format.GigaByte})
284
	a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
285
	b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 10 * format.GigaByte})
286
287
288
289
	b.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
	c := newScenarioRequest(t, ctx, "model-c-10g-cpu", 10*format.GigaByte, nil, nil /* No GPU load */)
	c.req.opts.NumGPU = 0                                                                                                                         // CPU load, will be allowed
	b.req.sessionDuration = &api.Duration{Duration: 10 * time.Millisecond}                                                                        // longer than b to cause the scheduler to favor unloading b over c
290
	d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "Metal"}: 13 * format.GigaByte}) // Needs prior unloaded
Daniel Hiltgen's avatar
Daniel Hiltgen committed
291

292
	s.newServerFn = a.newServer
293
	slog.Info("Loading A")
294
295
	s.pendingReqCh <- a.req
	s.Run(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
296
	select {
297
298
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
299
		require.Empty(t, s.pendingReqCh)
300
301
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
302
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
303
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
304
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
305
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
306
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
307
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
308
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
309

Michael Yang's avatar
int  
Michael Yang committed
310
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
311
	s.newServerFn = b.newServer
312
	slog.Info("Loading B")
313
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
314
	select {
315
316
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
317
		require.Empty(t, s.pendingReqCh)
318
319
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
320
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
321
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
322
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
323
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
324
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
325
	require.Len(t, s.loaded, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
326
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
327

Daniel Hiltgen's avatar
Daniel Hiltgen committed
328
	// This is a CPU load with NumGPU = 0 so it should load
329
	s.newServerFn = c.newServer
330
	slog.Info("Loading C")
331
	s.pendingReqCh <- c.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
332
	select {
333
334
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
Michael Yang's avatar
lint  
Michael Yang committed
335
		require.Empty(t, s.pendingReqCh)
336
337
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
338
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
339
	case <-ctx.Done():
340
		slog.Info("FAIL: scheduler state", "s.loaded", s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
341
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
342
343
344
345
346
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

347
	// Try to load a model that won't fit
348
349
	s.newServerFn = d.newServer
	slog.Info("d")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
350
351
352
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
353
	a.ctxDone() // Won't help since this one isn't big enough to make room
Daniel Hiltgen's avatar
Daniel Hiltgen committed
354
	time.Sleep(2 * time.Millisecond)
355
	s.pendingReqCh <- d.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
356
357
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
358
359
360
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
361
	// Mark b done so it can unload
362
	b.ctxDone()
363
364
	// Report recovered VRAM usage so scheduler will finish waiting and unload
	time.Sleep(1 * time.Millisecond)
365
366
	s.getGpuFn = func(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.DeviceInfo {
		g := ml.DeviceInfo{DeviceID: ml.DeviceID{Library: "Metal"}}
367
368
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 24 * format.GigaByte
369
		return []ml.DeviceInfo{g}
370
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
371
	select {
372
373
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
Michael Yang's avatar
lint  
Michael Yang committed
374
		require.Empty(t, s.pendingReqCh)
375
		require.Empty(t, d.req.errCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
376
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
377
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
378
	}
379
380
381
382
383
384
385
386
387
388
389
390
391
	// Wait for b to close
closeWait:
	for {
		select {
		case <-ctx.Done():
			t.Fatal("timeout")
		default:
			if b.srv.closeCalled {
				break closeWait
			}
			time.Sleep(1 * time.Millisecond)
		}
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
392
393
394
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
395
396
}

397
func TestSchedGetRunner(t *testing.T) {
398
	ctx, done := context.WithTimeout(t.Context(), 3*time.Second)
399
400
	defer done()

401
402
403
	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond}, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond}, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond}, nil)
Michael Yang's avatar
int  
Michael Yang committed
404
	t.Setenv("OLLAMA_MAX_QUEUE", "1")
405
	s := InitScheduler(ctx)
406
	s.waitForRecovery = 10 * time.Millisecond
407
	s.getGpuFn = getGpuFn
408
	s.getSystemInfoFn = getSystemInfoFn
409
410
411
412
413
414
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
415
	require.Len(t, s.pendingReqCh, 1)
Michael Yang's avatar
lint  
Michael Yang committed
416
	require.Empty(t, successCh1b)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
417
418
419
420
421
422
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
423
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
424
425
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
426
427
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
428
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
429
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
430
	}
431
	a.ctxDone() // Set "a" model to idle so it can unload
Daniel Hiltgen's avatar
Daniel Hiltgen committed
432
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
433
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
434
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
435

436
437
438
	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
439
	// Starts in pending channel, then should be quickly processed to return an error
440
	time.Sleep(50 * time.Millisecond) // Long enough for the "a" model to expire and unload
Michael Yang's avatar
lint  
Michael Yang committed
441
	require.Empty(t, successCh1c)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
442
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
443
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
444
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
445
446
447
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
448
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
449
450
}

451
func TestSchedExpireRunner(t *testing.T) {
452
	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
Patrick Devine's avatar
Patrick Devine committed
453
454
	defer done()
	s := InitScheduler(ctx)
455
	s.waitForRecovery = 10 * time.Millisecond
Patrick Devine's avatar
Patrick Devine committed
456
457
458
459
460
461
462
463
464
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
	}

Michael Yang's avatar
Michael Yang committed
465
	var f *ggml.GGML
466
467
	gpus := []ml.DeviceInfo{}
	systemInfo := ml.SystemInfo{}
468
	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
469
	s.newServerFn = func(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Jesse Gross's avatar
Jesse Gross committed
470
		server.modelPath = model
Patrick Devine's avatar
Patrick Devine committed
471
472
		return server, nil
	}
473
	s.load(req, f, systemInfo, gpus, false)
Patrick Devine's avatar
Patrick Devine committed
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499

	select {
	case err := <-req.errCh:
		if err != nil {
			t.Fatalf("expected no errors when loading, got '%s'", err.Error())
		}
	case resp := <-req.successCh:
		s.loadedMu.Lock()
		if resp.refCount != uint(1) || len(s.loaded) != 1 {
			t.Fatalf("expected a model to be loaded")
		}
		s.loadedMu.Unlock()
	}

	s.expireRunner(&Model{ModelPath: "foo"})

	s.finishedReqCh <- req
	s.processCompleted(ctx)

	s.loadedMu.Lock()
	if len(s.loaded) != 0 {
		t.Fatalf("expected model to be unloaded")
	}
	s.loadedMu.Unlock()
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
500
// TODO - add one scenario that triggers the bogus finished event with positive ref count
501
func TestSchedPrematureExpired(t *testing.T) {
502
	ctx, done := context.WithTimeout(t.Context(), 1000*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
503
504
505
	defer done()

	// Same model, same request
506
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 100 * time.Millisecond}, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
507
	s := InitScheduler(ctx)
508
	s.waitForRecovery = 10 * time.Millisecond
509
510
	s.getGpuFn = getGpuFn
	s.getSystemInfoFn = getSystemInfoFn
Daniel Hiltgen's avatar
Daniel Hiltgen committed
511
512
513
514
515
516
517
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
518
519
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
520
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
521
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
522
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
523
524
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
525
526
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
527
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
528
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
529
	}
530
	time.Sleep(scenario1a.req.sessionDuration.Duration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
531
532
533
534
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
Michael Yang's avatar
lint  
Michael Yang committed
535
	require.Empty(t, s.finishedReqCh)
536
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
537
	require.Empty(t, s.loaded)
538
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
539
540
541
542
543
544

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

545
func TestSchedUseLoadedRunner(t *testing.T) {
546
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
547
548
	req := &LlmRequest{
		ctx:             ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
549
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
550
		successCh:       make(chan *runnerRef, 1),
551
		sessionDuration: &api.Duration{Duration: 2},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
552
553
	}
	finished := make(chan *LlmRequest)
554
	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
555
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
556
557
558
559
560
561
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
562
563
	case err := <-req.errCh:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
564
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
565
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
566
567
568
569
570
571
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

572
func TestSchedUpdateFreeSpace(t *testing.T) {
573
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
574
	defer done()
575
	gpus := []ml.DeviceInfo{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
576
		{
577
578
579
			DeviceID: ml.DeviceID{
				ID: "1",
			},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
580
581
		},
		{
582
583
584
			DeviceID: ml.DeviceID{
				ID: "2",
			},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
585
586
587
588
589
590
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
591
592
593
594
595
596
597
598
599
600
601
602
	gpuIDs := []ml.DeviceID{
		{
			ID: "1",
		},
		{
			ID: "2",
		},
	}
	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{{ID: "1"}: 50, {ID: "2"}: 50}}
	llm2 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{{ID: "1"}: 125, {ID: "2"}: 75}}
	r1 := &runnerRef{llama: llm1, gpus: gpuIDs, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpuIDs, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
603
604

	s := InitScheduler(ctx)
605
	s.waitForRecovery = 10 * time.Millisecond
Daniel Hiltgen's avatar
Daniel Hiltgen committed
606
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
607
608
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
609
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
610
611

	s.updateFreeSpace(gpus)
612
613
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
614
615
}

616
func TestSchedFindRunnerToUnload(t *testing.T) {
617
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
618
	defer done()
619

Daniel Hiltgen's avatar
Daniel Hiltgen committed
620
621
	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
622
623

	s := InitScheduler(ctx)
624
	s.waitForRecovery = 10 * time.Millisecond
Daniel Hiltgen's avatar
Daniel Hiltgen committed
625
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
626
627
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
628
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
629

630
	resp := s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
631
632
	require.Equal(t, r2, resp)
	r2.refCount = 1
633
	resp = s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
634
635
636
	require.Equal(t, r1, resp)
}

637
func TestSchedNeedsReload(t *testing.T) {
638
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
639
640
	defer done()

641
	llm := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
642
	do := api.DefaultOptions()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
643
	runner := &runnerRef{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
644
645
646
647
648
649
650
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
651
652
653
654
655
656
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
657
		opts: api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
658
659
660
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
661
	req.model.AdapterPaths = runner.model.AdapterPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
662
663
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
664
	req.model.ProjectorPaths = runner.model.ProjectorPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
665
666
667
668
669
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
Michael Yang's avatar
lint  
Michael Yang committed
670
	llm.pingResp = errors.New("foo")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
671
672
673
674
675
676
677
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
678
679
680
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
681
682
683
	require.False(t, resp)
}

684
func TestSchedUnloadAllRunners(t *testing.T) {
685
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
686
687
	defer done()

688
689
	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
	llm2 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
690
	s := InitScheduler(ctx)
691
	s.waitForRecovery = 10 * time.Millisecond
Daniel Hiltgen's avatar
Daniel Hiltgen committed
692
693
	s.unloadAllRunners()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
694
695
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
696

Daniel Hiltgen's avatar
Daniel Hiltgen committed
697
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
698
699
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
700
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
701
702
703
704
705
706
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

707
func TestSchedUnload(t *testing.T) {
708
	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
709
710
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
711
712
713
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
714
	require.Nil(t, r2.model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
715
716
}

717
func TestSchedAlreadyCanceled(t *testing.T) {
718
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
719
720
721
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
722
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0}, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
723
	s := InitScheduler(ctx)
724
	s.waitForRecovery = 10 * time.Millisecond
Daniel Hiltgen's avatar
Daniel Hiltgen committed
725
726
727
728
729
730
731
732
733
734
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

Jesse Gross's avatar
Jesse Gross committed
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
type mockLlm struct {
	modelPath         string
	pingResp          error
	waitResp          error
	completionResp    error
	embeddingResp     []float32
	embeddingRespErr  error
	tokenizeResp      []int
	tokenizeRespErr   error
	detokenizeResp    string
	detonekizeRespErr error
	closeResp         error
	closeCalled       bool
	vramSize          uint64
	totalSize         uint64
750
	vramByGPU         map[ml.DeviceID]uint64
751
752
}

Jesse Gross's avatar
Jesse Gross committed
753
754
func (s *mockLlm) ModelPath() string {
	return s.modelPath
Daniel Hiltgen's avatar
Daniel Hiltgen committed
755
756
}

757
func (s *mockLlm) Load(ctx context.Context, sytemInfo ml.SystemInfo, gpus []ml.DeviceInfo, requireFull bool) ([]ml.DeviceID, error) {
Jesse Gross's avatar
Jesse Gross committed
758
	if requireFull {
759
760
761
762
		if len(gpus) == 0 {
			slog.Info("mockLlm.Load CPU based load")
			return nil, nil
		}
Jesse Gross's avatar
Jesse Gross committed
763
764
		for _, g := range gpus {
			if g.FreeMemory >= s.vramSize {
765
				return []ml.DeviceID{g.DeviceID}, nil
Jesse Gross's avatar
Jesse Gross committed
766
767
768
			}
		}

769
770
771
772
773
		return nil, llm.ErrLoadRequiredFull
	}
	gpuIDs := make([]ml.DeviceID, len(gpus))
	for i := range gpus {
		gpuIDs[i] = gpus[i].DeviceID
Jesse Gross's avatar
Jesse Gross committed
774
	}
775
	return gpuIDs, nil
Jesse Gross's avatar
Jesse Gross committed
776
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
777
778
779
780
781
// Ping returns the configured pingResp; ctx is ignored.
func (s *mockLlm) Ping(ctx context.Context) error {
	return s.pingResp
}

// WaitUntilRunning returns the configured waitResp; ctx is ignored.
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error {
	return s.waitResp
}
// Completion returns the configured completionResp without invoking fn; ctx
// and req are ignored.
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}
Michael Yang's avatar
lint  
Michael Yang committed
782

783
784
func (s *mockLlm) Embedding(ctx context.Context, input string, truncate bool) ([]float32, int, error) {
	return s.embeddingResp, 0, s.embeddingRespErr
Daniel Hiltgen's avatar
Daniel Hiltgen committed
785
}
Michael Yang's avatar
lint  
Michael Yang committed
786

Daniel Hiltgen's avatar
Daniel Hiltgen committed
787
788
789
// Tokenize returns the configured tokenizeResp and tokenizeRespErr; ctx and
// content are ignored.
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	return s.tokenizeResp, s.tokenizeRespErr
}
Michael Yang's avatar
lint  
Michael Yang committed
790

Daniel Hiltgen's avatar
Daniel Hiltgen committed
791
792
793
// Detokenize returns the configured detokenizeResp and detonekizeRespErr; ctx
// and tokens are ignored.
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	return s.detokenizeResp, s.detonekizeRespErr
}
Michael Yang's avatar
lint  
Michael Yang committed
794

Daniel Hiltgen's avatar
Daniel Hiltgen committed
795
796
797
798
// Close records that it was called by setting closeCalled, then returns the
// configured closeResp.
func (s *mockLlm) Close() error {
	s.closeCalled = true
	return s.closeResp
}
799
800
801
802
803
804
805
806
// VRAMSize returns the configured vramSize.
func (s *mockLlm) VRAMSize() uint64 {
	return s.vramSize
}

// TotalSize returns the configured totalSize.
func (s *mockLlm) TotalSize() uint64 {
	return s.totalSize
}

// VRAMByGPU returns the vramByGPU entry for id, or 0 when there is none.
func (s *mockLlm) VRAMByGPU(id ml.DeviceID) uint64 {
	return s.vramByGPU[id]
}

// Pid always returns -1.
func (s *mockLlm) Pid() int {
	return -1
}

// GetPort always returns -1.
func (s *mockLlm) GetPort() int {
	return -1
}

// GetDeviceInfos always returns nil; ctx is ignored.
func (s *mockLlm) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
	return nil
}

// HasExited always returns false.
func (s *mockLlm) HasExited() bool {
	return false
}

// GetActiveDeviceIDs always returns nil.
func (s *mockLlm) GetActiveDeviceIDs() []ml.DeviceID {
	return nil
}