sched_test.go 24.4 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
package server

import (
	"bytes"
	"context"
Michael Yang's avatar
lint  
Michael Yang committed
6
	"errors"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
7
8
9
10
11
	"log/slog"
	"os"
	"testing"
	"time"

Michael Yang's avatar
lint  
Michael Yang committed
12
13
	"github.com/stretchr/testify/require"

Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
15
	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
16
	"github.com/ollama/ollama/discover"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
17
	"github.com/ollama/ollama/format"
Michael Yang's avatar
Michael Yang committed
18
	"github.com/ollama/ollama/fs/ggml"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
19
	"github.com/ollama/ollama/llm"
20
	"github.com/ollama/ollama/ml"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
21
22
)

Michael Yang's avatar
lint  
Michael Yang committed
23
func TestMain(m *testing.M) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
24
25
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
Michael Yang's avatar
lint  
Michael Yang committed
26
	os.Exit(m.Run())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
27
28
}

29
func TestSchedInit(t *testing.T) {
30
	ctx, done := context.WithCancel(t.Context())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
31
32
	defer done()
	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
	require.NotNil(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
35
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
36
37
}

38
func TestSchedLoad(t *testing.T) {
39
	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
40
41
	defer done()
	s := InitScheduler(ctx)
42
	s.waitForRecovery = 10 * time.Millisecond
Michael Yang's avatar
Michael Yang committed
43
	var f *ggml.GGML // value not used in tests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
44
45
46
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
47
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
48
49
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
50
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
51
52
	}
	// Fail to load model first
Michael Yang's avatar
Michael Yang committed
53
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Michael Yang's avatar
lint  
Michael Yang committed
54
		return nil, errors.New("something failed to load model blah")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
55
	}
56
	gpus := discover.GpuInfoList{}
Jesse Gross's avatar
Jesse Gross committed
57
	s.load(req, f, gpus, false)
Michael Yang's avatar
lint  
Michael Yang committed
58
	require.Empty(t, req.successCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
59
	require.Len(t, req.errCh, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
61
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
62
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
63
64
65
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

66
	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
Michael Yang's avatar
Michael Yang committed
67
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Jesse Gross's avatar
Jesse Gross committed
68
		server.modelPath = model
Daniel Hiltgen's avatar
Daniel Hiltgen committed
69
70
		return server, nil
	}
Jesse Gross's avatar
Jesse Gross committed
71
	s.load(req, f, gpus, false)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
72
73
74
75
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
Jesse Gross's avatar
Jesse Gross committed
76
		require.Equal(t, uint64(10), resp.vramSize)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
		require.Equal(t, uint(1), resp.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
79
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
80
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
81
82
83
	}

	req.model.ModelPath = "dummy_model_path"
Michael Yang's avatar
lint  
Michael Yang committed
84
	server.waitResp = errors.New("wait failure")
Jesse Gross's avatar
Jesse Gross committed
85
	s.load(req, f, gpus, false)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
86
87
88
89
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
		t.Fatalf("unexpected success %v", resp)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
	runner := s.loaded["dummy_model_path"]
Daniel Hiltgen's avatar
Daniel Hiltgen committed
94
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
95
96
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
97
	time.Sleep(1 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
98
99
100
	require.Len(t, s.expiredCh, 1)
}

101
type reqBundle struct {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
102
103
104
105
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
Michael Yang's avatar
Michael Yang committed
106
	f       *ggml.GGML
Daniel Hiltgen's avatar
Daniel Hiltgen committed
107
108
}

Michael Yang's avatar
Michael Yang committed
109
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Jesse Gross's avatar
Jesse Gross committed
110
	scenario.srv.modelPath = model
Daniel Hiltgen's avatar
Daniel Hiltgen committed
111
112
113
	return scenario.srv, nil
}

114
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vramSize uint64, duration *api.Duration, vramByGPU map[ml.DeviceID]uint64) *reqBundle {
115
116
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
117
118
	t.Helper()

119
	p, _ := createBinFile(t, ggml.KV{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
120
121
122
123
124
125
126
127
128
		"general.architecture":          "llama",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
129
	}, []*ggml.Tensor{
130
131
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
132
	})
133

134
135
136
137
138
139
	model := &Model{Name: modelName, ModelPath: p}
	f, err := llm.LoadModel(model.ModelPath, 0)
	if err != nil {
		t.Fatal(err)
	}
	b.f = f
140
141
142
143
144
	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
145
		model:           model,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
146
		opts:            api.DefaultOptions(),
147
		sessionDuration: duration,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
148
149
150
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
151
	b.srv = &mockLlm{vramSize: vramSize, vramByGPU: vramByGPU}
152
	return b
Daniel Hiltgen's avatar
Daniel Hiltgen committed
153
154
}

155
156
157
// getGpuFn is a test stand-in for the scheduler's GPU discovery hook. It logs
// the call and reports a single device with Library "metal",
// 24*format.GigaByte total memory and 12*format.GigaByte free.
func getGpuFn(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
	slog.Info("test getGpuFn called", "runners", runners)

	metal := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
	metal.FreeMemory = 12 * format.GigaByte
	metal.TotalMemory = 24 * format.GigaByte

	return discover.GpuInfoList{metal}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
162

163
164
165
// getCpuFn is a test stand-in for the scheduler's CPU discovery hook. It logs
// the call and reports a device with Library "cpu", 32*format.GigaByte total
// memory and 26*format.GigaByte free.
func getCpuFn() discover.GpuInfo {
	slog.Info("test getCpuFn called")

	cpu := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "cpu"}}
	cpu.FreeMemory = 26 * format.GigaByte
	cpu.TotalMemory = 32 * format.GigaByte

	return cpu
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
170

171
func TestSchedRequestsSameModelSameRequest(t *testing.T) {
172
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
173
	defer done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
174
	s := InitScheduler(ctx)
175
	s.waitForRecovery = 10 * time.Millisecond
176
177
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
178
179
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0}, nil)
180
	b.req.model = a.req.model
Michael Yang's avatar
Michael Yang committed
181
	b.f = a.f
182
183
184
185

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
186
187
188
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
189
190
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
191
		require.Empty(t, s.pendingReqCh)
192
193
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
194
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
195
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
196
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
197
198
199
	}

	// Same runner as first request due to not needing a reload
200
201
202
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
203
	select {
204
205
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
206
		require.Empty(t, s.pendingReqCh)
207
208
209
210
211
212
213
214
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

215
func TestSchedRequestsSimpleReloadSameModel(t *testing.T) {
216
	ctx, done := context.WithTimeout(t.Context(), 5000*time.Millisecond)
217
218
	defer done()
	s := InitScheduler(ctx)
219
	s.waitForRecovery = 10 * time.Millisecond
220
221
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
222
223
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond}, nil)
224
225
	tmpModel := *a.req.model
	b.req.model = &tmpModel
Michael Yang's avatar
Michael Yang committed
226
	b.f = a.f
227
228
229
230
231
232
233
234
235
236
237
238

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
239
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
240
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
241
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
242
243
244
	}

	// Trigger a reload
245
246
247
248
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
249
250
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
251
	a.ctxDone()
252
253
254
255
256
257
258
259
260
	// Report recovered VRAM usage
	time.Sleep(1 * time.Millisecond)
	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
		slog.Info("XXX altered getGpuFn called")
		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 24 * format.GigaByte
		return []discover.GpuInfo{g}
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
261
	select {
262
263
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
264
		require.Empty(t, s.pendingReqCh)
265
266
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
267
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
268
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
269
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
270
	}
271
272
}

273
func TestSchedRequestsMultipleLoadedModels(t *testing.T) {
274
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
275
276
	defer done()
	s := InitScheduler(ctx)
277
	s.waitForRecovery = 10 * time.Millisecond
278
279
	s.getGpuFn = getGpuFn // 1 metal GPU
	s.getCpuFn = getCpuFn // 1 CPU
280
281

	// Multiple loaded models
282
283
284
285
286
287
288
289
	a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 1 * format.GigaByte})
	a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
	b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 10 * format.GigaByte})
	b.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
	c := newScenarioRequest(t, ctx, "model-c-10g-cpu", 10*format.GigaByte, nil, nil /* No GPU load */)
	c.req.opts.NumGPU = 0                                                                                                                         // CPU load, will be allowed
	b.req.sessionDuration = &api.Duration{Duration: 10 * time.Millisecond}                                                                        // longer than b to cause the scheduler to favor unloading b over c
	d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 13 * format.GigaByte}) // Needs prior unloaded
Daniel Hiltgen's avatar
Daniel Hiltgen committed
290

Michael Yang's avatar
int  
Michael Yang committed
291
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
292
293
294
295
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
296
	select {
297
298
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
299
		require.Empty(t, s.pendingReqCh)
300
301
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
302
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
303
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
304
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
305
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
306
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
307
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
308
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
309

Michael Yang's avatar
int  
Michael Yang committed
310
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
311
312
313
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
314
	select {
315
316
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
317
		require.Empty(t, s.pendingReqCh)
318
319
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
320
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
321
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
322
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
323
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
324
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
325
	require.Len(t, s.loaded, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
326
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
327

Daniel Hiltgen's avatar
Daniel Hiltgen committed
328
	// This is a CPU load with NumGPU = 0 so it should load
329
330
331
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
332
	select {
333
334
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
Michael Yang's avatar
lint  
Michael Yang committed
335
		require.Empty(t, s.pendingReqCh)
336
337
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
338
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
339
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
340
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
341
342
343
344
345
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

346
	// Try to load a model that won't fit
347
348
	s.newServerFn = d.newServer
	slog.Info("d")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
349
350
351
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
352
	a.ctxDone() // Won't help since this one isn't big enough to make room
Daniel Hiltgen's avatar
Daniel Hiltgen committed
353
	time.Sleep(2 * time.Millisecond)
354
	s.pendingReqCh <- d.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
355
356
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
357
358
359
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
360
	// Mark b done so it can unload
361
	b.ctxDone()
362
363
364
365
366
367
368
369
	// Report recovered VRAM usage so scheduler will finish waiting and unload
	time.Sleep(1 * time.Millisecond)
	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 24 * format.GigaByte
		return []discover.GpuInfo{g}
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
370
	select {
371
372
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
Michael Yang's avatar
lint  
Michael Yang committed
373
		require.Empty(t, s.pendingReqCh)
374
		require.Empty(t, d.req.errCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
375
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
376
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
377
	}
378
379
380
381
382
383
384
385
386
387
388
389
390
	// Wait for b to close
closeWait:
	for {
		select {
		case <-ctx.Done():
			t.Fatal("timeout")
		default:
			if b.srv.closeCalled {
				break closeWait
			}
			time.Sleep(1 * time.Millisecond)
		}
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
391
392
393
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
394
395
}

396
func TestSchedGetRunner(t *testing.T) {
397
	ctx, done := context.WithTimeout(t.Context(), 3*time.Second)
398
399
	defer done()

400
401
402
	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond}, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond}, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond}, nil)
Michael Yang's avatar
int  
Michael Yang committed
403
	t.Setenv("OLLAMA_MAX_QUEUE", "1")
404
	s := InitScheduler(ctx)
405
	s.waitForRecovery = 10 * time.Millisecond
406
407
408
409
410
411
412
413
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
414
	require.Len(t, s.pendingReqCh, 1)
Michael Yang's avatar
lint  
Michael Yang committed
415
	require.Empty(t, successCh1b)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
416
417
418
419
420
421
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
422
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
423
424
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
425
426
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
427
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
428
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
429
	}
430
	a.ctxDone() // Set "a" model to idle so it can unload
Daniel Hiltgen's avatar
Daniel Hiltgen committed
431
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
432
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
433
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
434

435
436
437
	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
438
	// Starts in pending channel, then should be quickly processed to return an error
439
	time.Sleep(50 * time.Millisecond) // Long enough for the "a" model to expire and unload
Michael Yang's avatar
lint  
Michael Yang committed
440
	require.Empty(t, successCh1c)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
441
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
442
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
443
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
444
445
446
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
447
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
448
449
}

450
func TestSchedExpireRunner(t *testing.T) {
451
	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
Patrick Devine's avatar
Patrick Devine committed
452
453
	defer done()
	s := InitScheduler(ctx)
454
	s.waitForRecovery = 10 * time.Millisecond
Patrick Devine's avatar
Patrick Devine committed
455
456
457
458
459
460
461
462
463
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
	}

Michael Yang's avatar
Michael Yang committed
464
	var f *ggml.GGML
465
	gpus := discover.GpuInfoList{}
466
	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
Michael Yang's avatar
Michael Yang committed
467
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Jesse Gross's avatar
Jesse Gross committed
468
		server.modelPath = model
Patrick Devine's avatar
Patrick Devine committed
469
470
		return server, nil
	}
Jesse Gross's avatar
Jesse Gross committed
471
	s.load(req, f, gpus, false)
Patrick Devine's avatar
Patrick Devine committed
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497

	select {
	case err := <-req.errCh:
		if err != nil {
			t.Fatalf("expected no errors when loading, got '%s'", err.Error())
		}
	case resp := <-req.successCh:
		s.loadedMu.Lock()
		if resp.refCount != uint(1) || len(s.loaded) != 1 {
			t.Fatalf("expected a model to be loaded")
		}
		s.loadedMu.Unlock()
	}

	s.expireRunner(&Model{ModelPath: "foo"})

	s.finishedReqCh <- req
	s.processCompleted(ctx)

	s.loadedMu.Lock()
	if len(s.loaded) != 0 {
		t.Fatalf("expected model to be unloaded")
	}
	s.loadedMu.Unlock()
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
498
// TODO - add one scenario that triggers the bogus finished event with positive ref count
499
func TestSchedPrematureExpired(t *testing.T) {
500
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
501
502
503
	defer done()

	// Same model, same request
504
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
505
	s := InitScheduler(ctx)
506
	s.waitForRecovery = 10 * time.Millisecond
507
508
	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
509
510
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
511
		return []discover.GpuInfo{g}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
512
513
514
515
516
517
518
519
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
520
521
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
522
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
523
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
524
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
525
526
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
527
528
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
529
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
530
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
531
	}
532
	time.Sleep(scenario1a.req.sessionDuration.Duration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
533
534
535
536
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
Michael Yang's avatar
lint  
Michael Yang committed
537
	require.Empty(t, s.finishedReqCh)
538
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
539
	require.Empty(t, s.loaded)
540
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
541
542
543
544
545
546

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

547
func TestSchedUseLoadedRunner(t *testing.T) {
548
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
549
550
	req := &LlmRequest{
		ctx:             ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
551
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
552
		successCh:       make(chan *runnerRef, 1),
553
		sessionDuration: &api.Duration{Duration: 2},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
554
555
	}
	finished := make(chan *LlmRequest)
556
	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
557
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
558
559
560
561
562
563
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
564
565
	case err := <-req.errCh:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
566
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
567
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
568
569
570
571
572
573
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

574
func TestSchedUpdateFreeSpace(t *testing.T) {
575
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
576
	defer done()
577
	gpus := discover.GpuInfoList{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
578
		{
579
580
581
			DeviceID: ml.DeviceID{
				ID: "1",
			},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
582
583
		},
		{
584
585
586
			DeviceID: ml.DeviceID{
				ID: "2",
			},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
587
588
589
590
591
592
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
593
594
595
596
597
598
599
600
601
602
603
604
	gpuIDs := []ml.DeviceID{
		{
			ID: "1",
		},
		{
			ID: "2",
		},
	}
	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{{ID: "1"}: 50, {ID: "2"}: 50}}
	llm2 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{{ID: "1"}: 125, {ID: "2"}: 75}}
	r1 := &runnerRef{llama: llm1, gpus: gpuIDs, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpuIDs, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
605
606

	s := InitScheduler(ctx)
607
	s.waitForRecovery = 10 * time.Millisecond
Daniel Hiltgen's avatar
Daniel Hiltgen committed
608
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
609
610
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
611
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
612
613

	s.updateFreeSpace(gpus)
614
615
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
616
617
}

618
func TestSchedFindRunnerToUnload(t *testing.T) {
619
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
620
	defer done()
621

Daniel Hiltgen's avatar
Daniel Hiltgen committed
622
623
	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
624
625

	s := InitScheduler(ctx)
626
	s.waitForRecovery = 10 * time.Millisecond
Daniel Hiltgen's avatar
Daniel Hiltgen committed
627
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
628
629
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
630
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
631

632
	resp := s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
633
634
	require.Equal(t, r2, resp)
	r2.refCount = 1
635
	resp = s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
636
637
638
	require.Equal(t, r1, resp)
}

639
func TestSchedNeedsReload(t *testing.T) {
640
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
641
642
	defer done()

643
	llm := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
644
	do := api.DefaultOptions()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
645
	runner := &runnerRef{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
646
647
648
649
650
651
652
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
653
654
655
656
657
658
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
659
		opts: api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
660
661
662
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
663
	req.model.AdapterPaths = runner.model.AdapterPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
664
665
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
666
	req.model.ProjectorPaths = runner.model.ProjectorPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
667
668
669
670
671
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
Michael Yang's avatar
lint  
Michael Yang committed
672
	llm.pingResp = errors.New("foo")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
673
674
675
676
677
678
679
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
680
681
682
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
683
684
685
	require.False(t, resp)
}

686
func TestSchedUnloadAllRunners(t *testing.T) {
687
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
688
689
	defer done()

690
691
	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
	llm2 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
692
	s := InitScheduler(ctx)
693
	s.waitForRecovery = 10 * time.Millisecond
Daniel Hiltgen's avatar
Daniel Hiltgen committed
694
695
	s.unloadAllRunners()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
696
697
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
698

Daniel Hiltgen's avatar
Daniel Hiltgen committed
699
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
700
701
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
702
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
703
704
705
706
707
708
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

709
func TestSchedUnload(t *testing.T) {
710
	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
711
712
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
713
714
715
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
716
	require.Nil(t, r2.model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
717
718
}

719
func TestSchedAlreadyCanceled(t *testing.T) {
720
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
721
722
723
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
724
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0}, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
725
	s := InitScheduler(ctx)
726
	s.waitForRecovery = 10 * time.Millisecond
Daniel Hiltgen's avatar
Daniel Hiltgen committed
727
728
729
730
731
732
733
734
735
736
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

Jesse Gross's avatar
Jesse Gross committed
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
type mockLlm struct {
	modelPath         string
	pingResp          error
	waitResp          error
	completionResp    error
	embeddingResp     []float32
	embeddingRespErr  error
	tokenizeResp      []int
	tokenizeRespErr   error
	detokenizeResp    string
	detonekizeRespErr error
	closeResp         error
	closeCalled       bool
	vramSize          uint64
	totalSize         uint64
752
	vramByGPU         map[ml.DeviceID]uint64
753
754
}

Jesse Gross's avatar
Jesse Gross committed
755
756
func (s *mockLlm) ModelPath() string {
	return s.modelPath
Daniel Hiltgen's avatar
Daniel Hiltgen committed
757
758
}

759
func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
Jesse Gross's avatar
Jesse Gross committed
760
761
762
	if requireFull {
		for _, g := range gpus {
			if g.FreeMemory >= s.vramSize {
763
				return []ml.DeviceID{g.DeviceID}, nil
Jesse Gross's avatar
Jesse Gross committed
764
765
766
			}
		}

767
768
769
770
771
		return nil, llm.ErrLoadRequiredFull
	}
	gpuIDs := make([]ml.DeviceID, len(gpus))
	for i := range gpus {
		gpuIDs[i] = gpus[i].DeviceID
Jesse Gross's avatar
Jesse Gross committed
772
	}
773
	return gpuIDs, nil
Jesse Gross's avatar
Jesse Gross committed
774
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
775
776
777
778
779
// Ping returns the configured pingResp.
func (s *mockLlm) Ping(ctx context.Context) error {
	return s.pingResp
}
// WaitUntilRunning returns the configured waitResp without waiting.
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error {
	return s.waitResp
}
// Completion returns the configured completionResp; fn is never invoked.
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	err := s.completionResp
	return err
}
Michael Yang's avatar
lint  
Michael Yang committed
780

781
782
func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) {
	return s.embeddingResp, s.embeddingRespErr
Daniel Hiltgen's avatar
Daniel Hiltgen committed
783
}
Michael Yang's avatar
lint  
Michael Yang committed
784

Daniel Hiltgen's avatar
Daniel Hiltgen committed
785
786
787
// Tokenize returns the configured tokenizeResp and tokenizeRespErr,
// ignoring content.
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	tokens, err := s.tokenizeResp, s.tokenizeRespErr
	return tokens, err
}
Michael Yang's avatar
lint  
Michael Yang committed
788

Daniel Hiltgen's avatar
Daniel Hiltgen committed
789
790
791
// Detokenize returns the configured detokenizeResp and detonekizeRespErr,
// ignoring tokens.
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	text, err := s.detokenizeResp, s.detonekizeRespErr
	return text, err
}
Michael Yang's avatar
lint  
Michael Yang committed
792

Daniel Hiltgen's avatar
Daniel Hiltgen committed
793
794
795
796
// Close records that it was called by setting closeCalled, then returns the
// configured closeResp.
func (s *mockLlm) Close() error {
	err := s.closeResp
	s.closeCalled = true
	return err
}
797
798
799
800
801
802
803
804
// VRAMSize returns the configured vramSize.
func (s *mockLlm) VRAMSize() uint64 {
	return s.vramSize
}
// TotalSize returns the configured totalSize.
func (s *mockLlm) TotalSize() uint64 {
	return s.totalSize
}
// VRAMByGPU looks id up in the vramByGPU map; an id with no entry yields 0.
func (s *mockLlm) VRAMByGPU(id ml.DeviceID) uint64 {
	used := s.vramByGPU[id]
	return used
}
// Pid always returns -1.
func (s *mockLlm) Pid() int {
	return -1
}
// GetPort always returns -1.
func (s *mockLlm) GetPort() int {
	return -1
}
// GetDeviceInfos always returns nil.
func (s *mockLlm) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
	return nil
}
// HasExited always returns false.
func (s *mockLlm) HasExited() bool {
	return false
}
// GetActiveDeviceIDs always returns nil.
func (s *mockLlm) GetActiveDeviceIDs() []ml.DeviceID {
	return nil
}