sched_test.go 23.8 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
package server

import (
	"bytes"
	"context"
Michael Yang's avatar
lint  
Michael Yang committed
6
	"errors"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
7
8
9
10
11
	"log/slog"
	"os"
	"testing"
	"time"

Michael Yang's avatar
lint  
Michael Yang committed
12
13
	"github.com/stretchr/testify/require"

Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
15
	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
16
	"github.com/ollama/ollama/discover"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
17
	"github.com/ollama/ollama/format"
Michael Yang's avatar
Michael Yang committed
18
	"github.com/ollama/ollama/fs/ggml"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
19
	"github.com/ollama/ollama/llm"
20
	"github.com/ollama/ollama/ml"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
21
22
)

Michael Yang's avatar
lint  
Michael Yang committed
23
func TestMain(m *testing.M) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
24
25
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
Michael Yang's avatar
lint  
Michael Yang committed
26
	os.Exit(m.Run())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
27
28
29
}

func TestInitScheduler(t *testing.T) {
30
	ctx, done := context.WithCancel(t.Context())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
31
32
	defer done()
	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
	require.NotNil(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
35
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
36
37
38
}

func TestLoad(t *testing.T) {
39
	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
40
41
	defer done()
	s := InitScheduler(ctx)
Michael Yang's avatar
Michael Yang committed
42
	var f *ggml.GGML // value not used in tests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
43
44
45
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
47
48
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
49
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
50
51
	}
	// Fail to load model first
Michael Yang's avatar
Michael Yang committed
52
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Michael Yang's avatar
lint  
Michael Yang committed
53
		return nil, errors.New("something failed to load model blah")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
54
	}
55
	gpus := discover.GpuInfoList{}
Jesse Gross's avatar
Jesse Gross committed
56
	s.load(req, f, gpus, false)
Michael Yang's avatar
lint  
Michael Yang committed
57
	require.Empty(t, req.successCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
58
	require.Len(t, req.errCh, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
59
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
60
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
61
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
62
63
64
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

65
	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
Michael Yang's avatar
Michael Yang committed
66
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Jesse Gross's avatar
Jesse Gross committed
67
		server.modelPath = model
Daniel Hiltgen's avatar
Daniel Hiltgen committed
68
69
		return server, nil
	}
Jesse Gross's avatar
Jesse Gross committed
70
	s.load(req, f, gpus, false)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
71
72
73
74
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
Jesse Gross's avatar
Jesse Gross committed
75
		require.Equal(t, uint64(10), resp.vramSize)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
		require.Equal(t, uint(1), resp.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
79
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
80
81
82
	}

	req.model.ModelPath = "dummy_model_path"
Michael Yang's avatar
lint  
Michael Yang committed
83
	server.waitResp = errors.New("wait failure")
Jesse Gross's avatar
Jesse Gross committed
84
	s.load(req, f, gpus, false)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
85
86
87
88
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
		t.Fatalf("unexpected success %v", resp)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
	runner := s.loaded["dummy_model_path"]
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
94
95
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
96
	time.Sleep(1 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
97
98
99
	require.Len(t, s.expiredCh, 1)
}

100
type reqBundle struct {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
101
102
103
104
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
Michael Yang's avatar
Michael Yang committed
105
	f       *ggml.GGML
Daniel Hiltgen's avatar
Daniel Hiltgen committed
106
107
}

Michael Yang's avatar
Michael Yang committed
108
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Jesse Gross's avatar
Jesse Gross committed
109
	scenario.srv.modelPath = model
Daniel Hiltgen's avatar
Daniel Hiltgen committed
110
111
112
	return scenario.srv, nil
}

113
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, vramSize uint64, duration *api.Duration, vramByGPU map[ml.DeviceID]uint64) *reqBundle {
114
115
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
116
117
	t.Helper()

118
	p, _ := createBinFile(t, ggml.KV{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
119
120
121
122
123
124
125
126
127
		"general.architecture":          "llama",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
128
	}, []*ggml.Tensor{
129
130
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
131
	})
132

133
134
135
136
137
138
	model := &Model{Name: modelName, ModelPath: p}
	f, err := llm.LoadModel(model.ModelPath, 0)
	if err != nil {
		t.Fatal(err)
	}
	b.f = f
139
140
141
142
143
	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
144
		model:           model,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
145
		opts:            api.DefaultOptions(),
146
		sessionDuration: duration,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
147
148
149
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
150
	b.srv = &mockLlm{vramSize: vramSize, vramByGPU: vramByGPU}
151
	return b
Daniel Hiltgen's avatar
Daniel Hiltgen committed
152
153
}

154
155
156
// getGpuFn is the default GPU discovery stub for these tests. It logs the call
// and reports a single "metal" device with 24 GiB total and 12 GiB free memory.
func getGpuFn(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
	slog.Info("test getGpuFn called", "runners", runners)

	metal := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
	metal.TotalMemory = 24 * format.GigaByte
	metal.FreeMemory = 12 * format.GigaByte

	return discover.GpuInfoList{metal}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
161

162
163
164
// getCpuFn is the default CPU discovery stub for these tests. It logs the call
// and reports a "cpu" device with 32 GiB total and 26 GiB free memory.
func getCpuFn() discover.GpuInfo {
	slog.Info("test getCpuFn called")

	cpu := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "cpu"}}
	cpu.TotalMemory = 32 * format.GigaByte
	cpu.FreeMemory = 26 * format.GigaByte

	return cpu
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
169

170
func TestRequestsSameModelSameRequest(t *testing.T) {
171
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
172
	defer done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
173
	s := InitScheduler(ctx)
174
175
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
176
177
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0}, nil)
178
	b.req.model = a.req.model
Michael Yang's avatar
Michael Yang committed
179
	b.f = a.f
180
181
182
183

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
184
185
186
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
187
188
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
189
		require.Empty(t, s.pendingReqCh)
190
191
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
192
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
193
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
194
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
195
196
197
	}

	// Same runner as first request due to not needing a reload
198
199
200
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
201
	select {
202
203
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
204
		require.Empty(t, s.pendingReqCh)
205
206
207
208
209
210
211
212
213
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

func TestRequestsSimpleReloadSameModel(t *testing.T) {
214
	ctx, done := context.WithTimeout(t.Context(), 5000*time.Millisecond)
215
216
217
218
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
219
220
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond}, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond}, nil)
221
222
	tmpModel := *a.req.model
	b.req.model = &tmpModel
Michael Yang's avatar
Michael Yang committed
223
	b.f = a.f
224
225
226
227
228
229
230
231
232
233
234
235

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
236
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
237
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
238
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
239
240
241
	}

	// Trigger a reload
242
243
244
245
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
246
247
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
248
	a.ctxDone()
249
250
251
252
253
254
255
256
257
	// Report recovered VRAM usage
	time.Sleep(1 * time.Millisecond)
	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
		slog.Info("XXX altered getGpuFn called")
		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 24 * format.GigaByte
		return []discover.GpuInfo{g}
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
258
	select {
259
260
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
261
		require.Empty(t, s.pendingReqCh)
262
263
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
264
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
265
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
266
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
267
	}
268
269
270
}

func TestRequestsMultipleLoadedModels(t *testing.T) {
271
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
272
273
	defer done()
	s := InitScheduler(ctx)
274
275
	s.getGpuFn = getGpuFn // 1 metal GPU
	s.getCpuFn = getCpuFn // 1 CPU
276
277

	// Multiple loaded models
278
279
280
281
282
283
284
285
	a := newScenarioRequest(t, ctx, "model-a-1g-gpu", 1*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 1 * format.GigaByte})
	a.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
	b := newScenarioRequest(t, ctx, "model-b-10g-gpu", 10*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 10 * format.GigaByte})
	b.req.sessionDuration = &api.Duration{Duration: 5 * time.Millisecond}
	c := newScenarioRequest(t, ctx, "model-c-10g-cpu", 10*format.GigaByte, nil, nil /* No GPU load */)
	c.req.opts.NumGPU = 0                                                                                                                         // CPU load, will be allowed
	b.req.sessionDuration = &api.Duration{Duration: 10 * time.Millisecond}                                                                        // longer than b to cause the scheduler to favor unloading b over c
	d := newScenarioRequest(t, ctx, "model-d-10g-gpu", 13*format.GigaByte, nil, map[ml.DeviceID]uint64{{Library: "metal"}: 13 * format.GigaByte}) // Needs prior unloaded
Daniel Hiltgen's avatar
Daniel Hiltgen committed
286

Michael Yang's avatar
int  
Michael Yang committed
287
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
288
289
290
291
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
292
	select {
293
294
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
295
		require.Empty(t, s.pendingReqCh)
296
297
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
298
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
299
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
300
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
301
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
302
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
303
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
304
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
305

Michael Yang's avatar
int  
Michael Yang committed
306
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
307
308
309
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
310
	select {
311
312
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
313
		require.Empty(t, s.pendingReqCh)
314
315
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
316
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
317
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
318
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
319
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
320
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
321
	require.Len(t, s.loaded, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
322
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
323

Daniel Hiltgen's avatar
Daniel Hiltgen committed
324
	// This is a CPU load with NumGPU = 0 so it should load
325
326
327
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
328
	select {
329
330
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
Michael Yang's avatar
lint  
Michael Yang committed
331
		require.Empty(t, s.pendingReqCh)
332
333
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
334
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
335
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
336
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
337
338
339
340
341
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

342
	// Try to load a model that won't fit
343
344
	s.newServerFn = d.newServer
	slog.Info("d")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
345
346
347
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
348
	a.ctxDone() // Won't help since this one isn't big enough to make room
Daniel Hiltgen's avatar
Daniel Hiltgen committed
349
	time.Sleep(2 * time.Millisecond)
350
	s.pendingReqCh <- d.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
351
352
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
353
354
355
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
356
	// Mark b done so it can unload
357
	b.ctxDone()
358
359
360
361
362
363
364
365
	// Report recovered VRAM usage so scheduler will finish waiting and unload
	time.Sleep(1 * time.Millisecond)
	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 24 * format.GigaByte
		return []discover.GpuInfo{g}
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
366
	select {
367
368
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
Michael Yang's avatar
lint  
Michael Yang committed
369
		require.Empty(t, s.pendingReqCh)
370
		require.Empty(t, d.req.errCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
371
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
372
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
373
	}
374
375
376
377
378
379
380
381
382
383
384
385
386
	// Wait for b to close
closeWait:
	for {
		select {
		case <-ctx.Done():
			t.Fatal("timeout")
		default:
			if b.srv.closeCalled {
				break closeWait
			}
			time.Sleep(1 * time.Millisecond)
		}
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
387
388
389
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
390
391
}

392
func TestGetRunner(t *testing.T) {
393
	ctx, done := context.WithTimeout(t.Context(), 3*time.Second)
394
395
	defer done()

396
397
398
	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond}, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond}, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond}, nil)
Michael Yang's avatar
int  
Michael Yang committed
399
	t.Setenv("OLLAMA_MAX_QUEUE", "1")
400
401
402
403
404
405
406
407
408
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
409
	require.Len(t, s.pendingReqCh, 1)
Michael Yang's avatar
lint  
Michael Yang committed
410
	require.Empty(t, successCh1b)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
411
412
413
414
415
416
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
417
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
418
419
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
420
421
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
422
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
423
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
424
	}
425
	a.ctxDone() // Set "a" model to idle so it can unload
Daniel Hiltgen's avatar
Daniel Hiltgen committed
426
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
427
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
428
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
429

430
431
432
	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
433
	// Starts in pending channel, then should be quickly processed to return an error
434
	time.Sleep(50 * time.Millisecond) // Long enough for the "a" model to expire and unload
Michael Yang's avatar
lint  
Michael Yang committed
435
	require.Empty(t, successCh1c)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
436
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
437
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
438
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
439
440
441
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
442
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
443
444
}

Patrick Devine's avatar
Patrick Devine committed
445
func TestExpireRunner(t *testing.T) {
446
	ctx, done := context.WithTimeout(t.Context(), 20*time.Millisecond)
Patrick Devine's avatar
Patrick Devine committed
447
448
449
450
451
452
453
454
455
456
457
	defer done()
	s := InitScheduler(ctx)
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
	}

Michael Yang's avatar
Michael Yang committed
458
	var f *ggml.GGML
459
	gpus := discover.GpuInfoList{}
460
	server := &mockLlm{vramSize: 10, vramByGPU: map[ml.DeviceID]uint64{}}
Michael Yang's avatar
Michael Yang committed
461
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Jesse Gross's avatar
Jesse Gross committed
462
		server.modelPath = model
Patrick Devine's avatar
Patrick Devine committed
463
464
		return server, nil
	}
Jesse Gross's avatar
Jesse Gross committed
465
	s.load(req, f, gpus, false)
Patrick Devine's avatar
Patrick Devine committed
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491

	select {
	case err := <-req.errCh:
		if err != nil {
			t.Fatalf("expected no errors when loading, got '%s'", err.Error())
		}
	case resp := <-req.successCh:
		s.loadedMu.Lock()
		if resp.refCount != uint(1) || len(s.loaded) != 1 {
			t.Fatalf("expected a model to be loaded")
		}
		s.loadedMu.Unlock()
	}

	s.expireRunner(&Model{ModelPath: "foo"})

	s.finishedReqCh <- req
	s.processCompleted(ctx)

	s.loadedMu.Lock()
	if len(s.loaded) != 0 {
		t.Fatalf("expected model to be unloaded")
	}
	s.loadedMu.Unlock()
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
492
493
// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
494
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
495
496
497
	defer done()

	// Same model, same request
498
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
499
	s := InitScheduler(ctx)
500
501
	s.getGpuFn = func(ctx context.Context, runners []discover.FilteredRunnerDiscovery) discover.GpuInfoList {
		g := discover.GpuInfo{DeviceID: ml.DeviceID{Library: "metal"}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
502
503
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
504
		return []discover.GpuInfo{g}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
505
506
507
508
509
510
511
512
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
513
514
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
515
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
516
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
517
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
518
519
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
520
521
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
522
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
523
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
524
	}
525
	time.Sleep(scenario1a.req.sessionDuration.Duration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
526
527
528
529
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
Michael Yang's avatar
lint  
Michael Yang committed
530
	require.Empty(t, s.finishedReqCh)
531
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
532
	require.Empty(t, s.loaded)
533
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
534
535
536
537
538
539
540

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
541
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
542
543
	req := &LlmRequest{
		ctx:             ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
544
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
545
		successCh:       make(chan *runnerRef, 1),
546
		sessionDuration: &api.Duration{Duration: 2},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
547
548
	}
	finished := make(chan *LlmRequest)
549
	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
550
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
551
552
553
554
555
556
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
557
558
	case err := <-req.errCh:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
559
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
560
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
561
562
563
564
565
566
567
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
568
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
569
	defer done()
570
	gpus := discover.GpuInfoList{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
571
		{
572
573
574
			DeviceID: ml.DeviceID{
				ID: "1",
			},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
575
576
		},
		{
577
578
579
			DeviceID: ml.DeviceID{
				ID: "2",
			},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
580
581
582
583
584
585
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
586
587
588
589
590
591
592
593
594
595
596
597
	gpuIDs := []ml.DeviceID{
		{
			ID: "1",
		},
		{
			ID: "2",
		},
	}
	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{{ID: "1"}: 50, {ID: "2"}: 50}}
	llm2 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{{ID: "1"}: 125, {ID: "2"}: 75}}
	r1 := &runnerRef{llama: llm1, gpus: gpuIDs, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpuIDs, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
598
599

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
600
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
601
602
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
603
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
604
605

	s.updateFreeSpace(gpus)
606
607
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
608
609
610
}

func TestFindRunnerToUnload(t *testing.T) {
611
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
612
	defer done()
613

Daniel Hiltgen's avatar
Daniel Hiltgen committed
614
615
	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
616
617

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
618
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
619
620
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
621
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
622

623
	resp := s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
624
625
	require.Equal(t, r2, resp)
	r2.refCount = 1
626
	resp = s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
627
628
629
630
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
631
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
632
633
	defer done()

634
	llm := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
635
	do := api.DefaultOptions()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
636
	runner := &runnerRef{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
637
638
639
640
641
642
643
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
644
645
646
647
648
649
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
650
		opts: api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
651
652
653
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
654
	req.model.AdapterPaths = runner.model.AdapterPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
655
656
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
657
	req.model.ProjectorPaths = runner.model.ProjectorPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
658
659
660
661
662
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
Michael Yang's avatar
lint  
Michael Yang committed
663
	llm.pingResp = errors.New("foo")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
664
665
666
667
668
669
670
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
671
672
673
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
674
675
676
677
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
678
	ctx, done := context.WithTimeout(t.Context(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
679
680
	defer done()

681
682
	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
	llm2 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
683
684
685
	s := InitScheduler(ctx)
	s.unloadAllRunners()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
686
687
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
688

Daniel Hiltgen's avatar
Daniel Hiltgen committed
689
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
690
691
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
692
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
693
694
695
696
697
698
699
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
700
	llm1 := &mockLlm{vramByGPU: map[ml.DeviceID]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
701
702
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
703
704
705
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
706
	require.Nil(t, r2.model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
707
708
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
709
func TestAlreadyCanceled(t *testing.T) {
710
	ctx, done := context.WithTimeout(t.Context(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
711
712
713
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
714
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0}, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
715
716
717
718
719
720
721
722
723
724
725
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

Jesse Gross's avatar
Jesse Gross committed
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
type mockLlm struct {
	modelPath         string
	pingResp          error
	waitResp          error
	completionResp    error
	embeddingResp     []float32
	embeddingRespErr  error
	tokenizeResp      []int
	tokenizeRespErr   error
	detokenizeResp    string
	detonekizeRespErr error
	closeResp         error
	closeCalled       bool
	vramSize          uint64
	totalSize         uint64
741
	vramByGPU         map[ml.DeviceID]uint64
742
743
}

Jesse Gross's avatar
Jesse Gross committed
744
745
func (s *mockLlm) ModelPath() string {
	return s.modelPath
Daniel Hiltgen's avatar
Daniel Hiltgen committed
746
747
}

748
func (s *mockLlm) Load(ctx context.Context, gpus discover.GpuInfoList, requireFull bool) ([]ml.DeviceID, error) {
Jesse Gross's avatar
Jesse Gross committed
749
750
751
	if requireFull {
		for _, g := range gpus {
			if g.FreeMemory >= s.vramSize {
752
				return []ml.DeviceID{g.DeviceID}, nil
Jesse Gross's avatar
Jesse Gross committed
753
754
755
			}
		}

756
757
758
759
760
		return nil, llm.ErrLoadRequiredFull
	}
	gpuIDs := make([]ml.DeviceID, len(gpus))
	for i := range gpus {
		gpuIDs[i] = gpus[i].DeviceID
Jesse Gross's avatar
Jesse Gross committed
761
	}
762
	return gpuIDs, nil
Jesse Gross's avatar
Jesse Gross committed
763
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
764
765
766
767
768
// Ping returns the configured pingResp; ctx is ignored.
func (s *mockLlm) Ping(ctx context.Context) error {
	return s.pingResp
}

// WaitUntilRunning returns the configured waitResp; ctx is ignored.
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error {
	return s.waitResp
}
// Completion returns the configured completionResp. It never invokes fn and
// does not inspect ctx or req.
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	return s.completionResp
}
Michael Yang's avatar
lint  
Michael Yang committed
769

770
771
func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) {
	return s.embeddingResp, s.embeddingRespErr
Daniel Hiltgen's avatar
Daniel Hiltgen committed
772
}
Michael Yang's avatar
lint  
Michael Yang committed
773

Daniel Hiltgen's avatar
Daniel Hiltgen committed
774
775
776
// Tokenize returns the configured tokenizeResp and tokenizeRespErr,
// regardless of content.
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	tokens, err := s.tokenizeResp, s.tokenizeRespErr
	return tokens, err
}
Michael Yang's avatar
lint  
Michael Yang committed
777

Daniel Hiltgen's avatar
Daniel Hiltgen committed
778
779
780
// Detokenize returns the configured detokenizeResp and detonekizeRespErr,
// regardless of tokens.
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	text, err := s.detokenizeResp, s.detonekizeRespErr
	return text, err
}
Michael Yang's avatar
lint  
Michael Yang committed
781

Daniel Hiltgen's avatar
Daniel Hiltgen committed
782
783
784
785
// Close records that it was invoked by setting closeCalled, then returns the
// configured closeResp.
func (s *mockLlm) Close() error {
	// Flag the call first so tests can assert on it even when closeResp is an error.
	s.closeCalled = true
	return s.closeResp
}
786
787
788
789
790
791
792
793
// VRAMSize returns the configured vramSize.
func (s *mockLlm) VRAMSize() uint64 {
	return s.vramSize
}

// TotalSize returns the configured totalSize.
func (s *mockLlm) TotalSize() uint64 {
	return s.totalSize
}

// VRAMByGPU returns the vramByGPU entry for id, or zero when there is none.
func (s *mockLlm) VRAMByGPU(id ml.DeviceID) uint64 {
	return s.vramByGPU[id]
}

// Pid always returns -1.
func (s *mockLlm) Pid() int {
	return -1
}

// GetPort always returns -1.
func (s *mockLlm) GetPort() int {
	return -1
}

// GetDeviceInfos always returns nil.
func (s *mockLlm) GetDeviceInfos(ctx context.Context) []ml.DeviceInfo {
	return nil
}

// HasExited always returns false.
func (s *mockLlm) HasExited() bool {
	return false
}

// GetActiveDeviceIDs always returns nil.
func (s *mockLlm) GetActiveDeviceIDs() []ml.DeviceID {
	return nil
}