sched_test.go 23.1 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
package server

import (
	"bytes"
	"context"
Michael Yang's avatar
lint  
Michael Yang committed
6
	"errors"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
7
8
9
10
11
	"log/slog"
	"os"
	"testing"
	"time"

Michael Yang's avatar
lint  
Michael Yang committed
12
13
	"github.com/stretchr/testify/require"

Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
15
	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
16
	"github.com/ollama/ollama/discover"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
17
	"github.com/ollama/ollama/format"
Michael Yang's avatar
Michael Yang committed
18
	"github.com/ollama/ollama/fs/ggml"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
19
20
21
	"github.com/ollama/ollama/llm"
)

Michael Yang's avatar
lint  
Michael Yang committed
22
func TestMain(m *testing.M) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
23
24
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
Michael Yang's avatar
lint  
Michael Yang committed
25
	os.Exit(m.Run())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
26
27
28
29
30
31
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
32
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
	require.NotNil(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
35
36
37
}

func TestLoad(t *testing.T) {
38
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
39
40
	defer done()
	s := InitScheduler(ctx)
Michael Yang's avatar
Michael Yang committed
41
	var f *ggml.GGML // value not used in tests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
42
43
44
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
45
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
47
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
48
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
49
50
	}
	// Fail to load model first
Michael Yang's avatar
Michael Yang committed
51
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Michael Yang's avatar
lint  
Michael Yang committed
52
		return nil, errors.New("something failed to load model blah")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
53
	}
54
	gpus := discover.GpuInfoList{}
Michael Yang's avatar
Michael Yang committed
55
	s.load(req, f, gpus, 0)
Michael Yang's avatar
lint  
Michael Yang committed
56
	require.Empty(t, req.successCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
57
	require.Len(t, req.errCh, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
58
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
59
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
61
62
63
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

64
	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
Michael Yang's avatar
Michael Yang committed
65
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
66
67
		return server, nil
	}
Michael Yang's avatar
Michael Yang committed
68
	s.load(req, f, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
69
70
71
72
73
74
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
75
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
79
80
	}

	req.model.ModelPath = "dummy_model_path"
Michael Yang's avatar
lint  
Michael Yang committed
81
	server.waitResp = errors.New("wait failure")
Michael Yang's avatar
Michael Yang committed
82
	s.load(req, f, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
83
84
85
86
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
87
		t.Fatalf("unexpected success %v", resp)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
88
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
	runner := s.loaded["dummy_model_path"]
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
93
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
94
	time.Sleep(1 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
95
96
97
	require.Len(t, s.expiredCh, 1)
}

98
type reqBundle struct {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
99
100
101
102
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
Michael Yang's avatar
Michael Yang committed
103
	f       *ggml.GGML
Daniel Hiltgen's avatar
Daniel Hiltgen committed
104
105
}

Michael Yang's avatar
Michael Yang committed
106
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
107
108
109
	return scenario.srv, nil
}

110
111
112
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
113
114
115
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), modelName)
Michael Yang's avatar
lint  
Michael Yang committed
116
	require.NoError(t, err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
117
118
	defer f.Close()

Michael Yang's avatar
Michael Yang committed
119
	require.NoError(t, ggml.WriteGGUF(f, ggml.KV{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
120
121
122
123
124
125
126
127
128
		"general.architecture":          "llama",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
Michael Yang's avatar
Michael Yang committed
129
	}, []ggml.Tensor{
130
131
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
Michael Yang's avatar
Michael Yang committed
132
	}))
Michael Yang's avatar
lint  
Michael Yang committed
133
	require.NoError(t, err)
134

Daniel Hiltgen's avatar
Daniel Hiltgen committed
135
136
	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
Michael Yang's avatar
Michael Yang committed
137
	b.f, err = llm.LoadModel(model.ModelPath, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
138
	require.NoError(t, err)
139

140
141
142
143
144
	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
145
		model:           model,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
146
		opts:            api.DefaultOptions(),
147
		sessionDuration: duration,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
148
149
150
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
151
152
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
Daniel Hiltgen's avatar
Daniel Hiltgen committed
153
154
}

155
156
// getGpuFn is a GPU discovery stub: it reports a single "metal" device with
// 24 GB of total memory, 12 GB of it free.
func getGpuFn() discover.GpuInfoList {
	var gpu discover.GpuInfo
	gpu.Library = "metal"
	gpu.TotalMemory = 24 * format.GigaByte
	gpu.FreeMemory = 12 * format.GigaByte
	return discover.GpuInfoList{gpu}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
161

162
163
// getCpuFn is a CPU discovery stub: it reports a single "cpu" device with
// 32 GB of total memory, 26 GB of it free.
func getCpuFn() discover.GpuInfoList {
	var cpu discover.GpuInfo
	cpu.Library = "cpu"
	cpu.TotalMemory = 32 * format.GigaByte
	cpu.FreeMemory = 26 * format.GigaByte
	return discover.GpuInfoList{cpu}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
168

169
170
171
func TestRequestsSameModelSameRequest(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
172
	s := InitScheduler(ctx)
173
174
175
176
177
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
	b.req.model = a.req.model
Michael Yang's avatar
Michael Yang committed
178
	b.f = a.f
179
180
181
182

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
183
184
185
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
186
187
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
188
		require.Empty(t, s.pendingReqCh)
189
190
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
191
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
192
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
193
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
194
195
196
	}

	// Same runner as first request due to not needing a reload
197
198
199
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
200
	select {
201
202
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
203
		require.Empty(t, s.pendingReqCh)
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

func TestRequestsSimpleReloadSameModel(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
	tmpModel := *a.req.model
	b.req.model = &tmpModel
Michael Yang's avatar
Michael Yang committed
222
	b.f = a.f
223
224
225
226
227
228
229
230
231
232
233
234

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
235
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
236
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
237
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
238
239
240
	}

	// Trigger a reload
241
242
243
244
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
245
246
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
247
	a.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
248
	select {
249
250
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
251
		require.Empty(t, s.pendingReqCh)
252
253
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
254
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
255
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
256
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
257
	}
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
}

func TestRequestsMultipleLoadedModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn

	// Multiple loaded models
	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
Daniel Hiltgen's avatar
Daniel Hiltgen committed
273

Michael Yang's avatar
int  
Michael Yang committed
274
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
275
276
277
278
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
279
	select {
280
281
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
282
		require.Empty(t, s.pendingReqCh)
283
284
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
285
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
286
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
287
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
288
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
289
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
290
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
291
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
292

Michael Yang's avatar
int  
Michael Yang committed
293
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
294
295
296
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
297
	select {
298
299
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
300
		require.Empty(t, s.pendingReqCh)
301
302
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
303
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
304
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
305
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
306
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
307
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
308
	require.Len(t, s.loaded, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
309
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
310

Daniel Hiltgen's avatar
Daniel Hiltgen committed
311
	// This is a CPU load with NumGPU = 0 so it should load
312
313
314
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
315
	select {
316
317
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
Michael Yang's avatar
lint  
Michael Yang committed
318
		require.Empty(t, s.pendingReqCh)
319
320
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
321
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
322
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
323
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
324
325
326
327
328
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

329
	// Try to load a model that won't fit
330
331
	s.newServerFn = d.newServer
	slog.Info("d")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
332
333
334
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
335
	a.ctxDone() // Won't help since this one isn't big enough to make room
Daniel Hiltgen's avatar
Daniel Hiltgen committed
336
	time.Sleep(2 * time.Millisecond)
337
	s.pendingReqCh <- d.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
338
339
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
340
341
342
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
343
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
344
	select {
345
346
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
Michael Yang's avatar
lint  
Michael Yang committed
347
		require.Empty(t, s.pendingReqCh)
348
		require.Empty(t, d.req.errCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
349
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
350
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
351
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
352
353
354
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
355
356
}

357
func TestGetRunner(t *testing.T) {
358
	ctx, done := context.WithTimeout(context.Background(), 3*time.Second)
359
360
361
362
363
	defer done()

	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
Michael Yang's avatar
int  
Michael Yang committed
364
	t.Setenv("OLLAMA_MAX_QUEUE", "1")
365
366
367
368
369
370
371
372
373
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
374
	require.Len(t, s.pendingReqCh, 1)
Michael Yang's avatar
lint  
Michael Yang committed
375
	require.Empty(t, successCh1b)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
376
377
378
379
380
381
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
382
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
383
384
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
385
386
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
387
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
388
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
389
	}
390
	a.ctxDone() // Set "a" model to idle so it can unload
Daniel Hiltgen's avatar
Daniel Hiltgen committed
391
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
392
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
393
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
394

395
396
397
	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
398
	// Starts in pending channel, then should be quickly processed to return an error
399
	time.Sleep(50 * time.Millisecond) // Long enough for the "a" model to expire and unload
Michael Yang's avatar
lint  
Michael Yang committed
400
	require.Empty(t, successCh1c)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
401
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
402
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
403
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
404
405
406
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
407
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
408
409
}

Patrick Devine's avatar
Patrick Devine committed
410
411
412
413
414
415
416
417
418
419
420
421
422
func TestExpireRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
	}

Michael Yang's avatar
Michael Yang committed
423
	var f *ggml.GGML
424
	gpus := discover.GpuInfoList{}
Patrick Devine's avatar
Patrick Devine committed
425
	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
Michael Yang's avatar
Michael Yang committed
426
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Patrick Devine's avatar
Patrick Devine committed
427
428
		return server, nil
	}
Michael Yang's avatar
Michael Yang committed
429
	s.load(req, f, gpus, 0)
Patrick Devine's avatar
Patrick Devine committed
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455

	select {
	case err := <-req.errCh:
		if err != nil {
			t.Fatalf("expected no errors when loading, got '%s'", err.Error())
		}
	case resp := <-req.successCh:
		s.loadedMu.Lock()
		if resp.refCount != uint(1) || len(s.loaded) != 1 {
			t.Fatalf("expected a model to be loaded")
		}
		s.loadedMu.Unlock()
	}

	s.expireRunner(&Model{ModelPath: "foo"})

	s.finishedReqCh <- req
	s.processCompleted(ctx)

	s.loadedMu.Lock()
	if len(s.loaded) != 0 {
		t.Fatalf("expected model to be unloaded")
	}
	s.loadedMu.Unlock()
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
456
457
// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
458
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
459
460
461
	defer done()

	// Same model, same request
462
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
463
	s := InitScheduler(ctx)
464
465
	s.getGpuFn = func() discover.GpuInfoList {
		g := discover.GpuInfo{Library: "metal"}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
466
467
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
468
		return []discover.GpuInfo{g}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
469
470
471
472
473
474
475
476
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
477
478
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
479
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
480
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
481
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
482
483
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
484
485
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
486
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
487
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
488
	}
489
	time.Sleep(scenario1a.req.sessionDuration.Duration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
490
491
492
493
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
Michael Yang's avatar
lint  
Michael Yang committed
494
	require.Empty(t, s.finishedReqCh)
495
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
496
	require.Empty(t, s.loaded)
497
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
498
499
500
501
502
503
504

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
505
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
506
507
	req := &LlmRequest{
		ctx:             ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
508
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
509
		successCh:       make(chan *runnerRef, 1),
510
		sessionDuration: &api.Duration{Duration: 2},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
511
512
	}
	finished := make(chan *LlmRequest)
513
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
514
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
515
516
517
518
519
520
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
521
522
	case err := <-req.errCh:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
523
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
524
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
525
526
527
528
529
530
531
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
532
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
533
	defer done()
534
	gpus := discover.GpuInfoList{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
535
536
537
538
539
540
541
542
543
544
545
546
547
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
548
549
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
550
551
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
552
553

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
554
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
555
556
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
557
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
558
559

	s.updateFreeSpace(gpus)
560
561
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
562
563
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
564
565
566
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
567
	gpus := discover.GpuInfoList{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
568
569
570
571
572
573
574
575
576
		{
			Library: "cuda",
			ID:      "0",
		},
		{
			Library: "cuda",
			ID:      "1",
		},
	}
577
	r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
578
579
580
581
582
583
584
585
586
587

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loadedMu.Unlock()

	tmp := s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "1", tmp[0].ID)

588
	r1.gpus = discover.GpuInfoList{gpus[1]}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
589
590
591
592
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "0", tmp[0].ID)

593
	r1.gpus = discover.GpuInfoList{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
594
595
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
596
597
598
}

func TestFindRunnerToUnload(t *testing.T) {
599
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
600
	defer done()
601

Daniel Hiltgen's avatar
Daniel Hiltgen committed
602
603
	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
604
605

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
606
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
607
608
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
609
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
610

611
	resp := s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
612
613
	require.Equal(t, r2, resp)
	r2.refCount = 1
614
	resp = s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
615
616
617
618
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
619
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
620
621
	defer done()

622
	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
623
	do := api.DefaultOptions()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
624
	runner := &runnerRef{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
625
626
627
628
629
630
631
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
632
633
634
635
636
637
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
638
		opts: api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
639
640
641
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
642
	req.model.AdapterPaths = runner.model.AdapterPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
643
644
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
645
	req.model.ProjectorPaths = runner.model.ProjectorPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
646
647
648
649
650
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
Michael Yang's avatar
lint  
Michael Yang committed
651
	llm.pingResp = errors.New("foo")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
652
653
654
655
656
657
658
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
659
660
661
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
662
663
664
665
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
666
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
667
668
	defer done()

669
670
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
671
672
673
	s := InitScheduler(ctx)
	s.unloadAllRunners()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
674
675
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
676

Daniel Hiltgen's avatar
Daniel Hiltgen committed
677
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
678
679
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
680
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
681
682
683
684
685
686
687
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
688
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
689
690
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
691
692
693
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
694
	require.Nil(t, r2.model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
695
696
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
697
698
699
700
701
func TestAlreadyCanceled(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
702
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
Daniel Hiltgen's avatar
Daniel Hiltgen committed
703
704
705
706
707
708
709
710
711
712
713
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

714
715
716
717
718
func TestHomogeneousGPUs(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)

719
	s.getGpuFn = func() discover.GpuInfoList {
720
		// Set memory values to require the model to be spread
721
		gpus := []discover.GpuInfo{
722
723
724
725
726
727
728
729
730
731
732
			{Library: "cuda"},
			{Library: "rocm"},
		}
		gpus[0].TotalMemory = 1 * format.GibiByte
		gpus[0].FreeMemory = 256 * format.MebiByte
		gpus[1].TotalMemory = 1 * format.GibiByte
		gpus[1].FreeMemory = 256 * format.MebiByte
		return gpus
	}
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
Michael Yang's avatar
Michael Yang committed
733
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
734
		require.Len(t, gpus, 1)
Michael Yang's avatar
Michael Yang committed
735
		return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
	}
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
753
// mockLlm is a test double whose methods return the canned values stored in
// these fields. Fields are grouped by the method that reads them.
type mockLlm struct {
	// Errors returned by Ping, WaitUntilRunning and Completion respectively.
	pingResp       error
	waitResp       error
	completionResp error

	// Result pair returned by Embedding.
	embeddingResp    []float32
	embeddingRespErr error

	// Result pairs returned by Tokenize and Detokenize. The spelling of
	// detonekizeRespErr is kept as-is because Detokenize reads it by that name.
	tokenizeResp      []int
	tokenizeRespErr   error
	detokenizeResp    string
	detonekizeRespErr error

	// closeResp is returned by Close; closeCalled is set to true when Close runs.
	closeResp   error
	closeCalled bool

	// Values reported by EstimatedVRAM, EstimatedTotal and EstimatedVRAMByGPU.
	estimatedVRAM      uint64
	estimatedTotal     uint64
	estimatedVRAMByGPU map[string]uint64
}

// Ping returns the configured pingResp; ctx is ignored.
func (s *mockLlm) Ping(ctx context.Context) error {
	return s.pingResp
}

// WaitUntilRunning returns the configured waitResp; ctx is ignored.
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error {
	return s.waitResp
}
// Completion returns the configured completionResp. The request is ignored
// and fn is never invoked.
func (s *mockLlm) Completion(
	ctx context.Context,
	req llm.CompletionRequest,
	fn func(llm.CompletionResponse),
) error {
	return s.completionResp
}
Michael Yang's avatar
lint  
Michael Yang committed
775

776
777
func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) {
	return s.embeddingResp, s.embeddingRespErr
Daniel Hiltgen's avatar
Daniel Hiltgen committed
778
}
Michael Yang's avatar
lint  
Michael Yang committed
779

Daniel Hiltgen's avatar
Daniel Hiltgen committed
780
781
782
// Tokenize returns the configured tokenizeResp and tokenizeRespErr
// regardless of content.
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	tokens, err := s.tokenizeResp, s.tokenizeRespErr
	return tokens, err
}
Michael Yang's avatar
lint  
Michael Yang committed
783

Daniel Hiltgen's avatar
Daniel Hiltgen committed
784
785
786
// Detokenize returns the configured detokenizeResp and detonekizeRespErr
// regardless of tokens.
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	text, err := s.detokenizeResp, s.detonekizeRespErr
	return text, err
}
Michael Yang's avatar
lint  
Michael Yang committed
787

Daniel Hiltgen's avatar
Daniel Hiltgen committed
788
789
790
791
// Close sets closeCalled to true so tests can observe the call, and returns
// the configured closeResp.
func (s *mockLlm) Close() error {
	result := s.closeResp
	s.closeCalled = true
	return result
}
792
793
func (s *mockLlm) EstimatedVRAM() uint64                  { return s.estimatedVRAM }
func (s *mockLlm) EstimatedTotal() uint64                 { return s.estimatedTotal }
Daniel Hiltgen's avatar
Daniel Hiltgen committed
794
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 { return s.estimatedVRAMByGPU[gpuid] }
795
func (s *mockLlm) Pid() int                               { return -1 }