sched_test.go 23.1 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
package server

import (
	"bytes"
	"context"
Michael Yang's avatar
lint  
Michael Yang committed
6
	"errors"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
7
8
9
10
11
	"log/slog"
	"os"
	"testing"
	"time"

Michael Yang's avatar
lint  
Michael Yang committed
12
13
	"github.com/stretchr/testify/require"

Daniel Hiltgen's avatar
Daniel Hiltgen committed
14
15
	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/app/lifecycle"
16
	"github.com/ollama/ollama/discover"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
17
	"github.com/ollama/ollama/format"
Michael Yang's avatar
Michael Yang committed
18
	"github.com/ollama/ollama/fs/ggml"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
19
20
21
	"github.com/ollama/ollama/llm"
)

Michael Yang's avatar
lint  
Michael Yang committed
22
func TestMain(m *testing.M) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
23
24
	os.Setenv("OLLAMA_DEBUG", "1")
	lifecycle.InitLogging()
Michael Yang's avatar
lint  
Michael Yang committed
25
	os.Exit(m.Run())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
26
27
28
29
30
31
}

func TestInitScheduler(t *testing.T) {
	ctx, done := context.WithCancel(context.Background())
	defer done()
	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
32
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
	require.NotNil(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
34
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
35
36
37
}

func TestLoad(t *testing.T) {
38
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
39
40
	defer done()
	s := InitScheduler(ctx)
Michael Yang's avatar
Michael Yang committed
41
	var f *ggml.GGML // value not used in tests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
42
43
44
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
45
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
47
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
48
		sessionDuration: &api.Duration{Duration: 2 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
49
50
	}
	// Fail to load model first
Michael Yang's avatar
Michael Yang committed
51
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Michael Yang's avatar
lint  
Michael Yang committed
52
		return nil, errors.New("something failed to load model blah")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
53
	}
54
	gpus := discover.GpuInfoList{}
Michael Yang's avatar
Michael Yang committed
55
	s.load(req, f, gpus, 0)
Michael Yang's avatar
lint  
Michael Yang committed
56
	require.Empty(t, req.successCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
57
	require.Len(t, req.errCh, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
58
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
59
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
61
62
63
	err := <-req.errCh
	require.Contains(t, err.Error(), "this model may be incompatible")

64
	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
Michael Yang's avatar
Michael Yang committed
65
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
66
67
		return server, nil
	}
Michael Yang's avatar
Michael Yang committed
68
	s.load(req, f, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
69
70
71
72
73
74
	select {
	case err := <-req.errCh:
		require.NoError(t, err)
	case resp := <-req.successCh:
		require.Equal(t, uint64(10), resp.estimatedVRAM)
		require.Equal(t, uint(1), resp.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
75
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
78
79
80
	}

	req.model.ModelPath = "dummy_model_path"
Michael Yang's avatar
lint  
Michael Yang committed
81
	server.waitResp = errors.New("wait failure")
Michael Yang's avatar
Michael Yang committed
82
	s.load(req, f, gpus, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
83
84
85
86
	select {
	case err := <-req.errCh:
		require.Contains(t, err.Error(), "wait failure")
	case resp := <-req.successCh:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
87
		t.Fatalf("unexpected success %v", resp)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
88
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
89
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
	runner := s.loaded["dummy_model_path"]
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
93
	require.NotNil(t, runner)
	require.Equal(t, uint(0), runner.refCount)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
94
	time.Sleep(1 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
95
96
97
	require.Len(t, s.expiredCh, 1)
}

98
type reqBundle struct {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
99
100
101
102
	ctx     context.Context //nolint:containedctx
	ctxDone func()
	srv     *mockLlm
	req     *LlmRequest
Michael Yang's avatar
Michael Yang committed
103
	f       *ggml.GGML
Daniel Hiltgen's avatar
Daniel Hiltgen committed
104
105
}

Michael Yang's avatar
Michael Yang committed
106
func (scenario *reqBundle) newServer(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
107
108
109
	return scenario.srv, nil
}

110
111
112
func newScenarioRequest(t *testing.T, ctx context.Context, modelName string, estimatedVRAM uint64, duration *api.Duration) *reqBundle {
	b := &reqBundle{}
	b.ctx, b.ctxDone = context.WithCancel(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
113
114
115
	t.Helper()

	f, err := os.CreateTemp(t.TempDir(), modelName)
Michael Yang's avatar
lint  
Michael Yang committed
116
	require.NoError(t, err)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
117
118
	defer f.Close()

Michael Yang's avatar
Michael Yang committed
119
	require.NoError(t, ggml.WriteGGUF(f, ggml.KV{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
120
121
122
123
124
125
126
127
128
		"general.architecture":          "llama",
		"llama.context_length":          uint32(32),
		"llama.embedding_length":        uint32(4096),
		"llama.block_count":             uint32(1),
		"llama.attention.head_count":    uint32(32),
		"llama.attention.head_count_kv": uint32(32),
		"tokenizer.ggml.tokens":         []string{" "},
		"tokenizer.ggml.scores":         []float32{0},
		"tokenizer.ggml.token_type":     []int32{0},
Michael Yang's avatar
Michael Yang committed
129
	}, []ggml.Tensor{
130
131
		{Name: "blk.0.attn.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
		{Name: "output.weight", Kind: uint32(0), Offset: uint64(0), Shape: []uint64{1, 1, 1, 1}, WriterTo: bytes.NewReader(make([]byte, 32))},
Michael Yang's avatar
Michael Yang committed
132
	}))
Michael Yang's avatar
lint  
Michael Yang committed
133
	require.NoError(t, err)
134

Daniel Hiltgen's avatar
Daniel Hiltgen committed
135
136
	fname := f.Name()
	model := &Model{Name: modelName, ModelPath: fname}
Michael Yang's avatar
Michael Yang committed
137
	b.f, err = llm.LoadModel(model.ModelPath, 0)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
138
	require.NoError(t, err)
139

140
141
142
143
144
	if duration == nil {
		duration = &api.Duration{Duration: 5 * time.Millisecond}
	}
	b.req = &LlmRequest{
		ctx:             b.ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
145
		model:           model,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
146
		opts:            api.DefaultOptions(),
147
		sessionDuration: duration,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
148
149
150
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
	}
151
	b.req.opts.NumCtx = 4096
152
153
	b.srv = &mockLlm{estimatedVRAM: estimatedVRAM, estimatedVRAMByGPU: map[string]uint64{"": estimatedVRAM}}
	return b
Daniel Hiltgen's avatar
Daniel Hiltgen committed
154
155
}

156
157
// getGpuFn is a stand-in for Scheduler.getGpuFn that reports one "metal"
// device with 24 GB total and 12 GB free memory.
func getGpuFn() discover.GpuInfoList {
	gpu := discover.GpuInfo{Library: "metal"}
	gpu.FreeMemory = 12 * format.GigaByte
	gpu.TotalMemory = 24 * format.GigaByte
	return discover.GpuInfoList{gpu}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
162

163
164
// getCpuFn is a stand-in for Scheduler.getCpuFn that reports one "cpu"
// device with 32 GB total and 26 GB free memory.
func getCpuFn() discover.GpuInfoList {
	cpu := discover.GpuInfo{Library: "cpu"}
	cpu.FreeMemory = 26 * format.GigaByte
	cpu.TotalMemory = 32 * format.GigaByte
	return discover.GpuInfoList{cpu}
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
169

170
171
172
func TestRequestsSameModelSameRequest(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
173
	s := InitScheduler(ctx)
174
175
176
177
178
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 11, &api.Duration{Duration: 0})
	b.req.model = a.req.model
Michael Yang's avatar
Michael Yang committed
179
	b.f = a.f
180
181
182
183

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
184
185
186
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
187
188
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
189
		require.Empty(t, s.pendingReqCh)
190
191
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
192
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
193
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
194
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
195
196
197
	}

	// Same runner as first request due to not needing a reload
198
199
200
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
201
	select {
202
203
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
204
		require.Empty(t, s.pendingReqCh)
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

func TestRequestsSimpleReloadSameModel(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1", 20, &api.Duration{Duration: 5 * time.Millisecond})
	tmpModel := *a.req.model
	b.req.model = &tmpModel
Michael Yang's avatar
Michael Yang committed
223
	b.f = a.f
224
225
226
227
228
229
230
231
232
233
234
235

	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
236
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
237
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
238
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
239
240
241
	}

	// Trigger a reload
242
243
244
245
	s.newServerFn = b.newServer
	b.req.model.AdapterPaths = []string{"new"}
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
246
247
	// finish first two requests, so model can reload
	time.Sleep(1 * time.Millisecond)
248
	a.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
249
	select {
250
251
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
252
		require.Empty(t, s.pendingReqCh)
253
254
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
255
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
256
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
257
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
258
	}
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
}

func TestRequestsMultipleLoadedModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn

	// Multiple loaded models
	a := newScenarioRequest(t, ctx, "ollama-model-3a", 1*format.GigaByte, nil)
	b := newScenarioRequest(t, ctx, "ollama-model-3b", 24*format.GigaByte, nil)
	c := newScenarioRequest(t, ctx, "ollama-model-4a", 30, nil)
	c.req.opts.NumGPU = 0                                       // CPU load, will be allowed
	d := newScenarioRequest(t, ctx, "ollama-model-3c", 30, nil) // Needs prior unloaded
Daniel Hiltgen's avatar
Daniel Hiltgen committed
274

Michael Yang's avatar
int  
Michael Yang committed
275
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "1")
276
277
278
279
	s.newServerFn = a.newServer
	slog.Info("a")
	s.pendingReqCh <- a.req
	s.Run(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
280
	select {
281
282
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
283
		require.Empty(t, s.pendingReqCh)
284
285
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
286
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
287
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
288
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
289
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
290
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
291
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
292
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
293

Michael Yang's avatar
int  
Michael Yang committed
294
	t.Setenv("OLLAMA_MAX_LOADED_MODELS", "0")
295
296
297
	s.newServerFn = b.newServer
	slog.Info("b")
	s.pendingReqCh <- b.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
298
	select {
299
300
	case resp := <-b.req.successCh:
		require.Equal(t, resp.llama, b.srv)
Michael Yang's avatar
lint  
Michael Yang committed
301
		require.Empty(t, s.pendingReqCh)
302
303
		require.Empty(t, b.req.errCh)
	case err := <-b.req.errCh:
304
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
305
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
306
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
307
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
308
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
309
	require.Len(t, s.loaded, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
310
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
311

Daniel Hiltgen's avatar
Daniel Hiltgen committed
312
	// This is a CPU load with NumGPU = 0 so it should load
313
314
315
	s.newServerFn = c.newServer
	slog.Info("c")
	s.pendingReqCh <- c.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
316
	select {
317
318
	case resp := <-c.req.successCh:
		require.Equal(t, resp.llama, c.srv)
Michael Yang's avatar
lint  
Michael Yang committed
319
		require.Empty(t, s.pendingReqCh)
320
321
		require.Empty(t, c.req.errCh)
	case err := <-c.req.errCh:
322
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
323
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
324
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
325
326
327
328
329
	}
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()

330
	// Try to load a model that won't fit
331
332
	s.newServerFn = d.newServer
	slog.Info("d")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
333
334
335
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 3)
	s.loadedMu.Unlock()
336
	a.ctxDone() // Won't help since this one isn't big enough to make room
Daniel Hiltgen's avatar
Daniel Hiltgen committed
337
	time.Sleep(2 * time.Millisecond)
338
	s.pendingReqCh <- d.req
Daniel Hiltgen's avatar
Daniel Hiltgen committed
339
340
	// finish prior request, so new model can load
	time.Sleep(6 * time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
341
342
343
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
344
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
345
	select {
346
347
	case resp := <-d.req.successCh:
		require.Equal(t, resp.llama, d.srv)
Michael Yang's avatar
lint  
Michael Yang committed
348
		require.Empty(t, s.pendingReqCh)
349
		require.Empty(t, d.req.errCh)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
350
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
351
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
352
	}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
353
354
355
	s.loadedMu.Lock()
	require.Len(t, s.loaded, 2)
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
356
357
}

358
func TestGetRunner(t *testing.T) {
359
	ctx, done := context.WithTimeout(context.Background(), 3*time.Second)
360
361
362
363
364
	defer done()

	a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond})
	b := newScenarioRequest(t, ctx, "ollama-model-1b", 10, &api.Duration{Duration: 2 * time.Millisecond})
	c := newScenarioRequest(t, ctx, "ollama-model-1c", 10, &api.Duration{Duration: 2 * time.Millisecond})
Michael Yang's avatar
int  
Michael Yang committed
365
	t.Setenv("OLLAMA_MAX_QUEUE", "1")
366
367
368
369
370
371
372
373
374
	s := InitScheduler(ctx)
	s.getGpuFn = getGpuFn
	s.getCpuFn = getCpuFn
	s.newServerFn = a.newServer
	slog.Info("a")
	successCh1a, errCh1a := s.GetRunner(a.ctx, a.req.model, a.req.opts, a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	slog.Info("b")
	successCh1b, errCh1b := s.GetRunner(b.ctx, b.req.model, b.req.opts, b.req.sessionDuration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
375
	require.Len(t, s.pendingReqCh, 1)
Michael Yang's avatar
lint  
Michael Yang committed
376
	require.Empty(t, successCh1b)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
377
378
379
380
381
382
	require.Len(t, errCh1b, 1)
	err := <-errCh1b
	require.Contains(t, err.Error(), "server busy")
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
383
		require.Equal(t, resp.llama, a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
384
385
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
386
387
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
388
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
389
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
390
	}
391
	a.ctxDone() // Set "a" model to idle so it can unload
Daniel Hiltgen's avatar
Daniel Hiltgen committed
392
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
393
	require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
394
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
395

396
397
398
	c.req.model.ModelPath = "bad path"
	slog.Info("c")
	successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration)
399
	// Starts in pending channel, then should be quickly processed to return an error
400
	time.Sleep(50 * time.Millisecond) // Long enough for the "a" model to expire and unload
Michael Yang's avatar
lint  
Michael Yang committed
401
	require.Empty(t, successCh1c)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
402
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
403
	require.Empty(t, s.loaded)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
404
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
405
406
407
	require.Len(t, errCh1c, 1)
	err = <-errCh1c
	require.Contains(t, err.Error(), "bad path")
408
	b.ctxDone()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
409
410
}

Patrick Devine's avatar
Patrick Devine committed
411
412
413
414
415
416
417
418
419
420
421
422
423
func TestExpireRunner(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 20*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)
	req := &LlmRequest{
		ctx:             ctx,
		model:           &Model{ModelPath: "foo"},
		opts:            api.DefaultOptions(),
		successCh:       make(chan *runnerRef, 1),
		errCh:           make(chan error, 1),
		sessionDuration: &api.Duration{Duration: 2 * time.Minute},
	}

Michael Yang's avatar
Michael Yang committed
424
	var f *ggml.GGML
425
	gpus := discover.GpuInfoList{}
Patrick Devine's avatar
Patrick Devine committed
426
	server := &mockLlm{estimatedVRAM: 10, estimatedVRAMByGPU: map[string]uint64{}}
Michael Yang's avatar
Michael Yang committed
427
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
Patrick Devine's avatar
Patrick Devine committed
428
429
		return server, nil
	}
Michael Yang's avatar
Michael Yang committed
430
	s.load(req, f, gpus, 0)
Patrick Devine's avatar
Patrick Devine committed
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456

	select {
	case err := <-req.errCh:
		if err != nil {
			t.Fatalf("expected no errors when loading, got '%s'", err.Error())
		}
	case resp := <-req.successCh:
		s.loadedMu.Lock()
		if resp.refCount != uint(1) || len(s.loaded) != 1 {
			t.Fatalf("expected a model to be loaded")
		}
		s.loadedMu.Unlock()
	}

	s.expireRunner(&Model{ModelPath: "foo"})

	s.finishedReqCh <- req
	s.processCompleted(ctx)

	s.loadedMu.Lock()
	if len(s.loaded) != 0 {
		t.Fatalf("expected model to be unloaded")
	}
	s.loadedMu.Unlock()
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
457
458
// TODO - add one scenario that triggers the bogus finished event with positive ref count
func TestPrematureExpired(t *testing.T) {
459
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
460
461
462
	defer done()

	// Same model, same request
463
	scenario1a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, nil)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
464
	s := InitScheduler(ctx)
465
466
	s.getGpuFn = func() discover.GpuInfoList {
		g := discover.GpuInfo{Library: "metal"}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
467
468
		g.TotalMemory = 24 * format.GigaByte
		g.FreeMemory = 12 * format.GigaByte
469
		return []discover.GpuInfo{g}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
470
471
472
473
474
475
476
477
	}
	s.newServerFn = scenario1a.newServer
	successCh1a, errCh1a := s.GetRunner(scenario1a.ctx, scenario1a.req.model, scenario1a.req.opts, scenario1a.req.sessionDuration)
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-successCh1a:
		require.Equal(t, resp.llama, scenario1a.srv)
Michael Yang's avatar
lint  
Michael Yang committed
478
479
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, errCh1a)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
480
		s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
481
		require.Len(t, s.loaded, 1)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
482
		s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
483
484
		slog.Info("sending premature expired event now")
		s.expiredCh <- resp // Shouldn't happen in real life, but make sure its safe
485
486
	case err := <-errCh1a:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
487
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
488
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
489
	}
490
	time.Sleep(scenario1a.req.sessionDuration.Duration)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
491
492
493
494
	scenario1a.ctxDone()
	time.Sleep(20 * time.Millisecond)
	require.LessOrEqual(t, len(s.finishedReqCh), 1)
	time.Sleep(10 * time.Millisecond)
Michael Yang's avatar
lint  
Michael Yang committed
495
	require.Empty(t, s.finishedReqCh)
496
	s.loadedMu.Lock()
Michael Yang's avatar
lint  
Michael Yang committed
497
	require.Empty(t, s.loaded)
498
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
499
500
501
502
503
504
505

	// also shouldn't happen in real life
	s.finishedReqCh <- scenario1a.req
	time.Sleep(5 * time.Millisecond)
}

func TestUseLoadedRunner(t *testing.T) {
506
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
507
508
	req := &LlmRequest{
		ctx:             ctx,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
509
		opts:            api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
510
		successCh:       make(chan *runnerRef, 1),
511
		sessionDuration: &api.Duration{Duration: 2},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
512
513
	}
	finished := make(chan *LlmRequest)
514
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
515
	r1 := &runnerRef{llama: llm1, sessionDuration: 1, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
516
517
518
519
520
521
	req.useLoadedRunner(r1, finished)
	require.Equal(t, uint(1), r1.refCount)
	require.Equal(t, time.Duration(2), r1.sessionDuration)
	select {
	case success := <-req.successCh:
		require.Equal(t, r1, success)
522
523
	case err := <-req.errCh:
		t.Fatal(err.Error())
Daniel Hiltgen's avatar
Daniel Hiltgen committed
524
	case <-ctx.Done():
Daniel Hiltgen's avatar
Daniel Hiltgen committed
525
		t.Fatal("timeout")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
526
527
528
529
530
531
532
	}
	done()
	fin := <-finished
	require.Equal(t, req, fin)
}

func TestUpdateFreeSpace(t *testing.T) {
533
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
534
	defer done()
535
	gpus := discover.GpuInfoList{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
536
537
538
539
540
541
542
543
544
545
546
547
548
		{
			Library: "a",
			ID:      "1",
		},
		{
			Library: "a",
			ID:      "2",
		},
	}
	gpus[0].TotalMemory = 1000
	gpus[0].FreeMemory = 900
	gpus[1].TotalMemory = 2000
	gpus[1].FreeMemory = 1900
549
550
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 50, "2": 50}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{"1": 125, "2": 75}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
551
552
	r1 := &runnerRef{llama: llm1, gpus: gpus, numParallel: 1}
	r2 := &runnerRef{llama: llm2, gpus: gpus, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
553
554

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
555
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
556
557
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
558
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
559
560

	s.updateFreeSpace(gpus)
561
562
	require.Equal(t, uint64(1000-50-125), gpus[0].FreeMemory)
	require.Equal(t, uint64(2000-50-75), gpus[1].FreeMemory)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
563
564
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
565
566
567
func TestFilterGPUsWithoutLoadingModels(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
568
	gpus := discover.GpuInfoList{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
569
570
571
572
573
574
575
576
577
		{
			Library: "cuda",
			ID:      "0",
		},
		{
			Library: "cuda",
			ID:      "1",
		},
	}
578
	r1 := &runnerRef{gpus: discover.GpuInfoList{gpus[0]}, loading: true}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
579
580
581
582
583
584
585
586
587
588

	s := InitScheduler(ctx)
	s.loadedMu.Lock()
	s.loaded["a"] = r1
	s.loadedMu.Unlock()

	tmp := s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "1", tmp[0].ID)

589
	r1.gpus = discover.GpuInfoList{gpus[1]}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
590
591
592
593
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 1)
	require.Equal(t, "0", tmp[0].ID)

594
	r1.gpus = discover.GpuInfoList{}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
595
596
	tmp = s.filterGPUsWithoutLoadingModels(gpus)
	require.Len(t, tmp, 2)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
597
598
599
}

func TestFindRunnerToUnload(t *testing.T) {
600
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
601
	defer done()
602

Daniel Hiltgen's avatar
Daniel Hiltgen committed
603
604
	r1 := &runnerRef{refCount: 1, sessionDuration: 1, numParallel: 1}
	r2 := &runnerRef{sessionDuration: 2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
605
606

	s := InitScheduler(ctx)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
607
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
608
609
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
610
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
611

612
	resp := s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
613
614
	require.Equal(t, r2, resp)
	r2.refCount = 1
615
	resp = s.findRunnerToUnload()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
616
617
618
619
	require.Equal(t, r1, resp)
}

func TestNeedsReload(t *testing.T) {
620
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
621
622
	defer done()

623
	llm := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
624
	do := api.DefaultOptions()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
625
	runner := &runnerRef{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
626
627
628
629
630
631
632
		model: &Model{
			AdapterPaths:   []string{"adapter1"},
			ProjectorPaths: []string{"projector1"},
		},
		Options:     &do,
		llama:       llm,
		numParallel: 1,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
633
634
635
636
637
638
	}
	req := &LlmRequest{
		model: &Model{
			AdapterPaths:   []string{"adapter2"},
			ProjectorPaths: []string{"projector2"},
		},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
639
		opts: api.DefaultOptions(),
Daniel Hiltgen's avatar
Daniel Hiltgen committed
640
641
642
	}
	resp := runner.needsReload(ctx, req)
	require.True(t, resp)
643
	req.model.AdapterPaths = runner.model.AdapterPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
644
645
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
646
	req.model.ProjectorPaths = runner.model.ProjectorPaths
Daniel Hiltgen's avatar
Daniel Hiltgen committed
647
648
649
650
651
	runner.loading = true
	req.opts.NumBatch = 1234
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	req.opts.NumBatch = runner.Options.NumBatch
Michael Yang's avatar
lint  
Michael Yang committed
652
	llm.pingResp = errors.New("foo")
Daniel Hiltgen's avatar
Daniel Hiltgen committed
653
654
655
656
657
658
659
	resp = runner.needsReload(ctx, req)
	require.True(t, resp)
	llm.pingResp = nil
	resp = runner.needsReload(ctx, req)
	require.False(t, resp)
	req.opts.NumGPU = 99
	resp = runner.needsReload(ctx, req)
660
661
662
	require.True(t, resp)
	req.opts.NumGPU = -1
	resp = runner.needsReload(ctx, req)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
663
664
665
666
	require.False(t, resp)
}

func TestUnloadAllRunners(t *testing.T) {
667
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
668
669
	defer done()

670
671
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
	llm2 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
672
673
674
	s := InitScheduler(ctx)
	s.unloadAllRunners()

Daniel Hiltgen's avatar
Daniel Hiltgen committed
675
676
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{llama: llm2, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
677

Daniel Hiltgen's avatar
Daniel Hiltgen committed
678
	s.loadedMu.Lock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
679
680
	s.loaded["a"] = r1
	s.loaded["b"] = r2
Daniel Hiltgen's avatar
Daniel Hiltgen committed
681
	s.loadedMu.Unlock()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
682
683
684
685
686
687
688
	s.unloadAllRunners()

	require.True(t, llm1.closeCalled)
	require.True(t, llm2.closeCalled)
}

func TestUnload(t *testing.T) {
689
	llm1 := &mockLlm{estimatedVRAMByGPU: map[string]uint64{}}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
690
691
	r1 := &runnerRef{llama: llm1, numParallel: 1}
	r2 := &runnerRef{model: &Model{AdapterPaths: []string{"A"}}, numParallel: 1}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
692
693
694
	r1.unload()
	require.True(t, llm1.closeCalled)
	r2.unload()
695
	require.Nil(t, r2.model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
696
697
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
698
699
700
701
702
func TestAlreadyCanceled(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 500*time.Millisecond)
	defer done()
	dctx, done2 := context.WithCancel(ctx)
	done2()
703
	scenario1a := newScenarioRequest(t, dctx, "ollama-model-1", 10, &api.Duration{Duration: 0})
Daniel Hiltgen's avatar
Daniel Hiltgen committed
704
705
706
707
708
709
710
711
712
713
714
	s := InitScheduler(ctx)
	slog.Info("scenario1a")
	s.pendingReqCh <- scenario1a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	time.Sleep(5 * time.Millisecond)
	require.Empty(t, s.pendingReqCh)
	require.Empty(t, scenario1a.req.errCh)
	require.Empty(t, scenario1a.req.successCh)
}

715
716
717
718
719
func TestHomogeneousGPUs(t *testing.T) {
	ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond)
	defer done()
	s := InitScheduler(ctx)

720
	s.getGpuFn = func() discover.GpuInfoList {
721
		// Set memory values to require the model to be spread
722
		gpus := []discover.GpuInfo{
723
724
725
726
727
728
729
730
731
732
733
			{Library: "cuda"},
			{Library: "rocm"},
		}
		gpus[0].TotalMemory = 1 * format.GibiByte
		gpus[0].FreeMemory = 256 * format.MebiByte
		gpus[1].TotalMemory = 1 * format.GibiByte
		gpus[1].FreeMemory = 256 * format.MebiByte
		return gpus
	}
	s.getCpuFn = getCpuFn
	a := newScenarioRequest(t, ctx, "ollama-model-1", 10, &api.Duration{Duration: 5 * time.Millisecond})
Michael Yang's avatar
Michael Yang committed
734
	s.newServerFn = func(gpus discover.GpuInfoList, model string, f *ggml.GGML, adapters []string, projectors []string, opts api.Options, numParallel int) (llm.LlamaServer, error) {
735
		require.Len(t, gpus, 1)
Michael Yang's avatar
Michael Yang committed
736
		return a.newServer(gpus, model, f, adapters, projectors, opts, numParallel)
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
	}
	slog.Info("a")
	s.pendingReqCh <- a.req
	require.Len(t, s.pendingReqCh, 1)
	s.Run(ctx)
	select {
	case resp := <-a.req.successCh:
		require.Equal(t, resp.llama, a.srv)
		require.Empty(t, s.pendingReqCh)
		require.Empty(t, a.req.errCh)
	case err := <-a.req.errCh:
		t.Fatal(err.Error())
	case <-ctx.Done():
		t.Fatal("timeout")
	}
}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
754
// mockLlm is a test double whose methods return the canned values stored in
// its fields, so tests can script each call's outcome.
type mockLlm struct {
	// Results returned by Ping, WaitUntilRunning and Completion.
	pingResp       error
	waitResp       error
	completionResp error

	// Results returned by Embedding.
	embeddingResp    []float32
	embeddingRespErr error

	// Results returned by Tokenize and Detokenize. The spelling of
	// detonekizeRespErr (sic) is kept because Detokenize refers to it.
	tokenizeResp      []int
	tokenizeRespErr   error
	detokenizeResp    string
	detonekizeRespErr error

	// closeResp is returned by Close; closeCalled is set to true by Close.
	closeResp   error
	closeCalled bool

	// Values returned by EstimatedVRAM, EstimatedTotal and EstimatedVRAMByGPU.
	estimatedVRAM      uint64
	estimatedTotal     uint64
	estimatedVRAMByGPU map[string]uint64
}

// Ping returns the canned pingResp error; ctx is ignored.
func (s *mockLlm) Ping(ctx context.Context) error {
	return s.pingResp
}
// WaitUntilRunning returns the canned waitResp error; ctx is ignored.
func (s *mockLlm) WaitUntilRunning(ctx context.Context) error {
	return s.waitResp
}
// Completion returns the canned completionResp error. The request is ignored
// and fn is never invoked.
func (s *mockLlm) Completion(ctx context.Context, req llm.CompletionRequest, fn func(llm.CompletionResponse)) error {
	err := s.completionResp
	return err
}
Michael Yang's avatar
lint  
Michael Yang committed
776

777
778
func (s *mockLlm) Embedding(ctx context.Context, input string) ([]float32, error) {
	return s.embeddingResp, s.embeddingRespErr
Daniel Hiltgen's avatar
Daniel Hiltgen committed
779
}
Michael Yang's avatar
lint  
Michael Yang committed
780

Daniel Hiltgen's avatar
Daniel Hiltgen committed
781
782
783
// Tokenize returns the canned tokenizeResp tokens and tokenizeRespErr error;
// ctx and content are ignored.
func (s *mockLlm) Tokenize(ctx context.Context, content string) ([]int, error) {
	tokens, err := s.tokenizeResp, s.tokenizeRespErr
	return tokens, err
}
Michael Yang's avatar
lint  
Michael Yang committed
784

Daniel Hiltgen's avatar
Daniel Hiltgen committed
785
786
787
// Detokenize returns the canned detokenizeResp text and detonekizeRespErr
// error; ctx and tokens are ignored.
func (s *mockLlm) Detokenize(ctx context.Context, tokens []int) (string, error) {
	text, err := s.detokenizeResp, s.detonekizeRespErr
	return text, err
}
Michael Yang's avatar
lint  
Michael Yang committed
788

Daniel Hiltgen's avatar
Daniel Hiltgen committed
789
790
791
792
// Close records that it was called by setting closeCalled, then returns the
// canned closeResp error.
func (s *mockLlm) Close() error {
	err := s.closeResp
	s.closeCalled = true
	return err
}
793
794
// EstimatedVRAM returns the canned estimatedVRAM value.
func (s *mockLlm) EstimatedVRAM() uint64 {
	return s.estimatedVRAM
}
// EstimatedTotal returns the canned estimatedTotal value.
func (s *mockLlm) EstimatedTotal() uint64 {
	return s.estimatedTotal
}
Daniel Hiltgen's avatar
Daniel Hiltgen committed
795
// EstimatedVRAMByGPU returns the estimatedVRAMByGPU entry for gpuid. An id
// with no entry yields the map's zero value, 0.
func (s *mockLlm) EstimatedVRAMByGPU(gpuid string) uint64 {
	vram := s.estimatedVRAMByGPU[gpuid]
	return vram
}