//go:build integration

package integration

import (
	"context"
	"fmt"
	"log/slog"
	"math"
	"os"
	"strconv"
	"sync"
	"testing"
	"time"

	"github.com/stretchr/testify/require"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
)

func TestMultiModelConcurrency(t *testing.T) {
	var (
		req = [2]api.GenerateRequest{
			{
26
				Model:     smol,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
27
28
29
				Prompt:    "why is the ocean blue?",
				Stream:    &stream,
				KeepAlive: &api.Duration{Duration: 10 * time.Second},
30
				Options: map[string]any{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
31
32
33
34
					"seed":        42,
					"temperature": 0.0,
				},
			}, {
35
				Model:     "qwen3:0.6b",
Daniel Hiltgen's avatar
Daniel Hiltgen committed
36
37
38
				Prompt:    "what is the origin of the us thanksgiving holiday?",
				Stream:    &stream,
				KeepAlive: &api.Duration{Duration: 10 * time.Second},
39
				Options: map[string]any{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
40
41
42
43
44
45
					"seed":        42,
					"temperature": 0.0,
				},
			},
		}
		resp = [2][]string{
Michael Yang's avatar
Michael Yang committed
46
			{"sunlight"},
47
			{"england", "english", "massachusetts", "pilgrims", "british", "festival"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
48
49
50
51
		}
	)
	var wg sync.WaitGroup
	wg.Add(len(req))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
52
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
53
	defer cancel()
54
55
56
57
58
59
60
61

	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	for i := 0; i < len(req); i++ {
		require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
62
63
64
	for i := 0; i < len(req); i++ {
		go func(i int) {
			defer wg.Done()
65
66
			// Note: CPU based inference can crawl so don't give up too quickly
			DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 30*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
67
68
69
70
71
		}(i)
	}
	wg.Wait()
}

72
func TestIntegrationConcurrentPredict(t *testing.T) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
73
74
75
76
	req, resp := GenerateRequests()
	reqLimit := len(req)
	iterLimit := 5

Michael Yang's avatar
Michael Yang committed
77
78
	if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
		maxVram, err := strconv.ParseUint(s, 10, 64)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
79
80
		require.NoError(t, err)
		// Don't hammer on small VRAM cards...
Michael Yang's avatar
Michael Yang committed
81
		if maxVram < 4*format.GibiByte {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
82
83
84
85
86
87
			reqLimit = min(reqLimit, 2)
			iterLimit = 2
		}
	}

	ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
88
89
90
91
92
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	// Get the server running (if applicable) warm the model up with a single initial request
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
94
95

	var wg sync.WaitGroup
Daniel Hiltgen's avatar
Daniel Hiltgen committed
96
97
	wg.Add(reqLimit)
	for i := 0; i < reqLimit; i++ {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
98
99
		go func(i int) {
			defer wg.Done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
100
			for j := 0; j < iterLimit; j++ {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
101
				slog.Info("Starting", "req", i, "iter", j)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
102
				// On slower GPUs it can take a while to process the concurrent requests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
103
				// so we allow a much longer initial timeout
Daniel Hiltgen's avatar
Daniel Hiltgen committed
104
				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
105
106
107
108
109
110
111
112
			}
		}(i)
	}
	wg.Wait()
}

// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
func TestMultiModelStress(t *testing.T) {
Michael Yang's avatar
uint64  
Michael Yang committed
113
114
	s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
	if s == "" {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
115
116
		t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
	}
Michael Yang's avatar
uint64  
Michael Yang committed
117
118
119
120
121

	maxVram, err := strconv.ParseUint(s, 10, 64)
	if err != nil {
		t.Fatal(err)
	}
122
123
124
	if maxVram < 2*format.GibiByte {
		t.Skip("VRAM less than 2G, skipping model stress tests")
	}
Michael Yang's avatar
uint64  
Michael Yang committed
125

Daniel Hiltgen's avatar
Daniel Hiltgen committed
126
127
128
129
130
131
132
	type model struct {
		name string
		size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
	}

	smallModels := []model{
		{
133
134
			name: "llama3.2:1b",
			size: 2876 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
135
136
		},
		{
137
138
			name: "qwen3:0.6b",
			size: 1600 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
139
140
141
		},
		{
			name: "gemma:2b",
Michael Yang's avatar
uint64  
Michael Yang committed
142
			size: 2364 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
143
144
		},
		{
145
146
			name: "deepseek-r1:1.5b",
			size: 2048 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
147
148
149
		},
		{
			name: "starcoder2:3b",
Michael Yang's avatar
uint64  
Michael Yang committed
150
			size: 2166 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
151
152
153
		},
	}
	mediumModels := []model{
154
155
156
157
		{
			name: "qwen3:8b",
			size: 6600 * format.MebiByte,
		},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
158
159
		{
			name: "llama2",
Michael Yang's avatar
uint64  
Michael Yang committed
160
			size: 5118 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
161
162
		},
		{
163
164
			name: "deepseek-r1:7b",
			size: 5600 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
165
166
		},
		{
167
168
			name: "mistral",
			size: 4620 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
169
170
171
		},
		{
			name: "dolphin-mistral",
Michael Yang's avatar
uint64  
Michael Yang committed
172
			size: 4620 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
173
174
175
		},
		{
			name: "gemma:7b",
Michael Yang's avatar
uint64  
Michael Yang committed
176
177
178
179
180
			size: 5000 * format.MebiByte,
		},
		{
			name: "codellama:7b",
			size: 5118 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
181
182
183
184
185
186
187
		},
	}

	// These seem to be too slow to be useful...
	// largeModels := []model{
	// 	{
	// 		name: "llama2:13b",
Michael Yang's avatar
uint64  
Michael Yang committed
188
	// 		size: 7400 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
189
190
191
	// 	},
	// 	{
	// 		name: "codellama:13b",
Michael Yang's avatar
uint64  
Michael Yang committed
192
	// 		size: 7400 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
193
194
195
	// 	},
	// 	{
	// 		name: "orca-mini:13b",
Michael Yang's avatar
uint64  
Michael Yang committed
196
	// 		size: 7400 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
197
198
199
	// 	},
	// 	{
	// 		name: "gemma:7b",
Michael Yang's avatar
uint64  
Michael Yang committed
200
	// 		size: 5000 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
201
202
203
	// 	},
	// 	{
	// 		name: "starcoder2:15b",
Michael Yang's avatar
uint64  
Michael Yang committed
204
	// 		size: 9100 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
205
206
207
208
209
	// 	},
	// }

	var chosenModels []model
	switch {
Michael Yang's avatar
uint64  
Michael Yang committed
210
	case maxVram < 10000*format.MebiByte:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
211
212
		slog.Info("selecting small models")
		chosenModels = smallModels
Michael Yang's avatar
uint64  
Michael Yang committed
213
	// case maxVram < 30000*format.MebiByte:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
214
215
216
217
218
	default:
		slog.Info("selecting medium models")
		chosenModels = mediumModels
		// default:
		// 	slog.Info("selecting large models")
219
		// 	chosenModels = largeModels
Daniel Hiltgen's avatar
Daniel Hiltgen committed
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
	}

	req, resp := GenerateRequests()

	for i := range req {
		if i > len(chosenModels) {
			break
		}
		req[i].Model = chosenModels[i].name
	}

	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) // TODO baseline -- 10m too short
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	// Make sure all the models are pulled before we get started
	for _, r := range req {
		require.NoError(t, PullIfMissing(ctx, client, r.Model))
	}

	var wg sync.WaitGroup
Michael Yang's avatar
uint64  
Michael Yang committed
242
	consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
Daniel Hiltgen's avatar
Daniel Hiltgen committed
243
	for i := 0; i < len(req); i++ {
244
		// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
Michael Yang's avatar
Michael Yang committed
245
246
		if i > 1 && consumed > maxVram {
			slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
247
248
249
			break
		}
		consumed += chosenModels[i].size
Michael Yang's avatar
Michael Yang committed
250
		slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
251
252
253
254
255
256

		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			for j := 0; j < 3; j++ {
				slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
257
				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
258
259
260
			}
		}(i)
	}
261
262
	go func() {
		for {
263
			time.Sleep(10 * time.Second)
264
265
266
267
268
269
270
271
272
273
			select {
			case <-ctx.Done():
				return
			default:
				models, err := client.ListRunning(ctx)
				if err != nil {
					slog.Warn("failed to list running models", "error", err)
					continue
				}
				for _, m := range models.Models {
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
					var procStr string
					switch {
					case m.SizeVRAM == 0:
						procStr = "100% CPU"
					case m.SizeVRAM == m.Size:
						procStr = "100% GPU"
					case m.SizeVRAM > m.Size || m.Size == 0:
						procStr = "Unknown"
					default:
						sizeCPU := m.Size - m.SizeVRAM
						cpuPercent := math.Round(float64(sizeCPU) / float64(m.Size) * 100)
						procStr = fmt.Sprintf("%d%%/%d%%", int(cpuPercent), int(100-cpuPercent))
					}

					slog.Info("loaded model snapshot", "model", m.Name, "CPU/GPU", procStr, "expires", format.HumanTime(m.ExpiresAt, "Never"))
289
290
291
292
				}
			}
		}
	}()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
293
294
	wg.Wait()
}