//go:build integration

package integration

import (
	"context"
	"log/slog"
	"os"
	"strconv"
	"sync"
	"testing"
	"time"

	"github.com/stretchr/testify/require"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
)

func TestMultiModelConcurrency(t *testing.T) {
	var (
		req = [2]api.GenerateRequest{
			{
24
				Model:     "llama3.2:1b",
Daniel Hiltgen's avatar
Daniel Hiltgen committed
25
26
27
				Prompt:    "why is the ocean blue?",
				Stream:    &stream,
				KeepAlive: &api.Duration{Duration: 10 * time.Second},
28
				Options: map[string]any{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
29
30
31
32
					"seed":        42,
					"temperature": 0.0,
				},
			}, {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
34
35
36
				Model:     "tinydolphin",
				Prompt:    "what is the origin of the us thanksgiving holiday?",
				Stream:    &stream,
				KeepAlive: &api.Duration{Duration: 10 * time.Second},
37
				Options: map[string]any{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
38
39
40
41
42
43
					"seed":        42,
					"temperature": 0.0,
				},
			},
		}
		resp = [2][]string{
Michael Yang's avatar
Michael Yang committed
44
			{"sunlight"},
45
			{"england", "english", "massachusetts", "pilgrims", "british", "festival"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
47
48
49
		}
	)
	var wg sync.WaitGroup
	wg.Add(len(req))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
50
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
51
	defer cancel()
52
53
54
55
56
57
58
59

	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	for i := 0; i < len(req); i++ {
		require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
61
62
	for i := 0; i < len(req); i++ {
		go func(i int) {
			defer wg.Done()
63
64
			// Note: CPU based inference can crawl so don't give up too quickly
			DoGenerate(ctx, t, client, req[i], resp[i], 90*time.Second, 30*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
65
66
67
68
69
		}(i)
	}
	wg.Wait()
}

70
func TestIntegrationConcurrentPredict(t *testing.T) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
71
72
73
74
	req, resp := GenerateRequests()
	reqLimit := len(req)
	iterLimit := 5

Michael Yang's avatar
Michael Yang committed
75
76
	if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
		maxVram, err := strconv.ParseUint(s, 10, 64)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
77
78
		require.NoError(t, err)
		// Don't hammer on small VRAM cards...
Michael Yang's avatar
Michael Yang committed
79
		if maxVram < 4*format.GibiByte {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
80
81
82
83
84
85
			reqLimit = min(reqLimit, 2)
			iterLimit = 2
		}
	}

	ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
86
87
88
89
90
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	// Get the server running (if applicable) warm the model up with a single initial request
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
93

	var wg sync.WaitGroup
Daniel Hiltgen's avatar
Daniel Hiltgen committed
94
95
	wg.Add(reqLimit)
	for i := 0; i < reqLimit; i++ {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
96
97
		go func(i int) {
			defer wg.Done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
98
			for j := 0; j < iterLimit; j++ {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
99
				slog.Info("Starting", "req", i, "iter", j)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
100
				// On slower GPUs it can take a while to process the concurrent requests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
101
				// so we allow a much longer initial timeout
Daniel Hiltgen's avatar
Daniel Hiltgen committed
102
				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
103
104
105
106
107
108
109
110
			}
		}(i)
	}
	wg.Wait()
}

// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
func TestMultiModelStress(t *testing.T) {
Michael Yang's avatar
uint64  
Michael Yang committed
111
112
	s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
	if s == "" {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
113
114
		t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
	}
Michael Yang's avatar
uint64  
Michael Yang committed
115
116
117
118
119

	maxVram, err := strconv.ParseUint(s, 10, 64)
	if err != nil {
		t.Fatal(err)
	}
120
121
122
	if maxVram < 2*format.GibiByte {
		t.Skip("VRAM less than 2G, skipping model stress tests")
	}
Michael Yang's avatar
uint64  
Michael Yang committed
123

Daniel Hiltgen's avatar
Daniel Hiltgen committed
124
125
126
127
128
129
130
	type model struct {
		name string
		size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
	}

	smallModels := []model{
		{
131
132
			name: "llama3.2:1b",
			size: 2876 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
133
134
135
		},
		{
			name: "phi",
Michael Yang's avatar
uint64  
Michael Yang committed
136
			size: 2616 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
137
138
139
		},
		{
			name: "gemma:2b",
Michael Yang's avatar
uint64  
Michael Yang committed
140
			size: 2364 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
141
142
143
		},
		{
			name: "stable-code:3b",
Michael Yang's avatar
uint64  
Michael Yang committed
144
			size: 2608 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
145
146
147
		},
		{
			name: "starcoder2:3b",
Michael Yang's avatar
uint64  
Michael Yang committed
148
			size: 2166 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
149
150
151
152
153
		},
	}
	mediumModels := []model{
		{
			name: "llama2",
Michael Yang's avatar
uint64  
Michael Yang committed
154
			size: 5118 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
155
156
157
		},
		{
			name: "mistral",
Michael Yang's avatar
uint64  
Michael Yang committed
158
			size: 4620 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
159
160
161
		},
		{
			name: "orca-mini:7b",
Michael Yang's avatar
uint64  
Michael Yang committed
162
			size: 5118 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
163
164
165
		},
		{
			name: "dolphin-mistral",
Michael Yang's avatar
uint64  
Michael Yang committed
166
			size: 4620 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
167
168
169
		},
		{
			name: "gemma:7b",
Michael Yang's avatar
uint64  
Michael Yang committed
170
171
172
173
174
			size: 5000 * format.MebiByte,
		},
		{
			name: "codellama:7b",
			size: 5118 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
175
176
177
178
179
180
181
		},
	}

	// These seem to be too slow to be useful...
	// largeModels := []model{
	// 	{
	// 		name: "llama2:13b",
Michael Yang's avatar
uint64  
Michael Yang committed
182
	// 		size: 7400 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
183
184
185
	// 	},
	// 	{
	// 		name: "codellama:13b",
Michael Yang's avatar
uint64  
Michael Yang committed
186
	// 		size: 7400 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
187
188
189
	// 	},
	// 	{
	// 		name: "orca-mini:13b",
Michael Yang's avatar
uint64  
Michael Yang committed
190
	// 		size: 7400 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
191
192
193
	// 	},
	// 	{
	// 		name: "gemma:7b",
Michael Yang's avatar
uint64  
Michael Yang committed
194
	// 		size: 5000 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
195
196
197
	// 	},
	// 	{
	// 		name: "starcoder2:15b",
Michael Yang's avatar
uint64  
Michael Yang committed
198
	// 		size: 9100 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
199
200
201
202
203
	// 	},
	// }

	var chosenModels []model
	switch {
Michael Yang's avatar
uint64  
Michael Yang committed
204
	case maxVram < 10000*format.MebiByte:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
205
206
		slog.Info("selecting small models")
		chosenModels = smallModels
Michael Yang's avatar
uint64  
Michael Yang committed
207
	// case maxVram < 30000*format.MebiByte:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
208
209
210
211
212
	default:
		slog.Info("selecting medium models")
		chosenModels = mediumModels
		// default:
		// 	slog.Info("selecting large models")
213
		// 	chosenModels = largeModels
Daniel Hiltgen's avatar
Daniel Hiltgen committed
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
	}

	req, resp := GenerateRequests()

	for i := range req {
		if i > len(chosenModels) {
			break
		}
		req[i].Model = chosenModels[i].name
	}

	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) // TODO baseline -- 10m too short
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	// Make sure all the models are pulled before we get started
	for _, r := range req {
		require.NoError(t, PullIfMissing(ctx, client, r.Model))
	}

	var wg sync.WaitGroup
Michael Yang's avatar
uint64  
Michael Yang committed
236
	consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
Daniel Hiltgen's avatar
Daniel Hiltgen committed
237
	for i := 0; i < len(req); i++ {
238
		// Always get at least 2 models, but don't overshoot VRAM too much or we'll take too long
Michael Yang's avatar
Michael Yang committed
239
240
		if i > 1 && consumed > maxVram {
			slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
241
242
243
			break
		}
		consumed += chosenModels[i].size
Michael Yang's avatar
Michael Yang committed
244
		slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
245
246
247
248
249
250

		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			for j := 0; j < 3; j++ {
				slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
251
				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
252
253
254
			}
		}(i)
	}
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
	go func() {
		for {
			time.Sleep(2 * time.Second)
			select {
			case <-ctx.Done():
				return
			default:
				models, err := client.ListRunning(ctx)
				if err != nil {
					slog.Warn("failed to list running models", "error", err)
					continue
				}
				for _, m := range models.Models {
					slog.Info("loaded model snapshot", "model", m)
				}
			}
		}
	}()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
273
274
	wg.Wait()
}