concurrency_test.go 6.3 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
//go:build integration

package integration

import (
	"context"
	"log/slog"
	"os"
	"strconv"
	"sync"
	"testing"
	"time"

	"github.com/stretchr/testify/require"

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/envconfig"
	"github.com/ollama/ollama/format"
)

func TestMultiModelConcurrency(t *testing.T) {
	var (
		req = [2]api.GenerateRequest{
			{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
24
25
26
27
				Model:     "orca-mini",
				Prompt:    "why is the ocean blue?",
				Stream:    &stream,
				KeepAlive: &api.Duration{Duration: 10 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
28
29
30
31
32
				Options: map[string]interface{}{
					"seed":        42,
					"temperature": 0.0,
				},
			}, {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
34
35
36
				Model:     "tinydolphin",
				Prompt:    "what is the origin of the us thanksgiving holiday?",
				Stream:    &stream,
				KeepAlive: &api.Duration{Duration: 10 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
37
38
39
40
41
42
43
44
				Options: map[string]interface{}{
					"seed":        42,
					"temperature": 0.0,
				},
			},
		}
		resp = [2][]string{
			[]string{"sunlight"},
45
			[]string{"england", "english", "massachusetts", "pilgrims", "british"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
47
48
49
		}
	)
	var wg sync.WaitGroup
	wg.Add(len(req))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
50
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
51
	defer cancel()
52
53
54
55
56
57
58
59

	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	for i := 0; i < len(req); i++ {
		require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
61
62
	for i := 0; i < len(req); i++ {
		go func(i int) {
			defer wg.Done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
63
			DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
64
65
66
67
68
69
		}(i)
	}
	wg.Wait()
}

func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
70
71
72
73
	req, resp := GenerateRequests()
	reqLimit := len(req)
	iterLimit := 5

74
	vram := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
Daniel Hiltgen's avatar
Daniel Hiltgen committed
75
76
77
78
79
80
81
82
83
84
85
	if vram != "" {
		max, err := strconv.ParseUint(vram, 10, 64)
		require.NoError(t, err)
		// Don't hammer on small VRAM cards...
		if max < 4*1024*1024*1024 {
			reqLimit = min(reqLimit, 2)
			iterLimit = 2
		}
	}

	ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
86
87
88
89
90
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	// Get the server running (if applicable) warm the model up with a single initial request
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
92
93

	var wg sync.WaitGroup
Daniel Hiltgen's avatar
Daniel Hiltgen committed
94
95
	wg.Add(reqLimit)
	for i := 0; i < reqLimit; i++ {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
96
97
		go func(i int) {
			defer wg.Done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
98
			for j := 0; j < iterLimit; j++ {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
99
				slog.Info("Starting", "req", i, "iter", j)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
100
				// On slower GPUs it can take a while to process the concurrent requests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
101
				// so we allow a much longer initial timeout
Daniel Hiltgen's avatar
Daniel Hiltgen committed
102
				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
103
104
105
106
107
108
109
110
			}
		}(i)
	}
	wg.Wait()
}

// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
func TestMultiModelStress(t *testing.T) {
Michael Yang's avatar
uint64  
Michael Yang committed
111
112
	s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
	if s == "" {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
113
114
		t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
	}
Michael Yang's avatar
uint64  
Michael Yang committed
115
116
117
118
119
120

	maxVram, err := strconv.ParseUint(s, 10, 64)
	if err != nil {
		t.Fatal(err)
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
121
122
123
124
125
126
127
128
	type model struct {
		name string
		size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
	}

	smallModels := []model{
		{
			name: "orca-mini",
Michael Yang's avatar
uint64  
Michael Yang committed
129
			size: 2992 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
130
131
132
		},
		{
			name: "phi",
Michael Yang's avatar
uint64  
Michael Yang committed
133
			size: 2616 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
134
135
136
		},
		{
			name: "gemma:2b",
Michael Yang's avatar
uint64  
Michael Yang committed
137
			size: 2364 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
138
139
140
		},
		{
			name: "stable-code:3b",
Michael Yang's avatar
uint64  
Michael Yang committed
141
			size: 2608 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
142
143
144
		},
		{
			name: "starcoder2:3b",
Michael Yang's avatar
uint64  
Michael Yang committed
145
			size: 2166 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
146
147
148
149
150
		},
	}
	mediumModels := []model{
		{
			name: "llama2",
Michael Yang's avatar
uint64  
Michael Yang committed
151
			size: 5118 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
152
153
154
		},
		{
			name: "mistral",
Michael Yang's avatar
uint64  
Michael Yang committed
155
			size: 4620 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
156
157
158
		},
		{
			name: "orca-mini:7b",
Michael Yang's avatar
uint64  
Michael Yang committed
159
			size: 5118 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
160
161
162
		},
		{
			name: "dolphin-mistral",
Michael Yang's avatar
uint64  
Michael Yang committed
163
			size: 4620 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
164
165
166
		},
		{
			name: "gemma:7b",
Michael Yang's avatar
uint64  
Michael Yang committed
167
168
169
170
171
			size: 5000 * format.MebiByte,
		},
		{
			name: "codellama:7b",
			size: 5118 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
172
173
174
175
176
177
178
		},
	}

	// These seem to be too slow to be useful...
	// largeModels := []model{
	// 	{
	// 		name: "llama2:13b",
Michael Yang's avatar
uint64  
Michael Yang committed
179
	// 		size: 7400 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
180
181
182
	// 	},
	// 	{
	// 		name: "codellama:13b",
Michael Yang's avatar
uint64  
Michael Yang committed
183
	// 		size: 7400 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
184
185
186
	// 	},
	// 	{
	// 		name: "orca-mini:13b",
Michael Yang's avatar
uint64  
Michael Yang committed
187
	// 		size: 7400 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
188
189
190
	// 	},
	// 	{
	// 		name: "gemma:7b",
Michael Yang's avatar
uint64  
Michael Yang committed
191
	// 		size: 5000 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
192
193
194
	// 	},
	// 	{
	// 		name: "starcoder2:15b",
Michael Yang's avatar
uint64  
Michael Yang committed
195
	// 		size: 9100 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
196
197
198
199
200
	// 	},
	// }

	var chosenModels []model
	switch {
Michael Yang's avatar
uint64  
Michael Yang committed
201
	case maxVram < 10000*format.MebiByte:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
202
203
		slog.Info("selecting small models")
		chosenModels = smallModels
Michael Yang's avatar
uint64  
Michael Yang committed
204
	// case maxVram < 30000*format.MebiByte:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
	default:
		slog.Info("selecting medium models")
		chosenModels = mediumModels
		// default:
		// 	slog.Info("selecting large models")
		// 	chosenModels = largModels
	}

	req, resp := GenerateRequests()

	for i := range req {
		if i > len(chosenModels) {
			break
		}
		req[i].Model = chosenModels[i].name
	}

	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) // TODO baseline -- 10m too short
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	// Make sure all the models are pulled before we get started
	for _, r := range req {
		require.NoError(t, PullIfMissing(ctx, client, r.Model))
	}

	var wg sync.WaitGroup
Michael Yang's avatar
uint64  
Michael Yang committed
233
	consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
Daniel Hiltgen's avatar
Daniel Hiltgen committed
234
235
	for i := 0; i < len(req); i++ {
		// Always get at least 2 models, but dont' overshoot VRAM too much or we'll take too long
Michael Yang's avatar
uint64  
Michael Yang committed
236
237
		if i > 1 && consumed > vram {
			slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
238
239
240
			break
		}
		consumed += chosenModels[i].size
Michael Yang's avatar
uint64  
Michael Yang committed
241
		slog.Info("target vram", "count", i, "vram", format.HumanBytes2(vram), "models", format.HumanBytes2(consumed))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
242
243
244
245
246
247

		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			for j := 0; j < 3; j++ {
				slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
248
				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
249
250
251
			}
		}(i)
	}
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
	go func() {
		for {
			time.Sleep(2 * time.Second)
			select {
			case <-ctx.Done():
				return
			default:
				models, err := client.ListRunning(ctx)
				if err != nil {
					slog.Warn("failed to list running models", "error", err)
					continue
				}
				for _, m := range models.Models {
					slog.Info("loaded model snapshot", "model", m)
				}
			}
		}
	}()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
270
271
	wg.Wait()
}