"ppocr/data/imaug/text_image_aug/augment.py" did not exist on "55c28ed5b4d5d3482c7ad0bb5b8706eaf122a755"
concurrency_test.go 6.23 KB
Newer Older
Daniel Hiltgen's avatar
Daniel Hiltgen committed
1
2
3
4
5
6
7
//go:build integration

package integration

import (
	"context"
	"log/slog"
Michael Yang's avatar
Michael Yang committed
8
	"os"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
9
10
11
12
13
14
	"strconv"
	"sync"
	"testing"
	"time"

	"github.com/stretchr/testify/require"
Michael Yang's avatar
uint64  
Michael Yang committed
15
16
17

	"github.com/ollama/ollama/api"
	"github.com/ollama/ollama/format"
Daniel Hiltgen's avatar
Daniel Hiltgen committed
18
19
20
21
22
23
)

func TestMultiModelConcurrency(t *testing.T) {
	var (
		req = [2]api.GenerateRequest{
			{
Daniel Hiltgen's avatar
Daniel Hiltgen committed
24
25
26
27
				Model:     "orca-mini",
				Prompt:    "why is the ocean blue?",
				Stream:    &stream,
				KeepAlive: &api.Duration{Duration: 10 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
28
29
30
31
32
				Options: map[string]interface{}{
					"seed":        42,
					"temperature": 0.0,
				},
			}, {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
33
34
35
36
				Model:     "tinydolphin",
				Prompt:    "what is the origin of the us thanksgiving holiday?",
				Stream:    &stream,
				KeepAlive: &api.Duration{Duration: 10 * time.Second},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
37
38
39
40
41
42
43
				Options: map[string]interface{}{
					"seed":        42,
					"temperature": 0.0,
				},
			},
		}
		resp = [2][]string{
Michael Yang's avatar
Michael Yang committed
44
45
			{"sunlight"},
			{"england", "english", "massachusetts", "pilgrims", "british"},
Daniel Hiltgen's avatar
Daniel Hiltgen committed
46
47
48
49
		}
	)
	var wg sync.WaitGroup
	wg.Add(len(req))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
50
	ctx, cancel := context.WithTimeout(context.Background(), time.Second*240)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
51
	defer cancel()
52
53
54
55
56
57
58
59

	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	for i := 0; i < len(req); i++ {
		require.NoError(t, PullIfMissing(ctx, client, req[i].Model))
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
60
61
62
	for i := 0; i < len(req); i++ {
		go func(i int) {
			defer wg.Done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
63
			DoGenerate(ctx, t, client, req[i], resp[i], 60*time.Second, 10*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
64
65
66
67
68
69
		}(i)
	}
	wg.Wait()
}

func TestIntegrationConcurrentPredictOrcaMini(t *testing.T) {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
70
71
72
73
	req, resp := GenerateRequests()
	reqLimit := len(req)
	iterLimit := 5

Michael Yang's avatar
Michael Yang committed
74
75
	if s := os.Getenv("OLLAMA_MAX_VRAM"); s != "" {
		maxVram, err := strconv.ParseUint(s, 10, 64)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
76
77
		require.NoError(t, err)
		// Don't hammer on small VRAM cards...
Michael Yang's avatar
Michael Yang committed
78
		if maxVram < 4*format.GibiByte {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
79
80
81
82
83
84
			reqLimit = min(reqLimit, 2)
			iterLimit = 2
		}
	}

	ctx, cancel := context.WithTimeout(context.Background(), 9*time.Minute)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
85
86
87
88
89
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	// Get the server running (if applicable) warm the model up with a single initial request
Daniel Hiltgen's avatar
Daniel Hiltgen committed
90
	DoGenerate(ctx, t, client, req[0], resp[0], 60*time.Second, 10*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
91
92

	var wg sync.WaitGroup
Daniel Hiltgen's avatar
Daniel Hiltgen committed
93
94
	wg.Add(reqLimit)
	for i := 0; i < reqLimit; i++ {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
95
96
		go func(i int) {
			defer wg.Done()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
97
			for j := 0; j < iterLimit; j++ {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
98
				slog.Info("Starting", "req", i, "iter", j)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
99
				// On slower GPUs it can take a while to process the concurrent requests
Daniel Hiltgen's avatar
Daniel Hiltgen committed
100
				// so we allow a much longer initial timeout
Daniel Hiltgen's avatar
Daniel Hiltgen committed
101
				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 20*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
102
103
104
105
106
107
108
109
			}
		}(i)
	}
	wg.Wait()
}

// Stress the system if we know how much VRAM it has, and attempt to load more models than will fit
func TestMultiModelStress(t *testing.T) {
Michael Yang's avatar
uint64  
Michael Yang committed
110
111
	s := os.Getenv("OLLAMA_MAX_VRAM") // TODO - discover actual VRAM
	if s == "" {
Daniel Hiltgen's avatar
Daniel Hiltgen committed
112
113
		t.Skip("OLLAMA_MAX_VRAM not specified, can't pick the right models for the stress test")
	}
Michael Yang's avatar
uint64  
Michael Yang committed
114
115
116
117
118
119

	maxVram, err := strconv.ParseUint(s, 10, 64)
	if err != nil {
		t.Fatal(err)
	}

Daniel Hiltgen's avatar
Daniel Hiltgen committed
120
121
122
123
124
125
126
127
	type model struct {
		name string
		size uint64 // Approximate amount of VRAM they typically use when fully loaded in VRAM
	}

	smallModels := []model{
		{
			name: "orca-mini",
Michael Yang's avatar
uint64  
Michael Yang committed
128
			size: 2992 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
129
130
131
		},
		{
			name: "phi",
Michael Yang's avatar
uint64  
Michael Yang committed
132
			size: 2616 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
133
134
135
		},
		{
			name: "gemma:2b",
Michael Yang's avatar
uint64  
Michael Yang committed
136
			size: 2364 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
137
138
139
		},
		{
			name: "stable-code:3b",
Michael Yang's avatar
uint64  
Michael Yang committed
140
			size: 2608 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
141
142
143
		},
		{
			name: "starcoder2:3b",
Michael Yang's avatar
uint64  
Michael Yang committed
144
			size: 2166 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
145
146
147
148
149
		},
	}
	mediumModels := []model{
		{
			name: "llama2",
Michael Yang's avatar
uint64  
Michael Yang committed
150
			size: 5118 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
151
152
153
		},
		{
			name: "mistral",
Michael Yang's avatar
uint64  
Michael Yang committed
154
			size: 4620 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
155
156
157
		},
		{
			name: "orca-mini:7b",
Michael Yang's avatar
uint64  
Michael Yang committed
158
			size: 5118 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
159
160
161
		},
		{
			name: "dolphin-mistral",
Michael Yang's avatar
uint64  
Michael Yang committed
162
			size: 4620 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
163
164
165
		},
		{
			name: "gemma:7b",
Michael Yang's avatar
uint64  
Michael Yang committed
166
167
168
169
170
			size: 5000 * format.MebiByte,
		},
		{
			name: "codellama:7b",
			size: 5118 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
171
172
173
174
175
176
177
		},
	}

	// These seem to be too slow to be useful...
	// largeModels := []model{
	// 	{
	// 		name: "llama2:13b",
Michael Yang's avatar
uint64  
Michael Yang committed
178
	// 		size: 7400 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
179
180
181
	// 	},
	// 	{
	// 		name: "codellama:13b",
Michael Yang's avatar
uint64  
Michael Yang committed
182
	// 		size: 7400 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
183
184
185
	// 	},
	// 	{
	// 		name: "orca-mini:13b",
Michael Yang's avatar
uint64  
Michael Yang committed
186
	// 		size: 7400 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
187
188
189
	// 	},
	// 	{
	// 		name: "gemma:7b",
Michael Yang's avatar
uint64  
Michael Yang committed
190
	// 		size: 5000 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
191
192
193
	// 	},
	// 	{
	// 		name: "starcoder2:15b",
Michael Yang's avatar
uint64  
Michael Yang committed
194
	// 		size: 9100 * format.MebiByte,
Daniel Hiltgen's avatar
Daniel Hiltgen committed
195
196
197
198
199
	// 	},
	// }

	var chosenModels []model
	switch {
Michael Yang's avatar
uint64  
Michael Yang committed
200
	case maxVram < 10000*format.MebiByte:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
201
202
		slog.Info("selecting small models")
		chosenModels = smallModels
Michael Yang's avatar
uint64  
Michael Yang committed
203
	// case maxVram < 30000*format.MebiByte:
Daniel Hiltgen's avatar
Daniel Hiltgen committed
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
	default:
		slog.Info("selecting medium models")
		chosenModels = mediumModels
		// default:
		// 	slog.Info("selecting large models")
		// 	chosenModels = largModels
	}

	req, resp := GenerateRequests()

	for i := range req {
		if i > len(chosenModels) {
			break
		}
		req[i].Model = chosenModels[i].name
	}

	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute) // TODO baseline -- 10m too short
	defer cancel()
	client, _, cleanup := InitServerConnection(ctx, t)
	defer cleanup()

	// Make sure all the models are pulled before we get started
	for _, r := range req {
		require.NoError(t, PullIfMissing(ctx, client, r.Model))
	}

	var wg sync.WaitGroup
Michael Yang's avatar
uint64  
Michael Yang committed
232
	consumed := uint64(256 * format.MebiByte) // Assume some baseline usage
Daniel Hiltgen's avatar
Daniel Hiltgen committed
233
234
	for i := 0; i < len(req); i++ {
		// Always get at least 2 models, but dont' overshoot VRAM too much or we'll take too long
Michael Yang's avatar
Michael Yang committed
235
236
		if i > 1 && consumed > maxVram {
			slog.Info("achieved target vram exhaustion", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
237
238
239
			break
		}
		consumed += chosenModels[i].size
Michael Yang's avatar
Michael Yang committed
240
		slog.Info("target vram", "count", i, "vram", format.HumanBytes2(maxVram), "models", format.HumanBytes2(consumed))
Daniel Hiltgen's avatar
Daniel Hiltgen committed
241
242
243
244
245
246

		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			for j := 0; j < 3; j++ {
				slog.Info("Starting", "req", i, "iter", j, "model", req[i].Model)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
247
				DoGenerate(ctx, t, client, req[i], resp[i], 120*time.Second, 5*time.Second)
Daniel Hiltgen's avatar
Daniel Hiltgen committed
248
249
250
			}
		}(i)
	}
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
	go func() {
		for {
			time.Sleep(2 * time.Second)
			select {
			case <-ctx.Done():
				return
			default:
				models, err := client.ListRunning(ctx)
				if err != nil {
					slog.Warn("failed to list running models", "error", err)
					continue
				}
				for _, m := range models.Models {
					slog.Info("loaded model snapshot", "model", m)
				}
			}
		}
	}()
Daniel Hiltgen's avatar
Daniel Hiltgen committed
269
270
	wg.Wait()
}