import asyncio
import itertools
import unittest

import requests

from sglang.test.test_utils import (
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST_FP8,
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
    CustomTestCase,
    is_in_amd_ci,
    is_in_ci,
    run_bench_serving,
    write_github_step_summary,
)


class TestBenchServing(CustomTestCase):
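    """Performance regression tests built on run_bench_serving.

    Each test launches a server, runs the serving benchmark, and (in CI) writes a
    GitHub step summary before asserting throughput/latency thresholds. Thresholds
    are branched on is_in_amd_ci() because the AMD and NVIDIA CI runners differ.
    """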
    def test_offline_throughput_default(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=[],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_offline_throughput_default\n"
                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
            if is_in_amd_ci():
                self.assertGreater(res["output_throughput"], 3050)
            else:
                self.assertGreater(res["output_throughput"], 3800)

    def test_offline_throughput_non_stream_small_batch_size(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=200,
            request_rate=float("inf"),
            other_server_args=["--max-running-requests", "10"],
            dataset_name="sharegpt",
            random_input_len=None,
            random_output_len=None,
            disable_stream=True,
            need_warmup=True,
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_offline_throughput_non_stream_small_batch_size\n"
                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
            self.assertGreater(res["output_throughput"], 1050)

    def test_offline_throughput_without_radix_cache(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=["--disable-radix-cache"],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_offline_throughput_without_radix_cache\n"
                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
            if is_in_amd_ci():
                self.assertGreater(res["output_throughput"], 3050)
            else:
                self.assertGreater(res["output_throughput"], 3800)

    def test_offline_throughput_without_chunked_prefill(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=["--chunked-prefill-size", "-1"],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_offline_throughput_without_chunked_prefill\n"
                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
            self.assertGreater(res["output_throughput"], 2600)

    def test_offline_throughput_with_triton_attention_backend(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=[
                "--attention-backend",
                "triton",
                "--context-length",
                "8192",
            ],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_offline_throughput_with_triton_attention_backend\n"
                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
            if is_in_amd_ci():
                self.assertGreater(res["output_throughput"], 3500)
            else:
                self.assertGreater(res["output_throughput"], 3700)

    def test_offline_throughput_default_fp8(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST_FP8,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=[],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_offline_throughput_default_fp8\n"
                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
            if is_in_amd_ci():
                self.assertGreater(res["output_throughput"], 3500)
            else:
                self.assertGreater(res["output_throughput"], 4300)

    def test_online_latency_default(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=100,
            request_rate=1,
            other_server_args=[],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_online_latency_default\n"
                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
            )
            self.assertLess(res["median_e2e_latency_ms"], 11000)
            if is_in_amd_ci():
                self.assertLess(res["median_ttft_ms"], 115)
            else:
                self.assertLess(res["median_ttft_ms"], 86)
            self.assertLess(res["median_itl_ms"], 10)

    def test_vlm_offline_throughput(self):
        res = run_bench_serving(
            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
            num_prompts=200,
            request_rate=float("inf"),
            other_server_args=[
                "--mem-fraction-static",
                "0.7",
            ],
            dataset_name="mmmu",
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_vlm_offline_throughput\n"
                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
            if is_in_amd_ci():
                self.assertGreater(res["output_throughput"], 2000)
                # TODO: not set yet, need AMD machine
            else:
                self.assertGreater(res["output_throughput"], 2500)

    def test_vlm_online_latency(self):
        res = run_bench_serving(
            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
            num_prompts=250,
            request_rate=1,
            other_server_args=[
                "--mem-fraction-static",
                "0.7",
            ],
            dataset_name="mmmu",
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_vlm_online_latency\n"
                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
            )
            self.assertLess(res["median_e2e_latency_ms"], 16500)
            if is_in_amd_ci():
                self.assertLess(res["median_ttft_ms"], 150)
                # TODO: not set yet, need AMD machine
            else:
                self.assertLess(res["median_ttft_ms"], 100)
            self.assertLess(res["median_itl_ms"], 8)

    def test_lora_online_latency(self):
        # TODO (lifuhuang): verify LoRA support in AMD.
        if is_in_amd_ci():
            return

        res = self._run_lora_latency_test(enable_background_task=False)

        if is_in_ci():
            write_github_step_summary(
                f"### test_lora_online_latency\n"
                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
            )
            self.assertLess(res["median_e2e_latency_ms"], 2400)
            self.assertLess(res["median_ttft_ms"], 58)

    def test_lora_online_latency_with_concurrent_adapter_updates(self):
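        # Same workload as test_lora_online_latency, but with a background task that
        # repeatedly loads and unloads LoRA adapters while requests are in flight,
        # so the latency thresholds below are intentionally looser.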
        # TODO (lifuhuang): verify LoRA support in AMD.
        if is_in_amd_ci():
            return

        res = self._run_lora_latency_test(enable_background_task=True)

        if is_in_ci():
            write_github_step_summary(
                f"### test_lora_online_latency_with_concurrent_adapter_updates\n"
                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
                f"median_ttft_ms: {res['median_ttft_ms']:.2f} ms\n"
            )
            self.assertLess(res["median_e2e_latency_ms"], 4000)
            self.assertLess(res["median_ttft_ms"], 80)

    def _run_lora_latency_test(self, enable_background_task: bool):
        """
        Run a latency test for LoRA with the specified background task setting.
        """

        async def lora_loader_unloader_task(
            base_url: str,
            start_event: asyncio.Event,
            stop_event: asyncio.Event,
        ):
            """
            A background task that repeatedly loads and unloads a LoRA adapter.
            """
            await start_event.wait()

            path_cycler = itertools.cycle(
                [
                    "pbevan11/llama-3.1-8b-ocr-correction",
                    "faridlazuarda/valadapt-llama-3.1-8B-it-chinese",
                    "philschmid/code-llama-3-1-8b-text-to-sql-lora",
                ]
            )
            load_url = f"{base_url}/load_lora_adapter"
            unload_url = f"{base_url}/unload_lora_adapter"
            num_updates = 0

            while not stop_event.is_set():
                # 1. Load the LoRA adapter
                lora_path = next(path_cycler)
                response = await asyncio.to_thread(
                    requests.post,
                    load_url,
                    json={"lora_name": lora_path, "lora_path": lora_path},
                )
                self.assertTrue(
                    response.ok, f"Failed to load LoRA adapter: {response.text}"
                )
                num_updates += 1

                if stop_event.is_set():
                    break

                # Yield control to allow other tasks to run.
                await asyncio.sleep(1)

                # 2. Unload the LoRA adapter
                response = await asyncio.to_thread(
                    requests.post,
                    unload_url,
                    json={"lora_name": lora_path},
                )
                self.assertTrue(
                    response.ok, f"Failed to unload LoRA adapter: {response.text}"
                )
                num_updates += 1

                # Yield control to allow other tasks to run.
                await asyncio.sleep(1)

        background_task = lora_loader_unloader_task if enable_background_task else None
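        # When the background task is enabled, the loader/unloader coroutine runs
        # concurrently with the benchmark traffic, so the measured latency includes
        # the impact of in-flight adapter load/unload operations.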
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=400,
            request_rate=8,
            other_server_args=[
                "--enable-lora",
                "--max-loras-per-batch",
                "1",
                "--disable-radix-cache",
                "--random-seed",
                "42",
                "--mem-fraction-static",
                "0.8",
                "--lora-paths",
                "Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16",
                "--max-lora-rank",
                "256",
            ],
            dataset_name="random",
            random_input_len=256,
            random_output_len=256,
            lora_name=["Nutanix/Meta-Llama-3.1-8B-Instruct_lora_4_alpha_16"],
            background_task=background_task,
        )

        return res

    def test_online_latency_eagle(self):
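        # EAGLE speculative decoding: the draft model proposes tokens that the target
        # model verifies; "accept_length" asserted below is, roughly, the benchmark's
        # reported average number of tokens accepted per decoding step.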
        res = run_bench_serving(
            model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
            num_prompts=300,
            request_rate=8,
            sharegpt_context_len=3072,
            disable_ignore_eos=True,
            dataset_name="sharegpt",
            other_server_args=[
                "--speculative-algorithm",
                "EAGLE",
                "--speculative-draft-model-path",
                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
                "--speculative-num-steps",
                "5",
                "--speculative-eagle-topk",
                "4",
                "--speculative-num-draft-tokens",
                "16",
                "--mem-fraction-static",
                "0.7",
            ],
            need_warmup=True,
            seed=42,
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_online_latency_eagle\n"
                f"median_e2e_latency_ms: {res['median_e2e_latency_ms']:.2f} ms\n"
                f"accept_length: {res['accept_length']:.2f} \n"
            )
            if is_in_amd_ci():
                self.assertLess(res["median_e2e_latency_ms"], 1800)
            else:
                self.assertLess(res["median_e2e_latency_ms"], 900)
            self.assertGreater(res["accept_length"], 3.0)

    def test_moe_offline_throughput_default(self):
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=300,
            request_rate=float("inf"),
            other_server_args=["--tp", "2"],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_moe_offline_throughput_default\n"
                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
            if is_in_amd_ci():
                self.assertGreater(res["output_throughput"], 2100)
            else:
                self.assertGreater(res["output_throughput"], 2200)

    def test_moe_offline_throughput_without_radix_cache(self):
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=300,
            request_rate=float("inf"),
            other_server_args=["--tp", "2", "--disable-radix-cache"],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_moe_offline_throughput_without_radix_cache\n"
                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
            if is_in_amd_ci():
                self.assertGreater(res["output_throughput"], 2100)
            else:
                self.assertGreater(res["output_throughput"], 2200)

    def test_pp_offline_throughput_default_decode(self):
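        # Decode-heavy workload: 1 input token and 1024 output tokens per request,
        # so the throughput assertion mainly exercises pipeline-parallel (pp=2) decode.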
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=1000,
            request_rate=float("inf"),
            random_input_len=1,
            random_output_len=1024,
            other_server_args=["--pp", "2"],
            need_warmup=True,
            seed=42,
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_pp_offline_throughput_default_decode\n"
                f"Output throughput: {res['output_throughput']:.2f} token/s\n"
            )
            self.assertGreater(res["output_throughput"], 6700)

    def test_pp_long_context_prefill(self):
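        # Prefill-heavy workload: 4 prompts of ~128k input tokens and 1 output token,
        # measuring long-context prefill input throughput with pp=2 and fp8 quantization.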
        res = run_bench_serving(
            model="meta-llama/Llama-3.3-70B-Instruct",
            num_prompts=4,
            request_rate=float("inf"),
            random_input_len=128000,
            random_output_len=1,
            dataset_name="random",
            other_server_args=[
                "--quantization",
                "fp8",
                "--pp",
                2,
            ],
            need_warmup=False,
            seed=42,
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_pp_long_context_prefill\n"
                f"input_throughput: {res['input_throughput']:.2f} token/s\n"
            )
            self.assertGreater(res["input_throughput"], 4000)


if __name__ == "__main__":
    unittest.main()