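"""Benchmark serving tests for sglang.

Each test launches a server through run_bench_serving with a specific model and
set of server arguments, then asserts a throughput or latency threshold. When
running in CI, the measured numbers are also appended to the GitHub step summary
via write_github_step_summary.
"""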
import unittest

from sglang.test.test_utils import (
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
    DEFAULT_FP8_MODEL_NAME_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    is_in_ci,
    run_bench_serving,
    write_github_step_summary,
)


class TestBenchServing(unittest.TestCase):

    def test_offline_throughput_default(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=[],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_offline_throughput_default\n"
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            self.assertGreater(res["output_throughput"], 3350)

    def test_offline_throughput_non_stream_small_batch_size(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=200,
            request_rate=float("inf"),
            other_server_args=["--max-running-requests", "10"],
            dataset_name="sharegpt",
            random_input_len=None,
            random_output_len=None,
            disable_stream=True,
            need_warmup=True,
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_offline_throughput_non_stream_small_batch_size\n"
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            # There is a regression with torch 2.5
            # This number was 950 for torch 2.4
            self.assertGreater(res["output_throughput"], 850)

    def test_offline_throughput_without_radix_cache(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=["--disable-radix-cache"],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_offline_throughput_without_radix_cache\n"
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            self.assertGreater(res["output_throughput"], 3350)

    def test_offline_throughput_without_chunked_prefill(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=["--chunked-prefill-size", "-1"],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_offline_throughput_without_chunked_prefill\n"
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            self.assertGreater(res["output_throughput"], 2600)

    def test_offline_throughput_with_triton_attention_backend(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=[
                "--attention-backend",
                "triton",
                "--context-length",
                "8192",
            ],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_offline_throughput_with_triton_attention_backend\n"
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            self.assertGreater(res["output_throughput"], 3450)

    def test_offline_throughput_default_fp8(self):
        res = run_bench_serving(
            model=DEFAULT_FP8_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=[],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_offline_throughput_default_fp8\n"
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            self.assertGreater(res["output_throughput"], 3850)

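    # Online latency test: requests arrive at a fixed rate (1 req/s) rather than
    # all at once, and the assertions check median end-to-end latency,
    # time-to-first-token, and inter-token latency.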
    def test_online_latency_default(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=100,
            request_rate=1,
            other_server_args=[],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_online_latency_default\n"
                f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
            )
            self.assertLess(res["median_e2e_latency_ms"], 12000)
            self.assertLess(res["median_ttft_ms"], 86)
            self.assertLess(res["median_itl_ms"], 10)

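    # EAGLE speculative decoding: a separate draft model proposes tokens that the
    # target model then verifies. The flags below configure (roughly) the number
    # of draft steps, the per-step top-k, and the draft-token budget; exact
    # semantics depend on the sglang server implementation.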
    def test_online_latency_eagle(self):
        res = run_bench_serving(
            model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
            num_prompts=50,
            request_rate=1,
            disable_ignore_eos=True,
            dataset_name="sharegpt",
            other_server_args=[
                "--speculative-algorithm",
                "EAGLE",
                "--speculative-draft-model-path",
                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
                "--speculative-num-steps",
                "5",
                "--speculative-eagle-topk",
                "8",
                "--speculative-num-draft-tokens",
                "64",
                "--mem-fraction-static",
                "0.7",
            ],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_online_latency_eagle\n"
                f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
            )
            self.assertLess(res["median_e2e_latency_ms"], 10000)

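    # MoE model tests run the server with tensor parallelism of size 2 ("--tp", "2").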
    def test_moe_offline_throughput_default(self):
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=300,
            request_rate=float("inf"),
            other_server_args=["--tp", "2"],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_moe_offline_throughput_default\n"
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            self.assertGreater(res["output_throughput"], 2150)

    def test_moe_offline_throughput_without_radix_cache(self):
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=300,
            request_rate=float("inf"),
            other_server_args=["--tp", "2", "--disable-radix-cache"],
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_moe_offline_throughput_without_radix_cache\n"
                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
            )
            self.assertGreater(res["output_throughput"], 2150)


if __name__ == "__main__":
    unittest.main()