"""Throughput tests for the SGLang server, driven by ``sglang.bench_serving``."""

import itertools
import os
import unittest
from types import SimpleNamespace

from sglang.bench_serving import run_benchmark
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server


class TestServingThroughput(unittest.TestCase):
    """Benchmark a freshly launched server under several option combinations.

    Each test starts a server for ``DEFAULT_MODEL_NAME_FOR_TEST`` with a given
    set of command-line flags, drives it with ``run_benchmark`` and checks that
    every prompt completed. Throughput floors are enforced only when the
    ``SGLANG_IS_IN_CI`` environment variable equals ``"true"``.
    """

    def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size):
        """Launch a server with the given options and benchmark it.

        Args:
            disable_radix_cache: When truthy, pass ``--disable-radix-cache``.
            disable_flashinfer: When truthy, pass ``--disable-flashinfer``.
            chunked_prefill_size: Value forwarded (as a string) to
                ``--chunked-prefill-size``.

        Returns:
            The result object returned by ``run_benchmark``.

        Raises:
            AssertionError: If the number of completed requests differs from
                the number of prompts that were sent.
        """
        # Launch the server
        other_args = []
        if disable_radix_cache:
            other_args.append("--disable-radix-cache")
        if disable_flashinfer:
            other_args.append("--disable-flashinfer")
        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])

        model = DEFAULT_MODEL_NAME_FOR_TEST
        base_url = "http://127.0.0.1:9157"
        process = popen_launch_server(
            model, base_url, timeout=300, other_args=other_args
        )

        # Run benchmark
        num_prompts = 400
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
            host=None,
            port=None,
            dataset_name="random",
            dataset_path="",
            model=None,
            tokenizer=None,
            num_prompts=num_prompts,
            sharegpt_output_len=None,
            random_input_len=4096,
            random_output_len=2048,
            random_range_ratio=0.0,
            request_rate=float("inf"),
            multi=None,
            seed=0,
            output_file=None,
            disable_tqdm=False,
            disable_stream=False,
            disable_ignore_eos=False,
            extra_request_body=None,
        )

        try:
            res = run_benchmark(args)
        finally:
            # Always tear the server down, even when the benchmark raises.
            kill_child_process(process.pid)

        self.assertEqual(res["completed"], num_prompts)
        return res

    def _check_ci_throughput(self, res, min_output_throughput):
        """Enforce an output-throughput floor, but only when running in CI."""
        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE) performance
            self.assertGreaterEqual(res["output_throughput"], min_output_throughput)

    def test_default(self):
        """Benchmark the server with the ``ServerArgs`` default options."""
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            disable_flashinfer=ServerArgs.disable_flashinfer,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        self._check_ci_throughput(res, 1400)

    def test_default_without_radix_cache(self):
        """Benchmark the defaults with the radix cache turned off."""
        res = self.run_test(
            disable_radix_cache=True,
            disable_flashinfer=ServerArgs.disable_flashinfer,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        self._check_ci_throughput(res, 1450)

    def test_default_with_chunked_prefill(self):
        """Benchmark the defaults with a chunked prefill size of 8192."""
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            disable_flashinfer=ServerArgs.disable_flashinfer,
            chunked_prefill_size=8192,
        )

        self._check_ci_throughput(res, 1400)

    def test_all_cases(self):
        """Run the benchmark for every combination of the three options.

        Only completion of all prompts is checked here (inside ``run_test``);
        no throughput floor is applied.
        """
        # product() yields combinations in the same order as three nested
        # loops would, with the right-most option varying fastest.
        option_grid = itertools.product([False, True], [False, True], [-1, 2048])
        for radix_off, flashinfer_off, prefill_size in option_grid:
            options = {
                "disable_radix_cache": radix_off,
                "disable_flashinfer": flashinfer_off,
                "chunked_prefill_size": prefill_size,
            }
            # subTest labels a failure with the exact combination that broke.
            with self.subTest(**options):
                self.run_test(**options)


# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()