# test_nightly_text_models_perf.py

import json
import os
import subprocess
import time
import unittest

from sglang.bench_one_batch_server import BenchmarkResult
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    _parse_int_list_env,
    is_in_ci,
    parse_models,
    popen_launch_server,
    write_github_step_summary,
)

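# Directory where profiler traces from the benchmark runs are collected and
# later referenced when building the markdown report.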
PROFILE_DIR = "performance_profiles_text_models"


class TestNightlyTextModelsPerformance(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
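        # Each group is (models, is_fp8, is_tp2); is_tp2 launches the server with
        # "--tp 2", while is_fp8 is currently unused in the benchmark loop.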
        cls.model_groups = [
            (parse_models("meta-llama/Llama-3.1-8B-Instruct"), False, False),
            (parse_models("Qwen/Qwen2-57B-A14B-Instruct"), False, True),
            # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
            # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
            # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
            # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
        ]
        cls.base_url = DEFAULT_URL_FOR_TEST
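        # Benchmark sweep parameters. The repeated batch size 1 is presumably a
        # warm-up run (assumption). Input/output lengths can be overridden through
        # the NIGHTLY_INPUT_LENS / NIGHTLY_OUTPUT_LENS environment variables.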
        cls.batch_sizes = [1, 1, 8, 16, 64]
        cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
        cls.output_lens = tuple(_parse_int_list_env("NIGHTLY_OUTPUT_LENS", "512"))
        os.makedirs(PROFILE_DIR, exist_ok=True)
        cls.full_report = f"## {cls.__name__}\n" + BenchmarkResult.help_str()

    def test_bench_one_batch(self):
        all_benchmark_results = []

        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                benchmark_results = []
                with self.subTest(model=model):
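                    # Launch a dedicated server for this model; TP=2 groups add "--tp 2".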
                    process = popen_launch_server(
                        model=model,
                        base_url=self.base_url,
                        other_args=["--tp", "2"] if is_tp2 else [],
                        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
                    )
                    try:
                        profile_filename = (
                            f"{model.replace('/', '_')}_{int(time.time())}"
                        )
                        profile_path_prefix = os.path.join(
                            PROFILE_DIR, profile_filename
                        )
                        json_output_file = (
                            f"results_{model.replace('/', '_')}_{int(time.time())}.json"
                        )

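                        # Run the benchmark client as a subprocess against the
                        # already-running server; it profiles each run and writes
                        # its results to a JSON file that is deserialized below.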
                        command = [
                            "python3",
                            "-m",
                            "sglang.bench_one_batch_server",
                            "--model",
                            model,
                            "--base-url",
                            self.base_url,
                            "--batch-size",
                            *[str(x) for x in self.batch_sizes],
                            "--input-len",
                            *[str(x) for x in self.input_lens],
                            "--output-len",
                            *[str(x) for x in self.output_lens],
                            "--show-report",
                            "--profile",
                            "--profile-by-stage",
                            "--profile-filename-prefix",
                            profile_path_prefix,
                            f"--output-path={json_output_file}",
                            "--no-append-to-github-summary",
                        ]

                        print(f"Running command: {' '.join(command)}")
                        result = subprocess.run(command, capture_output=True, text=True)

                        if result.returncode != 0:
                            print(f"Error running benchmark for {model}:")
                            print(result.stderr)
                            # Continue with the next model even if this one fails
                            continue

                        # Load and deserialize JSON results
                        if os.path.exists(json_output_file):
                            with open(json_output_file, "r") as f:
                                json_data = json.load(f)

                            # Convert JSON data to BenchmarkResult objects
                            for data in json_data:
                                benchmark_result = BenchmarkResult(**data)
                                all_benchmark_results.append(benchmark_result)
                                benchmark_results.append(benchmark_result)

                            print(
                                f"Loaded {len(benchmark_results)} benchmark results from {json_output_file}"
                            )

                            # Clean up JSON file
                            os.remove(json_output_file)
                        else:
                            print(
                                f"Warning: JSON output file {json_output_file} not found"
                            )

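                    # Always tear down the server process tree, even if the benchmark failed.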
                    finally:
                        kill_process_tree(process.pid)

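                    # Append this model's markdown section, built from the collected
                    # results and the profiler output directory.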
                    report_part = BenchmarkResult.generate_markdown_report(
                        PROFILE_DIR, benchmark_results
                    )
                    self.full_report += report_part + "\n"

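        # In CI, publish the accumulated report to the GitHub Actions step summary.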
        if is_in_ci():
            write_github_step_summary(self.full_report)


if __name__ == "__main__":
    unittest.main()