import os
from concurrent.futures import ThreadPoolExecutor
from types import SimpleNamespace
from typing import Dict, List, Literal, Optional

from sglang.srt.utils import is_hip, kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    is_in_ci,
    popen_launch_server,
    write_github_step_summary,
)

_base_url = DEFAULT_URL_FOR_TEST
_is_hip = is_hip()


class BaseTestGptOss(CustomTestCase):
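    """Shared GPQA evaluation harness for the gpt-oss model family.

    Concrete test classes pick a model variant ("20b" or "120b") and a
    quantization ("mxfp4" for the openai/ checkpoints, "bf16" for the lmsys/
    ones), then assert a minimum GPQA score per reasoning effort via
    `run_test`.
    """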
    def run_test(
        self,
        model_variant: Literal["20b", "120b"],
        quantization: Literal["mxfp4", "bf16"],
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: Optional[List[str]] = None,
    ):
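        """Launch the selected gpt-oss model and check its GPQA scores.

        `expected_score_of_reasoning_effort` maps a reasoning effort (e.g.
        "low", "medium", "high") to the minimum acceptable GPQA score; one
        eval is run per entry.
        """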
        if other_args is None:
            other_args = []

        model = {
            ("20b", "bf16"): "lmsys/gpt-oss-20b-bf16",
            ("120b", "bf16"): "lmsys/gpt-oss-120b-bf16",
            ("20b", "mxfp4"): "openai/gpt-oss-20b",
            ("120b", "mxfp4"): "openai/gpt-oss-120b",
        }[(model_variant, quantization)]

        if model_variant == "20b":
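            # Raise the CUDA graph capture limit for the smaller 20B model
            # (the flag caps the largest batch size captured as a CUDA graph).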
            other_args += ["--cuda-graph-max-bs", "600"]
        if _is_hip:
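            # On AMD GPUs, disable the AITER kernel path for this test.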
            os.environ["SGLANG_USE_AITER"] = "0"
        self._run_test_raw(
            model=model,
            expected_score_of_reasoning_effort=expected_score_of_reasoning_effort,
            other_args=other_args,
        )

    def _run_test_raw(
        self,
        model: str,
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: List[str],
    ):
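        """Launch the server once and run every reasoning-effort eval against it."""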
        process = popen_launch_server(
            model,
            _base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

        try:
            # Run the evals in parallel: total time is mostly bound by the longest
            # generated sequence rather than the number of questions.
            with ThreadPoolExecutor(max_workers=4) as executor:
                list(
                    executor.map(
                        lambda d: self._run_one_eval(**d),
                        [
                            dict(
                                model=model,
                                reasoning_effort=reasoning_effort,
                                expected_score=expected_score,
                            )
                            for reasoning_effort, expected_score in expected_score_of_reasoning_effort.items()
                        ],
                    )
                )
        finally:
            kill_process_tree(process.pid)

    def _run_one_eval(self, model, reasoning_effort, expected_score):
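        """Run a single GPQA eval at `reasoning_effort` and assert the score."""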
        args = SimpleNamespace(
            base_url=_base_url,
            model=model,
            eval_name="gpqa",
            num_examples=198,
            # one thread per example so all questions run in parallel
            num_threads=198,
            # TODO: 4k tokens is still not enough and many questions go unanswered;
            # we would need e.g. 64k tokens, but that is very slow.
            max_tokens=4096,
            # simple-evals defaults to temperature 0.5, which works better than 0.0,
            # but we use 0.1 here for reproducibility.
            temperature=0.1,
            reasoning_effort=reasoning_effort,
        )

        setup = f"model={model} reasoning_effort={reasoning_effort} expected_score={expected_score}"

        print(f"Evaluation start: {setup}")
        metrics = run_eval(args)
        print(f"Evaluation end: {setup} {metrics=}")
        self.assertGreaterEqual(metrics["score"], expected_score)

        if is_in_ci():
            write_github_step_summary(
                f"### test_gpt_oss_common\n"
                f"Setup: {setup}\n"
                f"Score: {metrics['score']:.2f}\n"
            )
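

# Illustrative usage only (hypothetical subclass and thresholds, not part of
# this module): a concrete test file would subclass BaseTestGptOss like this.
#
# class TestGptOss20BMxfp4(BaseTestGptOss):
#     def test_gpqa(self):
#         self.run_test(
#             model_variant="20b",
#             quantization="mxfp4",
#             expected_score_of_reasoning_effort={"low": 0.3, "high": 0.5},
#         )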