Unverified Commit 442534aa authored by fzyzcjy, committed by GitHub

Add CI for gpt-oss model on hopper (#8851)

parent de8b8b6e
@@ -65,9 +65,10 @@ def run_eval(args):
     sampler = ChatCompletionSampler(
         model=args.model,
-        max_tokens=2048,
+        max_tokens=getattr(args, "max_tokens", 2048),
         base_url=base_url,
         temperature=getattr(args, "temperature", 0.0),
+        reasoning_effort=getattr(args, "reasoning_effort", None),
     )

     # Run eval
@@ -120,7 +121,9 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
+    parser.add_argument("--max-tokens", type=int, default=2048)
     parser.add_argument("--temperature", type=float, default=0.0)
+    parser.add_argument("--reasoning-effort", type=str)
     args = parser.parse_args()

     run_eval(args)
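For context, with these flags in place the entry point can also be driven programmatically, which is what the new tests further down do. A minimal sketch, where the server address, model name, and example counts are illustrative assumptions and the attribute names simply mirror the CLI flags above:

from types import SimpleNamespace

from sglang.test.run_eval import run_eval

# Hypothetical driver; assumes a server is already running at base_url and
# serving the named model.
args = SimpleNamespace(
    base_url="http://127.0.0.1:30000",   # assumed server address
    model="openai/gpt-oss-20b",          # illustrative model
    eval_name="gpqa",
    num_examples=32,
    num_threads=32,
    max_tokens=4096,         # picked up via getattr(args, "max_tokens", 2048)
    temperature=0.1,
    reasoning_effort="low",  # forwarded to ChatCompletionSampler; may be None
)
metrics = run_eval(args)
print(metrics["score"])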
@@ -91,6 +91,7 @@ class ChatCompletionSampler(SamplerBase):
         model: Optional[str] = None,
         system_message: Optional[str] = None,
         temperature: float = 0.0,
+        reasoning_effort: Optional[str] = None,
         max_tokens: int = 2048,
     ):
         self.client = OpenAI(base_url=base_url, http_client=LargerHttpxClient())
@@ -102,7 +103,11 @@ class ChatCompletionSampler(SamplerBase):
         self.system_message = system_message
         self.temperature = temperature
         self.max_tokens = max_tokens
+        self.reasoning_effort = reasoning_effort
         self.image_format = "url"
+        print(
+            f"ChatCompletionSampler initialized with {self.system_message=} {self.temperature=} {self.max_tokens=} {self.reasoning_effort=}"
+        )

     def _handle_image(
         self,
@@ -138,6 +143,7 @@ class ChatCompletionSampler(SamplerBase):
                     messages=message_list,
                     temperature=self.temperature,
                     max_tokens=self.max_tokens,
+                    reasoning_effort=self.reasoning_effort,
                 )
                 return response.choices[0].message.content
             # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
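A quick sketch of exercising the new reasoning_effort knob directly through the sampler; the import path, endpoint URL, and model name are assumptions, and an OpenAI-compatible server must already be running:

from sglang.test.simple_eval_common import ChatCompletionSampler  # assumed module path

# reasoning_effort is stored on the sampler and passed verbatim to
# client.chat.completions.create on every request.
sampler = ChatCompletionSampler(
    model="openai/gpt-oss-20b",            # illustrative model
    base_url="http://127.0.0.1:30000/v1",  # assumed OpenAI-compatible endpoint
    temperature=0.1,
    max_tokens=4096,
    reasoning_effort="low",                # or "medium" / "high" / None
)
print(sampler([{"role": "user", "content": "What is 2 + 2?"}]))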
@@ -71,6 +71,8 @@ class GPQAEval(Eval):
                 )
             ]
             response_text = sampler(prompt_messages)
+            if response_text is None:
+                response_text = ""
             match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
             extracted_answer = match.group(1) if match else None
             score = 1.0 if extracted_answer == correct_answer else 0.0
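The new guard in GPQAEval covers samplers that return no message content (for example when the completion is cut off), which would otherwise make re.search raise a TypeError. A tiny illustration, using an assumed stand-in for ANSWER_PATTERN_MULTICHOICE:

import re

ANSWER_PATTERN_MULTICHOICE = r"(?i)Answer\s*:\s*([A-D])"  # assumed stand-in pattern

response_text = None  # what the sampler can return when content is empty
# re.search(pattern, None) raises TypeError, so the eval normalizes it to ""
if response_text is None:
    response_text = ""
match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
extracted_answer = match.group(1) if match else None
print(extracted_answer)  # None -> the question is simply scored 0.0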
@@ -63,6 +63,7 @@ suites = {
         TestFile("test_fp8_kernel.py", 8),
         TestFile("test_function_call_parser.py", 10),
         TestFile("test_fused_moe.py", 30),
+        TestFile("test_gpt_oss_1gpu.py", 600),
         TestFile("test_hicache.py", 116),
         TestFile("test_hicache_mla.py", 127),
         TestFile("test_hicache_storage.py", 127),
@@ -104,7 +105,7 @@ suites = {
         TestFile("test_utils_update_weights.py", 48),
         TestFile("test_vision_chunked_prefill.py", 175),
         TestFile("test_vlm_input_format.py", 300),
-        TestFile("test_vision_openai_server_a.py", 584),
+        TestFile("test_vision_openai_server_a.py", 989),
         TestFile("test_vision_openai_server_b.py", 620),
         TestFile("test_w8a8_quantization.py", 46),
         TestFile("test_reasoning_parser.py", 5),
@@ -176,6 +177,7 @@ suites = {
         TestFile("test_update_weights_from_distributed.py", 103),
     ],
     "per-commit-4-gpu": [
+        TestFile("test_gpt_oss_4gpu.py", 600),
         TestFile("test_local_attn.py", 250),
         TestFile("test_pp_single_node.py", 372),
         TestFile("test_multi_instance_release_memory_occupation.py", 64),
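These entries register the new gpt-oss tests in the per-commit CI suites (the second TestFile argument appears to be the estimated runtime in seconds). A sketch of running just the 1-GPU file locally with plain unittest, assuming the current directory contains these test files and a Hopper GPU plus the model weights are available:

import unittest

# Hypothetical local runner; CI itself goes through the suite runner instead.
loader = unittest.TestLoader()
suite = loader.loadTestsFromName("test_gpt_oss_1gpu")
# suite.addTests(loader.loadTestsFromName("test_gpt_oss_4gpu"))  # needs 4 GPUs
unittest.TextTestRunner(verbosity=2).run(suite)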
New file test_gpt_oss_1gpu.py:

import unittest

from test_gpt_oss_common import BaseTestGptOss


class TestGptOss1Gpu(BaseTestGptOss):
    def test_mxfp4_20b(self):
        self.run_test(
            model_variant="20b",
            quantization="mxfp4",
            expected_score_of_reasoning_effort={
                "low": 0.38,
                "medium": 0.38,
                "high": 0.29,  # TODO: investigate why high effort scores lower here
            },
        )

    def test_bf16_20b(self):
        self.run_test(
            model_variant="20b",
            quantization="bf16",
            expected_score_of_reasoning_effort={
                "low": 0.38,
                "medium": 0.38,
                "high": 0.29,  # TODO: investigate why high effort scores lower here
            },
        )


if __name__ == "__main__":
    unittest.main()
New file test_gpt_oss_4gpu.py:

import unittest

from test_gpt_oss_common import BaseTestGptOss


class TestGptOss4Gpu(BaseTestGptOss):
    def test_bf16_120b(self):
        self.run_test(
            model_variant="120b",
            quantization="bf16",
            expected_score_of_reasoning_effort={
                "low": 0.61,
                # removed to speed up CI
                # "medium": 0.61,
                # "high": 0.61,
            },
            other_args=["--tp", "4", "--cuda-graph-max-bs", "200"],
        )

    def test_mxfp4_120b(self):
        self.run_test(
            model_variant="120b",
            quantization="mxfp4",
            expected_score_of_reasoning_effort={
                "low": 0.61,
                # removed to speed up CI
                # "medium": 0.61,
                # "high": 0.61,
            },
            other_args=[
                "--tp",
                "4",
                "--cuda-graph-max-bs",
                "200",
                "--mem-fraction-static",
                "0.93",
            ],
        )


if __name__ == "__main__":
    unittest.main()
New file test_gpt_oss_common.py:

from concurrent.futures import ThreadPoolExecutor
from types import SimpleNamespace
from typing import Dict, List, Literal, Optional

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    popen_launch_server,
)

_base_url = DEFAULT_URL_FOR_TEST


class BaseTestGptOss(CustomTestCase):
    def run_test(
        self,
        model_variant: Literal["20b", "120b"],
        quantization: Literal["mxfp4", "bf16"],
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: Optional[List[str]] = None,
    ):
        if other_args is None:
            other_args = []

        model = {
            ("20b", "bf16"): "lmsys/gpt-oss-20b-bf16",
            ("120b", "bf16"): "lmsys/gpt-oss-120b-bf16",
            ("20b", "mxfp4"): "openai/gpt-oss-20b",
            ("120b", "mxfp4"): "openai/gpt-oss-120b",
        }[(model_variant, quantization)]

        if model_variant == "20b":
            other_args += ["--cuda-graph-max-bs", "600"]

        self._run_test_raw(
            model=model,
            expected_score_of_reasoning_effort=expected_score_of_reasoning_effort,
            other_args=other_args,
        )

    def _run_test_raw(
        self,
        model: str,
        expected_score_of_reasoning_effort: Dict[str, float],
        other_args: List[str],
    ):
        process = popen_launch_server(
            model,
            _base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

        try:
            # Run the evals for all reasoning efforts in parallel: total time is
            # dominated by the longest generated sequence, not the number of questions.
            with ThreadPoolExecutor(max_workers=4) as executor:
                list(
                    executor.map(
                        lambda d: self._run_one_eval(**d),
                        [
                            dict(
                                model=model,
                                reasoning_effort=reasoning_effort,
                                expected_score=expected_score,
                            )
                            for reasoning_effort, expected_score in expected_score_of_reasoning_effort.items()
                        ],
                    )
                )
        finally:
            kill_process_tree(process.pid)

    def _run_one_eval(self, model, reasoning_effort, expected_score):
        args = SimpleNamespace(
            base_url=_base_url,
            model=model,
            eval_name="gpqa",
            num_examples=198,
            # use enough threads to allow full parallelism across questions
            num_threads=198,
            # TODO: 4k tokens is still not enough (many questions go unanswered);
            # something like 64k would be ideal, but that is very slow.
            max_tokens=4096,
            # simple-evals defaults to temperature 0.5, which scores better than 0.0,
            # but we use 0.1 here for reproducibility.
            temperature=0.1,
            reasoning_effort=reasoning_effort,
        )

        print(f"Evaluation start: {model=} {reasoning_effort=} {expected_score=}")
        metrics = run_eval(args)
        print(
            f"Evaluation end: {model=} {reasoning_effort=} {expected_score=} {metrics=}"
        )

        self.assertGreaterEqual(metrics["score"], expected_score)
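The shared base class makes it straightforward to add further configurations. A sketch of a hypothetical extra test built on it; the class name and the 0.38 threshold are illustrative, not part of this commit:

import unittest

from test_gpt_oss_common import BaseTestGptOss


# Hypothetical example of a new configuration reusing the shared helper.
class TestGptOss1GpuLowOnly(BaseTestGptOss):
    def test_mxfp4_20b_low_only(self):
        self.run_test(
            model_variant="20b",
            quantization="mxfp4",
            # illustrative threshold; real thresholds come from measured runs
            expected_score_of_reasoning_effort={"low": 0.38},
        )


if __name__ == "__main__":
    unittest.main()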