"vscode:/vscode.git/clone" did not exist on "06029dd6edcc719e5831443179bcb206c6355093"
Unverified Commit d6777a70 authored by hlu1's avatar hlu1 Committed by GitHub
Browse files

Add --thinking-mode to run_eval (#11189)


Signed-off-by: default avatarHao Lu <14827759+hlu1@users.noreply.github.com>
parent 8c574902
...@@ -16,13 +16,29 @@ from sglang.test.simple_eval_common import ( ...@@ -16,13 +16,29 @@ from sglang.test.simple_eval_common import (
) )
def get_thinking_kwargs(args):
    """Build extra sampler kwargs that enable "thinking" output for supported models.

    Args:
        args: Parsed CLI namespace; only ``thinking_mode`` is read. The
            attribute may be absent or ``None``, in which case no thinking
            kwargs are produced.

    Returns:
        dict: ``{"chat_template_kwargs": {<param>: True}, "separate_reasoning": True}``
        when ``args.thinking_mode`` is one of ``THINKING_MODE_CHOICES``,
        otherwise an empty dict.
    """
    # Read defensively with getattr, matching how the rest of this file reads
    # optional args (e.g. getattr(args, "max_tokens", 2048)), so callers that
    # build their own args namespace without --thinking-mode still work.
    thinking_mode = getattr(args, "thinking_mode", None)
    if thinking_mode in THINKING_MODE_CHOICES:
        # DeepSeek-V3 chat templates key this flag as "thinking";
        # DeepSeek-R1 and Qwen3 use "enable_thinking".
        thinking_param = (
            "thinking" if thinking_mode == "deepseek-v3" else "enable_thinking"
        )
        return {
            "chat_template_kwargs": {thinking_param: True},
            "separate_reasoning": True,
        }
    return {}
def run_eval_once(args, base_url: str, eval_obj: Eval) -> dict: def run_eval_once(args, base_url: str, eval_obj: Eval) -> dict:
# Get thinking kwargs based on user's choice
thinking_kwargs = get_thinking_kwargs(args)
sampler = ChatCompletionSampler( sampler = ChatCompletionSampler(
model=args.model, model=args.model,
max_tokens=getattr(args, "max_tokens", 2048), max_tokens=getattr(args, "max_tokens", 2048),
base_url=base_url, base_url=base_url,
temperature=getattr(args, "temperature", 0.0), temperature=getattr(args, "temperature", 0.0),
reasoning_effort=getattr(args, "reasoning_effort", None), reasoning_effort=getattr(args, "reasoning_effort", None),
extra_body=thinking_kwargs,
) )
# Run eval # Run eval
...@@ -136,6 +152,8 @@ def run_eval(args): ...@@ -136,6 +152,8 @@ def run_eval(args):
return metrics return metrics
# Valid values for --thinking-mode; consumed by get_thinking_kwargs, which
# maps "deepseek-v3" to the "thinking" chat-template flag and the others
# to "enable_thinking".
THINKING_MODE_CHOICES = ["deepseek-r1", "deepseek-v3", "qwen3"]
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
...@@ -166,6 +184,13 @@ if __name__ == "__main__": ...@@ -166,6 +184,13 @@ if __name__ == "__main__":
parser.add_argument("--max-tokens", type=int, default=2048) parser.add_argument("--max-tokens", type=int, default=2048)
parser.add_argument("--temperature", type=float, default=0.0) parser.add_argument("--temperature", type=float, default=0.0)
parser.add_argument("--reasoning-effort", type=str) parser.add_argument("--reasoning-effort", type=str)
parser.add_argument(
"--thinking-mode",
default=None,
type=str,
choices=THINKING_MODE_CHOICES,
help="Enable thinking mode in Deepseek R1, V3.1/3.2, or Qwen3",
)
args = parser.parse_args() args = parser.parse_args()
run_eval(args) run_eval(args)
...@@ -93,6 +93,7 @@ class ChatCompletionSampler(SamplerBase): ...@@ -93,6 +93,7 @@ class ChatCompletionSampler(SamplerBase):
temperature: float = 0.0, temperature: float = 0.0,
reasoning_effort: Optional[str] = None, reasoning_effort: Optional[str] = None,
max_tokens: int = 2048, max_tokens: int = 2048,
extra_body: Optional[Dict[str, Any]] = None,
): ):
self.client = OpenAI(base_url=base_url, http_client=LargerHttpxClient()) self.client = OpenAI(base_url=base_url, http_client=LargerHttpxClient())
...@@ -104,9 +105,10 @@ class ChatCompletionSampler(SamplerBase): ...@@ -104,9 +105,10 @@ class ChatCompletionSampler(SamplerBase):
self.temperature = temperature self.temperature = temperature
self.max_tokens = max_tokens self.max_tokens = max_tokens
self.reasoning_effort = reasoning_effort self.reasoning_effort = reasoning_effort
self.extra_body = extra_body
self.image_format = "url" self.image_format = "url"
print( print(
f"ChatCompletionSampler initialized with {self.system_message=} {self.temperature=} {self.max_tokens=} {self.reasoning_effort=}" f"ChatCompletionSampler initialized with {self.system_message=} {self.temperature=} {self.max_tokens=} {self.reasoning_effort=} {self.extra_body=}"
) )
def _handle_image( def _handle_image(
...@@ -144,6 +146,7 @@ class ChatCompletionSampler(SamplerBase): ...@@ -144,6 +146,7 @@ class ChatCompletionSampler(SamplerBase):
temperature=self.temperature, temperature=self.temperature,
max_tokens=self.max_tokens, max_tokens=self.max_tokens,
reasoning_effort=self.reasoning_effort, reasoning_effort=self.reasoning_effort,
extra_body=self.extra_body,
) )
return response.choices[0].message.content return response.choices[0].message.content
# NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment