Unverified Commit 2b71531a authored by Kangyan-Zhou, committed by GitHub

Allow benchmarking tool to handle empty response (#12174)


Co-authored-by: Claude <noreply@anthropic.com>
parent 25c50498
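The hunks below all apply the same guard. For context, the crash being fixed happens because the OpenAI Python client types `message.content` as `Optional[str]`: a sampler can hand back `None` for an empty generation, and any downstream string operation then raises. A minimal sketch of the failure mode and the `or ""` coalescing used throughout this commit (the values here are illustrative, not taken from the commit):

```python
# What the sampler could return before this commit: the OpenAI client's
# ChatCompletionMessage.content is Optional[str], so None is a legal value.
response_text = None  # stand-in for sampler(prompt_messages) on an empty generation

# Before the fix, any string method on the result crashes:
#   response_text.lower()  ->  AttributeError: 'NoneType' object has no attribute 'lower'

# The fix coalesces None to "" at each boundary:
response_text = response_text or ""
print(response_text.lower().strip() == "yes")  # False -- scored as a miss, no crash
```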
@@ -148,7 +148,7 @@ class ChatCompletionSampler(SamplerBase):
                     reasoning_effort=self.reasoning_effort,
                     extra_body=self.extra_body,
                 )
-                return response.choices[0].message.content
+                return response.choices[0].message.content or ""
             # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are rerunning MMMU
             except openai.BadRequestError as e:
                 print("Bad Request Error", e)
@@ -161,7 +161,9 @@ class ChatCompletionSampler(SamplerBase):
                 )
                 time.sleep(exception_backoff)
                 trial += 1
-        # unknown error shall throw exception
+        # If all retries are exhausted, return empty string instead of None
+        print(f"All retry attempts exhausted for request. Returning empty response.")
+        return ""


 QUERY_TEMPLATE_MULTICHOICE = """
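The hunk above replaces the comment on the fall-through path after the retry loop with an explicit empty-string return, so exhausted retries no longer leak `None` (or an exception) into the scoring code. A hedged sketch of that overall shape; the loop bound and the `send_request` callable are illustrative, since the diff only shows the loop tail:

```python
import time

MAX_RETRIES = 5  # illustrative bound; the actual limit lives outside this hunk

def call_with_retries(send_request) -> str:
    trial = 0
    while trial < MAX_RETRIES:
        try:
            return send_request() or ""  # first hunk: coalesce None to ""
        except Exception as e:
            exception_backoff = 2**trial  # exponential back off, as in the sampler
            print(f"Retry {trial} after {exception_backoff} sec", e)
            time.sleep(exception_backoff)
            trial += 1
    # If all retries are exhausted, return empty string instead of None
    print("All retry attempts exhausted for request. Returning empty response.")
    return ""
```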
@@ -261,7 +263,7 @@ def format_multichoice_question(row):
 def check_equality(sampler: SamplerBase, expr1: str, expr2: str):
     prompt = EQUALITY_TEMPLATE % {"expression1": expr1, "expression2": expr2}
     response = sampler([dict(content=prompt, role="user")])
-    return response.lower().strip() == "yes"
+    return (response or "").lower().strip() == "yes"


 def _compute_stat(values: list, stat: str):
@@ -80,6 +80,7 @@ class HumanEval(Eval):
         instruction = "Read the following function signature and docstring, and fully implement the function described. Your response should only contain the code for this function.\n"

         def find_code(completion):
+            completion = completion or ""
             pattern = re.compile(r"```python\n(.*?)```", re.DOTALL)
             matches = pattern.findall(completion)
             extracted_answer = matches[0] if len(matches) >= 1 else completion
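The `find_code` guard in the hunk above keeps `re.findall` from receiving `None`, which would raise `TypeError: expected string or bytes-like object`. A self-contained sketch of the guarded helper and its edge cases:

```python
import re

def find_code(completion):
    completion = completion or ""  # guard: the sampler may return None or ""
    pattern = re.compile(r"```python\n(.*?)```", re.DOTALL)
    matches = pattern.findall(completion)
    return matches[0] if len(matches) >= 1 else completion

assert find_code(None) == ""          # previously: TypeError inside re.findall
assert find_code("x = 1") == "x = 1"  # no fenced block: falls back to the raw completion
```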
@@ -54,6 +54,7 @@ class MathEval(Eval):
                 sampler._pack_message(content=QUERY_TEMPLATE.format(**row), role="user")
             ]
             response_text = sampler(prompt_messages)
+            response_text = response_text or ""
             match = re.search(ANSWER_PATTERN, response_text)
             extracted_answer = match.group(1) if match else None
             score = float(
@@ -101,6 +101,7 @@ class MMLUEval(Eval):
                 )
             ]
             response_text = sampler(prompt_messages)
+            response_text = response_text or ""
             match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
             extracted_answer = match.group(1) if match else None
             score = 1.0 if extracted_answer == row["Answer"] else 0.0
@@ -204,6 +204,7 @@ class MMMUVLMEval(Eval):
             # Sample
             response_text = sampler(prompt_messages)
+            response_text = response_text or ""

             # Parse and score
             gold = sample["answer"]
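Downstream of these guards, an empty string is safe because the answer-extraction regexes simply fail to match and the row is scored zero. A sketch of that flow; this `ANSWER_PATTERN_MULTICHOICE` is an assumed stand-in, not the pattern from the eval files:

```python
import re

# Assumed stand-in for the real pattern in the eval files.
ANSWER_PATTERN_MULTICHOICE = r"(?i)Answer\s*:\s*([ABCD])"

response_text = ""  # an empty response after the or-"" guard
match = re.search(ANSWER_PATTERN_MULTICHOICE, response_text)
extracted_answer = match.group(1) if match else None  # None -- no crash
score = 1.0 if extracted_answer == "B" else 0.0       # the sample scores 0.0
print(score)  # 0.0
```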