[CI] Tweaks to GPT-OSS Eval (Blackwell) for stability (#26030)

Signed-off-by: mgoin <mgoin64@gmail.com>

[CI] Tweaks to GPT-OSS Eval (Blackwell) for stability (#26030)
Signed-off-by: mgoin <mgoin64@gmail.com>
ee04c0cd · Michael Goin · GitHub · c36f0aa3
Unverified commit ee04c0cd, authored Oct 01, 2025 by Michael Goin; committed by GitHub on Oct 01, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 3 additions and 4 deletions

.buildkite/test-pipeline.yaml +1 -1

tests/evals/gpt_oss/test_gpqa_correctness.py +2 -3

No files found.
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -845,7 +845,7 @@ steps:
  - vllm/v1/attention/backends/flashinfer.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
-    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
 - label: Blackwell Quantized MoE Test
  timeout_in_minutes: 60

--- a/tests/evals/gpt_oss/test_gpqa_correctness.py
+++ b/tests/evals/gpt_oss/test_gpqa_correctness.py
@@ -26,7 +26,8 @@ def run_gpqa_eval(model_name: str, base_url: str) -> float:
    # Build the command to run the evaluation
    cmd = [
        sys.executable, "-m", "gpt_oss.evals", "--eval", "gpqa", "--model",
-        model_name, "--reasoning-effort", "low", "--base-url", base_url
+        model_name, "--reasoning-effort", "low", "--base-url", base_url,
+        "--n-threads", "200"
    ]
    try:
@@ -72,8 +73,6 @@ def test_gpqa_correctness(request):
    # Add standard server arguments
    server_args.extend([
-        "--max-model-len",
-        "32768",
        "--trust-remote-code",
    ])