minor: update gsm8k threshold (#2125)

4f8c3aea · Yineng Zhang · GitHub · 2369e882 · 4f8c3aea · 4f8c3aea
Unverified commit 4f8c3aea, authored Nov 22, 2024 by Yineng Zhang; committed by GitHub on Nov 22, 2024.
3 changed files
--- a/.github/workflows/nightly-eval.yml
+++ b/.github/workflows/nightly-eval.yml
@@ -27,14 +27,14 @@ jobs:
          bash scripts/ci_install_dependency.sh
          pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"

-      - name: Test human eval
+      - name: Test gsm8k
        timeout-minutes: 120
        run: |
          cd test/srt
-          python3 test_nightly_human_eval.py
+          python3 test_nightly_gsm8k_eval.py

-      - name: Test gsm8k
+      - name: Test human eval
        timeout-minutes: 120
        run: |
          cd test/srt
-          python3 test_nightly_gsm8k_eval.py
+          python3 test_nightly_human_eval.py
--- a/python/sglang/test/test_utils.py
+++ b/python/sglang/test/test_utils.py
@@ -439,13 +439,17 @@ def popen_launch_server(
        process = subprocess.Popen(command, stdout=None, stderr=None, env=env)

    start_time = time.time()
+    with requests.Session() as session:
        while time.time() - start_time < timeout:
            try:
                headers = {
                    "Content-Type": "application/json; charset=utf-8",
                    "Authorization": f"Bearer {api_key}",
                }
-            response = requests.get(f"{base_url}/health_generate", headers=headers)
+                response = session.get(
+                    f"{base_url}/health_generate",
+                    headers=headers,
+                )
                if response.status_code == 200:
                    return process
            except requests.RequestException:

--- a/test/srt/test_nightly_gsm8k_eval.py
+++ b/test/srt/test_nightly_gsm8k_eval.py
 import json
 import os
+import subprocess
 import unittest
+import warnings
 from datetime import datetime
 from types import SimpleNamespace

@@ -18,23 +20,23 @@ from sglang.test.test_utils import (
 )

 MODEL_SCORE_THRESHOLDS = {
-    "meta-llama/Llama-3.1-8B-Instruct": 0.8316,
-    "mistralai/Mistral-7B-Instruct-v0.3": 0.5861,
-    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.8672,
-    "google/gemma-2-27b-it": 0.9227,
-    "meta-llama/Llama-3.1-70B-Instruct": 0.9623,
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.6415,
-    "Qwen/Qwen2-57B-A14B-Instruct": 0.8791,
-    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.8672,
-    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.5544,
-    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.8356,
-    "neuralmagic/gemma-2-2b-it-FP8": 0.6059,
-    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.9504,
-    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.6138,
-    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.9504,
-    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.8197,
-    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.8395,
-    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.8435,
+    "meta-llama/Llama-3.1-8B-Instruct": 0.83,
+    "mistralai/Mistral-7B-Instruct-v0.3": 0.58,
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84,
+    "google/gemma-2-27b-it": 0.92,
+    "meta-llama/Llama-3.1-70B-Instruct": 0.96,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
+    "Qwen/Qwen2-57B-A14B-Instruct": 0.87,
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84,
+    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
+    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83,
+    "neuralmagic/gemma-2-2b-it-FP8": 0.60,
+    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95,
+    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61,
+    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95,
+    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.84,
 }


@@ -65,6 +67,7 @@ def launch_server(base_url, model, is_fp8, is_tp2):
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_args,
+        return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL),
    )
    return process

@@ -132,6 +135,9 @@ class TestEvalAccuracyLarge(unittest.TestCase):
            kill_child_process(self.process.pid, include_self=True)

    def test_mgsm_en_all_models(self):
+        warnings.filterwarnings(
+            "ignore", category=ResourceWarning, message="unclosed.*socket"
+        )
        is_first = True
        all_results = []