Unverified commit b22f3f64, authored by Lianmin Zheng, committed by GitHub

Fix nightly accuracy tests (#2780)

parent 6fb57683
test_utils.py

@@ -36,7 +36,7 @@ DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
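These nightly-eval constants pack multiple model IDs into a single comma-separated string, so adding or dropping a model is a one-line change. A minimal sketch of how such a constant can be split back into individual names; the helper below is illustrative, not necessarily the utility the tests actually use:

```python
# Illustrative helper: split a comma-separated model constant into a list.
def parse_models(model_string: str) -> list[str]:
    return [name.strip() for name in model_string.split(",") if name.strip()]

tp2_models = parse_models(
    "meta-llama/Llama-3.1-70B-Instruct,"
    "mistralai/Mixtral-8x7B-Instruct-v0.1,"
    "Qwen/Qwen2-57B-A14B-Instruct"
)
# -> ['meta-llama/Llama-3.1-70B-Instruct',
#     'mistralai/Mixtral-8x7B-Instruct-v0.1',
#     'Qwen/Qwen2-57B-A14B-Instruct']
```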
run_suite.py

@@ -49,8 +49,7 @@ suites = {
     ],
     "nightly": [
         "test_nightly_gsm8k_eval.py",
-        "test_nightly_human_eval.py",
-        # Disable temporarly
+        # Disable temporarily
         # "test_nightly_math_eval.py",
     ],
     "sampling/penaltylib": glob.glob(
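The suites dict maps each suite name to the list of test files it runs. A hypothetical sketch of how a runner can consume such a dict; the flag name and the per-file subprocess strategy are assumptions for illustration, not run_suite.py's confirmed CLI:

```python
import argparse
import subprocess
import sys

# Trimmed-down stand-in for the suites dict above.
suites = {"nightly": ["test_nightly_gsm8k_eval.py"]}

parser = argparse.ArgumentParser()
parser.add_argument("--suite", choices=sorted(suites.keys()), required=True)
args = parser.parse_args()

for test_file in suites[args.suite]:
    # Run each test file in its own process so one crash cannot take
    # down the whole suite.
    ret = subprocess.run([sys.executable, test_file]).returncode
    if ret != 0:
        sys.exit(ret)
```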
test_nightly_gsm8k_eval.py

@@ -1,6 +1,5 @@
 import json
 import os
-import subprocess
 import unittest
 import warnings
 from datetime import datetime
@@ -16,24 +15,26 @@ from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
+    is_in_ci,
     popen_launch_server,
+    write_github_step_summary,
 )
 
 MODEL_SCORE_THRESHOLDS = {
-    "meta-llama/Llama-3.1-8B-Instruct": 0.83,
+    "meta-llama/Llama-3.1-8B-Instruct": 0.82,
     "mistralai/Mistral-7B-Instruct-v0.3": 0.58,
-    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84,
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
     "google/gemma-2-27b-it": 0.92,
-    "meta-llama/Llama-3.1-70B-Instruct": 0.96,
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.63,
-    "Qwen/Qwen2-57B-A14B-Instruct": 0.87,
-    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84,
+    "meta-llama/Llama-3.1-70B-Instruct": 0.95,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
+    "Qwen/Qwen2-57B-A14B-Instruct": 0.88,
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83,
     "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
-    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83,
+    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84,
     "neuralmagic/gemma-2-2b-it-FP8": 0.60,
-    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95,
-    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61,
-    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95,
+    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.94,
+    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.62,
+    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.94,
     "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
     "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
     "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.83,
@@ -67,7 +68,6 @@ def launch_server(base_url, model, is_fp8, is_tp2):
         base_url,
         timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
         other_args=other_args,
-        return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL),
     )
 
     return process
@@ -99,6 +99,9 @@ def write_results_to_json(model, metrics, mode="a"):
 def check_model_scores(results):
     failed_models = []
+
+    summary = " | model | score | threshold |\n"
+    summary += "| ----- | ----- | --------- |\n"
 
     for model, score in results:
         threshold = MODEL_SCORE_THRESHOLDS.get(model)
         if threshold is None:
@@ -111,11 +114,19 @@ def check_model_scores(results):
                 f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
             )
 
+        line = f"| {model} | {score} | {threshold} |\n"
+        summary += line
+
+    print(summary)
+
+    if is_in_ci():
+        write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}")
+
     if failed_models:
         raise AssertionError("\n".join(failed_models))
 
 
-class TestEvalAccuracyLarge(unittest.TestCase):
+class TestNightlyGsm8KEval(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model_groups = [
@@ -127,13 +138,6 @@ class TestEvalAccuracyLarge(unittest.TestCase):
         ]
         cls.base_url = DEFAULT_URL_FOR_TEST
 
-    def setUp(self):
-        self.process = None
-
-    def tearDown(self):
-        if self.process:
-            kill_process_tree(self.process.pid)
-
     def test_mgsm_en_all_models(self):
         warnings.filterwarnings(
             "ignore", category=ResourceWarning, message="unclosed.*socket"
@@ -144,7 +148,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
         for model_group, is_fp8, is_tp2 in self.model_groups:
             for model in model_group:
                 with self.subTest(model=model):
-                    self.process = launch_server(self.base_url, model, is_fp8, is_tp2)
+                    process = launch_server(self.base_url, model, is_fp8, is_tp2)
 
                     args = SimpleNamespace(
                         base_url=self.base_url,
@@ -163,8 +167,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
                     is_first = False
 
                 all_results.append((model, metrics["score"]))
-
-            self.tearDown()
+                kill_process_tree(process.pid)
 
         try:
             with open("results.json", "r") as f:
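The new summary table is printed locally and, when running in CI, written to the GitHub Actions job summary. GitHub Actions exposes that summary as a markdown file whose path is given by the GITHUB_STEP_SUMMARY environment variable; anything appended to the file is rendered on the workflow run page. A minimal sketch of what a helper like write_github_step_summary could look like (the actual implementation in sglang.test.test_utils may differ):

```python
import os

def write_github_step_summary(content: str) -> None:
    # GITHUB_STEP_SUMMARY points at a per-step markdown file that GitHub
    # Actions renders on the run page; it only exists inside a CI job.
    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
    if summary_path:
        with open(summary_path, "a") as f:
            f.write(content)
```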
test_nightly_human_eval.py

@@ -18,7 +18,7 @@ from sglang.test.test_utils import (
 )
 
 
-class TestEvalAccuracyLarge(unittest.TestCase):
+class TestNightlyHumanEval(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         if is_in_ci():
test_skip_tokenizer_init.py

@@ -55,8 +55,10 @@ class TestSkipTokenizerInit(unittest.TestCase):
         print(json.dumps(ret))
 
         def assert_one_item(item):
-            assert len(item["token_ids"]) == item["meta_info"]["completion_tokens"]
-            assert len(item["token_ids"]) == max_new_tokens
+            self.assertEqual(
+                len(item["token_ids"]), item["meta_info"]["completion_tokens"]
+            )
+            self.assertEqual(len(item["token_ids"]), max_new_tokens)
             assert item["meta_info"]["prompt_tokens"] == len(input_ids)
 
             if return_logprob:
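Swapping bare assert for self.assertEqual buys unittest's richer failure output: on a mismatch the report shows both values (e.g. "12 != 16") instead of a bare AssertionError, which makes CI logs much easier to read. A standalone example of the difference:

```python
import unittest

class Demo(unittest.TestCase):
    def test_bare_assert(self):
        # Fails with just "AssertionError"; the values are not shown.
        assert 12 == 16

    def test_assert_equal(self):
        # Fails with "AssertionError: 12 != 16"; both values are shown.
        self.assertEqual(12, 16)

if __name__ == "__main__":
    unittest.main()
```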