Unverified Commit b22f3f64 authored by Lianmin Zheng, committed by GitHub

Fix nightly accuracy tests (#2780)

parent 6fb57683
......@@ -36,7 +36,7 @@ DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
......
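For context, these constants are plain comma-separated model lists that the nightly suites expand per model. A minimal sketch of that expansion (the `parse_models` helper is hypothetical, added here only for illustration, not part of this commit):

```python
# Hypothetical helper: split a comma-separated nightly model list into
# individual model names, dropping stray whitespace/empty entries.
def parse_models(model_string: str) -> list[str]:
    return [m.strip() for m in model_string.split(",") if m.strip()]

tp1_models = parse_models(
    "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3"
)
print(tp1_models)
# ['meta-llama/Llama-3.1-8B-Instruct', 'mistralai/Mistral-7B-Instruct-v0.3']
```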
......@@ -49,8 +49,7 @@ suites = {
],
"nightly": [
"test_nightly_gsm8k_eval.py",
"test_nightly_human_eval.py",
# Disable temporarly
# Disable temporarily
# "test_nightly_math_eval.py",
],
"sampling/penaltylib": glob.glob(
......
import json
import os
import subprocess
import unittest
import warnings
from datetime import datetime
......@@ -16,24 +15,26 @@ from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
is_in_ci,
popen_launch_server,
write_github_step_summary,
)
MODEL_SCORE_THRESHOLDS = {
"meta-llama/Llama-3.1-8B-Instruct": 0.83,
"meta-llama/Llama-3.1-8B-Instruct": 0.82,
"mistralai/Mistral-7B-Instruct-v0.3": 0.58,
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84,
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
"google/gemma-2-27b-it": 0.92,
"meta-llama/Llama-3.1-70B-Instruct": 0.96,
"mistralai/Mixtral-8x7B-Instruct-v0.1": 0.63,
"Qwen/Qwen2-57B-A14B-Instruct": 0.87,
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84,
"meta-llama/Llama-3.1-70B-Instruct": 0.95,
"mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
"Qwen/Qwen2-57B-A14B-Instruct": 0.88,
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83,
"neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83,
"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84,
"neuralmagic/gemma-2-2b-it-FP8": 0.60,
"neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95,
"neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61,
"neuralmagic/Qwen2-72B-Instruct-FP8": 0.95,
"neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.94,
"neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.62,
"neuralmagic/Qwen2-72B-Instruct-FP8": 0.94,
"neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
"hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.83,
......@@ -67,7 +68,6 @@ def launch_server(base_url, model, is_fp8, is_tp2):
base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=other_args,
return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL),
)
return process
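This hunk drops the `return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL)` argument, so the launched server's output is no longer silenced. A minimal sketch of the behavioral difference, assuming the argument was forwarded to `subprocess.Popen`:

```python
import subprocess
import sys

# With DEVNULL the child's output is discarded, which hides server logs
# from the CI run:
quiet = subprocess.Popen(
    [sys.executable, "-c", "print('server log line')"],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)
quiet.wait()

# Without redirection the child inherits the parent's streams, so the
# same line shows up directly in the CI log:
verbose = subprocess.Popen([sys.executable, "-c", "print('server log line')"])
verbose.wait()
```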
......@@ -99,6 +99,9 @@ def write_results_to_json(model, metrics, mode="a"):
def check_model_scores(results):
failed_models = []
summary = " | model | score | threshold |\n"
summary += "| ----- | ----- | --------- |\n"
for model, score in results:
threshold = MODEL_SCORE_THRESHOLDS.get(model)
if threshold is None:
......@@ -111,11 +114,19 @@ def check_model_scores(results):
f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
)
line = f"| {model} | {score} | {threshold} |\n"
summary += line
print(summary)
if is_in_ci():
write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}")
if failed_models:
raise AssertionError("\n".join(failed_models))
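A self-contained sketch of the pattern `check_model_scores` implements after this change: look up each model's threshold, accumulate a markdown table for the GitHub step summary, and raise once at the end so every model is reported. The threshold value below is illustrative, and `print` stands in for `write_github_step_summary`:

```python
# Illustrative threshold, not the value pinned by the suite.
THRESHOLDS = {"meta-llama/Llama-3.1-8B-Instruct": 0.82}

def check_scores(results):
    failed = []
    summary = "| model | score | threshold |\n| ----- | ----- | --------- |\n"
    for model, score in results:
        threshold = THRESHOLDS.get(model)
        if threshold is not None and score < threshold:
            failed.append(f"{model}: {score:.4f} < {threshold:.4f}")
        summary += f"| {model} | {score} | {threshold} |\n"
    print(summary)  # in CI this goes to the GitHub step summary instead
    if failed:
        # Raising once at the end reports all failing models together.
        raise AssertionError("\n".join(failed))

check_scores([("meta-llama/Llama-3.1-8B-Instruct", 0.85)])  # passes
```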
class TestEvalAccuracyLarge(unittest.TestCase):
class TestNightlyGsm8KEval(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model_groups = [
......@@ -127,13 +138,6 @@ class TestEvalAccuracyLarge(unittest.TestCase):
]
cls.base_url = DEFAULT_URL_FOR_TEST
def setUp(self):
self.process = None
def tearDown(self):
if self.process:
kill_process_tree(self.process.pid)
def test_mgsm_en_all_models(self):
warnings.filterwarnings(
"ignore", category=ResourceWarning, message="unclosed.*socket"
......@@ -144,7 +148,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
for model_group, is_fp8, is_tp2 in self.model_groups:
for model in model_group:
with self.subTest(model=model):
self.process = launch_server(self.base_url, model, is_fp8, is_tp2)
process = launch_server(self.base_url, model, is_fp8, is_tp2)
args = SimpleNamespace(
base_url=self.base_url,
......@@ -163,8 +167,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
is_first = False
all_results.append((model, metrics["score"]))
self.tearDown()
kill_process_tree(process.pid)
try:
with open("results.json", "r") as f:
......
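The test previously parked the server handle on `self.process` and relied on `setUp`/`tearDown`; after this change each iteration kills its own process directly via `kill_process_tree(process.pid)`. A condensed, runnable sketch of that per-model loop, using stand-ins for the real helpers and a `try/finally` so the server dies even if the eval raises:

```python
import os
import signal
import subprocess
import sys

def kill_process_tree(pid: int) -> None:
    # Stand-in for sglang's kill_process_tree; only signals the one pid.
    os.kill(pid, signal.SIGTERM)

def launch_server() -> subprocess.Popen:
    # Stand-in server process: sleeps so there is something to clean up.
    return subprocess.Popen([sys.executable, "-c", "import time; time.sleep(60)"])

def run_eval() -> dict:
    return {"score": 0.85}  # stand-in metrics

all_results = []
for model in ["model-a", "model-b"]:
    process = launch_server()
    try:
        metrics = run_eval()
        all_results.append((model, metrics["score"]))
    finally:
        # try/finally guarantees cleanup on eval failure; the committed
        # test calls kill_process_tree(process.pid) unconditionally.
        kill_process_tree(process.pid)
print(all_results)
```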
......@@ -18,7 +18,7 @@ from sglang.test.test_utils import (
)
class TestEvalAccuracyLarge(unittest.TestCase):
class TestNightlyHumanEval(unittest.TestCase):
@classmethod
def setUpClass(cls):
if is_in_ci():
......
......@@ -55,8 +55,10 @@ class TestSkipTokenizerInit(unittest.TestCase):
print(json.dumps(ret))
def assert_one_item(item):
assert len(item["token_ids"]) == item["meta_info"]["completion_tokens"]
assert len(item["token_ids"]) == max_new_tokens
self.assertEqual(
len(item["token_ids"]), item["meta_info"]["completion_tokens"]
)
self.assertEqual(len(item["token_ids"]), max_new_tokens)
assert item["meta_info"]["prompt_tokens"] == len(input_ids)
if return_logprob:
......
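This last hunk swaps bare `assert` statements for `unittest`'s `assertEqual`, which reports both operands on failure and is not stripped when Python runs with `-O`. A minimal contrast:

```python
import unittest

class Demo(unittest.TestCase):
    def test_compare(self):
        a, b = 3, 3
        # A bare `assert a == b` raises a bare AssertionError with no
        # operand values, and is skipped entirely under `python -O`.
        # assertEqual reports both sides on mismatch, e.g. "3 != 4".
        self.assertEqual(a, b)

if __name__ == "__main__":
    unittest.main()
```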