Unverified commit b22f3f64, authored by Lianmin Zheng, committed by GitHub

Fix nightly accuracy tests (#2780)

parent 6fb57683
test_utils.py

@@ -36,7 +36,7 @@ DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
-DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
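These nightly-eval constants pack multiple model IDs into a single comma-separated string, so adding or dropping a model is a one-line change. A minimal sketch of how such a constant can be split back into individual names; the helper below is illustrative, not necessarily the utility the tests actually use:

```python
# Illustrative helper: split a comma-separated model constant into a list.
def parse_models(model_string: str) -> list[str]:
    return [name.strip() for name in model_string.split(",") if name.strip()]

tp2_models = parse_models(
    "meta-llama/Llama-3.1-70B-Instruct,"
    "mistralai/Mixtral-8x7B-Instruct-v0.1,"
    "Qwen/Qwen2-57B-A14B-Instruct"
)
# -> ['meta-llama/Llama-3.1-70B-Instruct',
#     'mistralai/Mixtral-8x7B-Instruct-v0.1',
#     'Qwen/Qwen2-57B-A14B-Instruct']
```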
run_suite.py

@@ -49,8 +49,7 @@ suites = {
     ],
     "nightly": [
         "test_nightly_gsm8k_eval.py",
-        "test_nightly_human_eval.py",
-        # Disable temporarly
+        # Disable temporarily
         # "test_nightly_math_eval.py",
     ],
     "sampling/penaltylib": glob.glob(
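The suites dict maps each suite name to the list of test files it runs. A hypothetical sketch of how a runner can consume such a dict; the flag name and the per-file subprocess strategy are assumptions for illustration, not run_suite.py's confirmed CLI:

```python
import argparse
import subprocess
import sys

# Trimmed-down stand-in for the suites dict above.
suites = {"nightly": ["test_nightly_gsm8k_eval.py"]}

parser = argparse.ArgumentParser()
parser.add_argument("--suite", choices=sorted(suites.keys()), required=True)
args = parser.parse_args()

for test_file in suites[args.suite]:
    # Run each test file in its own process so one crash cannot take
    # down the whole suite.
    ret = subprocess.run([sys.executable, test_file]).returncode
    if ret != 0:
        sys.exit(ret)
```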
test_nightly_gsm8k_eval.py

@@ -1,6 +1,5 @@
 import json
 import os
-import subprocess
 import unittest
 import warnings
 from datetime import datetime
@@ -16,24 +15,26 @@ from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
+    is_in_ci,
     popen_launch_server,
+    write_github_step_summary,
 )
 
 MODEL_SCORE_THRESHOLDS = {
-    "meta-llama/Llama-3.1-8B-Instruct": 0.83,
+    "meta-llama/Llama-3.1-8B-Instruct": 0.82,
     "mistralai/Mistral-7B-Instruct-v0.3": 0.58,
-    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84,
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
     "google/gemma-2-27b-it": 0.92,
-    "meta-llama/Llama-3.1-70B-Instruct": 0.96,
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.63,
-    "Qwen/Qwen2-57B-A14B-Instruct": 0.87,
-    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84,
+    "meta-llama/Llama-3.1-70B-Instruct": 0.95,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
+    "Qwen/Qwen2-57B-A14B-Instruct": 0.88,
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.83,
     "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
-    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83,
+    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.84,
     "neuralmagic/gemma-2-2b-it-FP8": 0.60,
-    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95,
-    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61,
-    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95,
+    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.94,
+    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.62,
+    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.94,
     "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
     "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
     "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.83,
@@ -67,7 +68,6 @@ def launch_server(base_url, model, is_fp8, is_tp2):
         base_url,
         timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
         other_args=other_args,
-        return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL),
     )
 
     return process
@@ -99,6 +99,9 @@ def write_results_to_json(model, metrics, mode="a"):
 def check_model_scores(results):
     failed_models = []
+
+    summary = " | model | score | threshold |\n"
+    summary += "| ----- | ----- | --------- |\n"
 
     for model, score in results:
         threshold = MODEL_SCORE_THRESHOLDS.get(model)
         if threshold is None:
@@ -111,11 +114,19 @@ def check_model_scores(results):
                 f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
             )
 
+        line = f"| {model} | {score} | {threshold} |\n"
+        summary += line
+
+    print(summary)
+
+    if is_in_ci():
+        write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}")
+
     if failed_models:
         raise AssertionError("\n".join(failed_models))
 
 
-class TestEvalAccuracyLarge(unittest.TestCase):
+class TestNightlyGsm8KEval(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model_groups = [
@@ -127,13 +138,6 @@ class TestEvalAccuracyLarge(unittest.TestCase):
         ]
         cls.base_url = DEFAULT_URL_FOR_TEST
 
-    def setUp(self):
-        self.process = None
-
-    def tearDown(self):
-        if self.process:
-            kill_process_tree(self.process.pid)
-
     def test_mgsm_en_all_models(self):
         warnings.filterwarnings(
             "ignore", category=ResourceWarning, message="unclosed.*socket"
@@ -144,7 +148,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
         for model_group, is_fp8, is_tp2 in self.model_groups:
             for model in model_group:
                 with self.subTest(model=model):
-                    self.process = launch_server(self.base_url, model, is_fp8, is_tp2)
+                    process = launch_server(self.base_url, model, is_fp8, is_tp2)
 
                     args = SimpleNamespace(
                         base_url=self.base_url,
@@ -163,8 +167,7 @@ class TestEvalAccuracyLarge(unittest.TestCase):
                     is_first = False
 
                 all_results.append((model, metrics["score"]))
-
-            self.tearDown()
+                kill_process_tree(process.pid)
 
         try:
             with open("results.json", "r") as f:
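The new summary table is printed locally and, when running in CI, written to the GitHub Actions job summary. GitHub Actions exposes that summary as a markdown file whose path is given by the GITHUB_STEP_SUMMARY environment variable; anything appended to the file is rendered on the workflow run page. A minimal sketch of what a helper like write_github_step_summary could look like (the actual implementation in sglang.test.test_utils may differ):

```python
import os

def write_github_step_summary(content: str) -> None:
    # GITHUB_STEP_SUMMARY points at a per-step markdown file that GitHub
    # Actions renders on the run page; it only exists inside a CI job.
    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
    if summary_path:
        with open(summary_path, "a") as f:
            f.write(content)
```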
test_nightly_human_eval.py

@@ -18,7 +18,7 @@ from sglang.test.test_utils import (
 )
 
 
-class TestEvalAccuracyLarge(unittest.TestCase):
+class TestNightlyHumanEval(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         if is_in_ci():
test_skip_tokenizer_init.py

@@ -55,8 +55,10 @@ class TestSkipTokenizerInit(unittest.TestCase):
         print(json.dumps(ret))
 
         def assert_one_item(item):
-            assert len(item["token_ids"]) == item["meta_info"]["completion_tokens"]
-            assert len(item["token_ids"]) == max_new_tokens
+            self.assertEqual(
+                len(item["token_ids"]), item["meta_info"]["completion_tokens"]
+            )
+            self.assertEqual(len(item["token_ids"]), max_new_tokens)
             assert item["meta_info"]["prompt_tokens"] == len(input_ids)
 
             if return_logprob:
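Swapping bare assert for self.assertEqual buys unittest's richer failure output: on a mismatch the report shows both values (e.g. "12 != 16") instead of a bare AssertionError, which makes CI logs much easier to read. A standalone example of the difference:

```python
import unittest

class Demo(unittest.TestCase):
    def test_bare_assert(self):
        # Fails with just "AssertionError"; the values are not shown.
        assert 12 == 16

    def test_assert_equal(self):
        # Fails with "AssertionError: 12 != 16"; both values are shown.
        self.assertEqual(12, 16)

if __name__ == "__main__":
    unittest.main()
```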