Unverified commit 4f8c3aea authored by Yineng Zhang, committed by GitHub

minor: update gsm8k threshold (#2125)

parent 2369e882
@@ -27,14 +27,14 @@ jobs:
           bash scripts/ci_install_dependency.sh
           pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
-      - name: Test human eval
+      - name: Test gsm8k
         timeout-minutes: 120
         run: |
           cd test/srt
-          python3 test_nightly_human_eval.py
+          python3 test_nightly_gsm8k_eval.py
-      - name: Test gsm8k
+      - name: Test human eval
         timeout-minutes: 120
         run: |
           cd test/srt
-          python3 test_nightly_gsm8k_eval.py
+          python3 test_nightly_human_eval.py
@@ -439,18 +439,22 @@ def popen_launch_server(
     process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
     start_time = time.time()
-    while time.time() - start_time < timeout:
-        try:
-            headers = {
-                "Content-Type": "application/json; charset=utf-8",
-                "Authorization": f"Bearer {api_key}",
-            }
-            response = requests.get(f"{base_url}/health_generate", headers=headers)
-            if response.status_code == 200:
-                return process
-        except requests.RequestException:
-            pass
-        time.sleep(10)
+    with requests.Session() as session:
+        while time.time() - start_time < timeout:
+            try:
+                headers = {
+                    "Content-Type": "application/json; charset=utf-8",
+                    "Authorization": f"Bearer {api_key}",
+                }
+                response = session.get(
+                    f"{base_url}/health_generate",
+                    headers=headers,
+                )
+                if response.status_code == 200:
+                    return process
+            except requests.RequestException:
+                pass
+            time.sleep(10)
     raise TimeoutError("Server failed to start within the timeout period.")
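Note: the change above wraps the health-check retry loop in a `requests.Session`, so a pooled connection is reused across polls instead of opening (and potentially leaking) a new socket on every attempt. Read in isolation, the pattern looks roughly like the sketch below; `wait_for_health` and its default timeout are hypothetical names used only for illustration.

```python
import time

import requests


def wait_for_health(base_url: str, api_key: str, timeout: float = 600.0) -> bool:
    """Poll /health_generate until the server answers 200 or the timeout expires."""
    headers = {
        "Content-Type": "application/json; charset=utf-8",
        "Authorization": f"Bearer {api_key}",
    }
    start_time = time.time()
    # One Session pools the underlying TCP connection across retries.
    with requests.Session() as session:
        while time.time() - start_time < timeout:
            try:
                response = session.get(f"{base_url}/health_generate", headers=headers)
                if response.status_code == 200:
                    return True
            except requests.RequestException:
                pass  # server not accepting connections yet; retry
            time.sleep(10)
    return False
```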
......
 import json
 import os
+import subprocess
 import unittest
+import warnings
 from datetime import datetime
 from types import SimpleNamespace
@@ -18,23 +20,23 @@ from sglang.test.test_utils import (
 )
 MODEL_SCORE_THRESHOLDS = {
-    "meta-llama/Llama-3.1-8B-Instruct": 0.8316,
-    "mistralai/Mistral-7B-Instruct-v0.3": 0.5861,
-    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.8672,
-    "google/gemma-2-27b-it": 0.9227,
-    "meta-llama/Llama-3.1-70B-Instruct": 0.9623,
-    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.6415,
-    "Qwen/Qwen2-57B-A14B-Instruct": 0.8791,
-    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.8672,
-    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.5544,
-    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.8356,
-    "neuralmagic/gemma-2-2b-it-FP8": 0.6059,
-    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.9504,
-    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.6138,
-    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.9504,
-    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.8197,
-    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.8395,
-    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.8435,
+    "meta-llama/Llama-3.1-8B-Instruct": 0.83,
+    "mistralai/Mistral-7B-Instruct-v0.3": 0.58,
+    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84,
+    "google/gemma-2-27b-it": 0.92,
+    "meta-llama/Llama-3.1-70B-Instruct": 0.96,
+    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
+    "Qwen/Qwen2-57B-A14B-Instruct": 0.87,
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84,
+    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
+    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83,
+    "neuralmagic/gemma-2-2b-it-FP8": 0.60,
+    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95,
+    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61,
+    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95,
+    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
+    "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.84,
 }
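Note: the thresholds move from exact historical scores to rounded lower bounds. A minimal sketch of how such a table can gate a measured gsm8k accuracy is shown below; `check_model_score` is a hypothetical helper, not the test file's actual assertion code.

```python
# Illustrative excerpt only; the real table lists all nightly models.
MODEL_SCORE_THRESHOLDS = {
    "meta-llama/Llama-3.1-8B-Instruct": 0.83,
    "mistralai/Mistral-7B-Instruct-v0.3": 0.58,
}


def check_model_score(model: str, measured_accuracy: float) -> None:
    # Hypothetical helper: fail the nightly run if accuracy drops below the bound.
    threshold = MODEL_SCORE_THRESHOLDS[model]
    assert measured_accuracy >= threshold, (
        f"{model}: gsm8k accuracy {measured_accuracy:.4f} is below {threshold:.2f}"
    )


check_model_score("meta-llama/Llama-3.1-8B-Instruct", 0.8375)  # passes: 0.8375 >= 0.83
```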
@@ -65,6 +67,7 @@ def launch_server(base_url, model, is_fp8, is_tp2):
         base_url,
         timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
         other_args=other_args,
+        return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL),
     )
     return process
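Note: the nightly launcher now passes `(subprocess.DEVNULL, subprocess.DEVNULL)` so the launched server's output is discarded rather than mixed into the test log. The snippet below only illustrates what `subprocess.DEVNULL` does when handed to `Popen`; whether `popen_launch_server` forwards the tuple exactly this way is an assumption.

```python
import subprocess

# Assumption: popen_launch_server hands this tuple to Popen's stdout/stderr.
# DEVNULL discards the child's output instead of inheriting the parent's
# streams, keeping the nightly CI log readable.
stdout, stderr = subprocess.DEVNULL, subprocess.DEVNULL
process = subprocess.Popen(
    ["echo", "server output goes nowhere"],  # placeholder command
    stdout=stdout,
    stderr=stderr,
)
process.wait()
```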
@@ -132,6 +135,9 @@ class TestEvalAccuracyLarge(unittest.TestCase):
         kill_child_process(self.process.pid, include_self=True)
     def test_mgsm_en_all_models(self):
+        warnings.filterwarnings(
+            "ignore", category=ResourceWarning, message="unclosed.*socket"
+        )
         is_first = True
         all_results = []
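Note: `test_mgsm_en_all_models` now filters CPython's `ResourceWarning` about unclosed sockets, which can be emitted when sockets are garbage-collected during the long multi-model run. A self-contained sketch of the same filter, with a deliberately leaked socket for demonstration:

```python
import gc
import socket
import warnings

# Same filter as the test adds: ignore "unclosed <socket ...>" ResourceWarnings
# (matched by message regex) without silencing other warnings.
warnings.filterwarnings(
    "ignore", category=ResourceWarning, message="unclosed.*socket"
)

# A socket dropped without close() would normally trigger that warning when
# ResourceWarnings are enabled (e.g. under `python -X dev`); it is now ignored.
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
gc.collect()
```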
......