Unverified commit 4f8c3aea, authored by Yineng Zhang and committed by GitHub
Browse files

minor: update gsm8k threshold (#2125)

parent 2369e882
...@@ -27,14 +27,14 @@ jobs: ...@@ -27,14 +27,14 @@ jobs:
bash scripts/ci_install_dependency.sh bash scripts/ci_install_dependency.sh
pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus" pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
- name: Test human eval - name: Test gsm8k
timeout-minutes: 120 timeout-minutes: 120
run: | run: |
cd test/srt cd test/srt
python3 test_nightly_human_eval.py python3 test_nightly_gsm8k_eval.py
- name: Test gsm8k - name: Test human eval
timeout-minutes: 120 timeout-minutes: 120
run: | run: |
cd test/srt cd test/srt
python3 test_nightly_gsm8k_eval.py python3 test_nightly_human_eval.py
...@@ -439,18 +439,22 @@ def popen_launch_server( ...@@ -439,18 +439,22 @@ def popen_launch_server(
process = subprocess.Popen(command, stdout=None, stderr=None, env=env) process = subprocess.Popen(command, stdout=None, stderr=None, env=env)
start_time = time.time() start_time = time.time()
while time.time() - start_time < timeout: with requests.Session() as session:
try: while time.time() - start_time < timeout:
headers = { try:
"Content-Type": "application/json; charset=utf-8", headers = {
"Authorization": f"Bearer {api_key}", "Content-Type": "application/json; charset=utf-8",
} "Authorization": f"Bearer {api_key}",
response = requests.get(f"{base_url}/health_generate", headers=headers) }
if response.status_code == 200: response = session.get(
return process f"{base_url}/health_generate",
except requests.RequestException: headers=headers,
pass )
time.sleep(10) if response.status_code == 200:
return process
except requests.RequestException:
pass
time.sleep(10)
raise TimeoutError("Server failed to start within the timeout period.") raise TimeoutError("Server failed to start within the timeout period.")
......
import json import json
import os import os
import subprocess
import unittest import unittest
import warnings
from datetime import datetime from datetime import datetime
from types import SimpleNamespace from types import SimpleNamespace
...@@ -18,23 +20,23 @@ from sglang.test.test_utils import ( ...@@ -18,23 +20,23 @@ from sglang.test.test_utils import (
) )
MODEL_SCORE_THRESHOLDS = { MODEL_SCORE_THRESHOLDS = {
"meta-llama/Llama-3.1-8B-Instruct": 0.8316, "meta-llama/Llama-3.1-8B-Instruct": 0.83,
"mistralai/Mistral-7B-Instruct-v0.3": 0.5861, "mistralai/Mistral-7B-Instruct-v0.3": 0.58,
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.8672, "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.84,
"google/gemma-2-27b-it": 0.9227, "google/gemma-2-27b-it": 0.92,
"meta-llama/Llama-3.1-70B-Instruct": 0.9623, "meta-llama/Llama-3.1-70B-Instruct": 0.96,
"mistralai/Mixtral-8x7B-Instruct-v0.1": 0.6415, "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
"Qwen/Qwen2-57B-A14B-Instruct": 0.8791, "Qwen/Qwen2-57B-A14B-Instruct": 0.87,
"neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.8672, "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.84,
"neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.5544, "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
"neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.8356, "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8": 0.83,
"neuralmagic/gemma-2-2b-it-FP8": 0.6059, "neuralmagic/gemma-2-2b-it-FP8": 0.60,
"neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.9504, "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.95,
"neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.6138, "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8": 0.61,
"neuralmagic/Qwen2-72B-Instruct-FP8": 0.9504, "neuralmagic/Qwen2-72B-Instruct-FP8": 0.95,
"neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.8197, "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.8395, "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4": 0.84,
"hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.8435, "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4": 0.84,
} }
...@@ -65,6 +67,7 @@ def launch_server(base_url, model, is_fp8, is_tp2): ...@@ -65,6 +67,7 @@ def launch_server(base_url, model, is_fp8, is_tp2):
base_url, base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=other_args, other_args=other_args,
return_stdout_stderr=(subprocess.DEVNULL, subprocess.DEVNULL),
) )
return process return process
...@@ -132,6 +135,9 @@ class TestEvalAccuracyLarge(unittest.TestCase): ...@@ -132,6 +135,9 @@ class TestEvalAccuracyLarge(unittest.TestCase):
kill_child_process(self.process.pid, include_self=True) kill_child_process(self.process.pid, include_self=True)
def test_mgsm_en_all_models(self): def test_mgsm_en_all_models(self):
warnings.filterwarnings(
"ignore", category=ResourceWarning, message="unclosed.*socket"
)
is_first = True is_first = True
all_results = [] all_results = []
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment