Unverified Commit 41598e0d authored by Lianmin Zheng, committed by GitHub

Add longer accuracy test on CI (#1049)

parent 89f23a51
name: Accuracy Test

on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:

concurrency:
  group: accuracy-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
  accuracy-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: accuracy

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Install dependencies
        run: |
          source $HOME/venv/bin/activate
          echo "$HOME/venv/bin" >> $GITHUB_PATH
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
          pip install -e .

      - name: Evaluate Accuracy
        run: |
          cd test/srt
          python3 test_eval_accuracy_large.py
@@ -20,7 +20,7 @@ concurrency:
 jobs:
   e2e-test:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
-    runs-on: bench
+    runs-on: e2e
     steps:
       - name: Checkout code
......
@@ -154,7 +154,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
   ```
-  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 2048
+  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
   ```
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
   ```
......
@@ -669,19 +669,20 @@ async def benchmark(
             "backend": args.backend,
             "dataset_name": args.dataset_name,
             "request_rate": request_rate,
-            "total_input": metrics.total_input,
-            "total_output": metrics.total_output,
-            "total_output_retokenized": metrics.total_output_retokenized,
-            "mean_e2e_latency": metrics.mean_e2e_latency_ms,
-            "median_e2e_latency": metrics.median_e2e_latency_ms,
-            "median_ttft": metrics.median_ttft_ms,
-            "median_itl": metrics.median_itl_ms,
-            "output_token_throughput": metrics.output_throughput,
+            "total_input_tokens": metrics.total_input,
+            "total_output_tokens": metrics.total_output,
+            "total_output_tokens_retokenized": metrics.total_output_retokenized,
+            "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+            "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+            "median_ttft_ms": metrics.median_ttft_ms,
+            "median_itl_ms": metrics.median_itl_ms,
+            "output_throughput": metrics.output_throughput,
             "sharegpt_output_len": args.sharegpt_output_len,
             "random_input_len": args.random_input_len,
             "random_output_len": args.random_output_len,
             "random_range_ratio": args.random_range_ratio,
-            "benchmark_duration": benchmark_duration,
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
         }
     else:
         print(f"Error running benchmark for request rate: {request_rate}")
......
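For downstream scripts, a minimal sketch of reading the renamed fields back, assuming the benchmark was run with an output file that collects one JSON record per line (`results.jsonl` is a placeholder name, not from this commit):

```
import json

# Print the renamed metric fields from each benchmark record.
with open("results.jsonl") as f:
    for line in f:
        record = json.loads(line)
        print(
            f"rate={record['request_rate']} "
            f"input={record['total_input_tokens']} tok, "
            f"output={record['total_output_tokens']} tok, "
            f"median TTFT={record['median_ttft_ms']:.1f} ms, "
            f"throughput={record['output_throughput']:.1f} tok/s, "
            f"completed={record['completed']} in {record['duration']:.1f} s"
        )
```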
@@ -64,8 +64,7 @@ from sglang.utils import get_exception_traceback
 logger = logging.getLogger(__name__)

-# TODO: Rename "CI" to "SGLANG_IS_IN_CI".
-crash_on_warning = os.getenv("CI", "false") == "true"
+crash_on_warning = os.getenv("SGLANG_IS_IN_CI", "false") == "true"


 class ModelTpServer:
......
@@ -39,6 +39,14 @@ def run_eval(args):
         eval_obj = MathEval(
             filename, equality_checker, args.num_examples, args.num_threads
         )
+    elif args.eval_name == "mgsm":
+        from sglang.test.simple_eval_mgsm import MGSMEval
+
+        eval_obj = MGSMEval(args.num_examples, args.num_threads)
+    elif args.eval_name == "mgsm_en":
+        from sglang.test.simple_eval_mgsm import MGSMEval
+
+        eval_obj = MGSMEval(args.num_examples, args.num_threads, languages=["en"])
     elif args.eval_name == "gpqa":
         from sglang.test.simple_eval_gpqa import GPQAEval
......
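The new branches make `mgsm` and `mgsm_en` reachable through the same `run_eval` entry point the tests use. A minimal sketch, assuming a server is already listening (the URL and model name below are examples, not fixed values):

```
from types import SimpleNamespace

from sglang.test.run_eval import run_eval

# Point at an already-running sglang server.
args = SimpleNamespace(
    base_url="http://127.0.0.1:30000",
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    eval_name="mgsm_en",
    num_examples=None,  # None = use the full per-language subset
    num_threads=64,
)
metrics = run_eval(args)
print(metrics["score"])
```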
# Adapted from https://github.com/openai/simple-evals/
"""
MGSM: Multilingual Grade School Math (MGSM) is a benchmark of grade-school math problems.

Language Models are Multilingual Chain-of-Thought Reasoners
Freda Shi, Mirac Suzgun, Markus Freitag, Xuezhi Wang, Suraj Srivats, Soroush Vosoughi, Hyung Won Chung, Yi Tay, Sebastian Ruder, Denny Zhou, Dipanjan Das, Jason Wei
https://arxiv.org/abs/2210.03057
Reference: https://github.com/google-research/url-nlp
"""
import re
import urllib.request
from typing import Optional

from sglang.test import simple_eval_common as common
from sglang.test.simple_eval_common import (
    HTML_JINJA,
    Eval,
    EvalResult,
    SamplerBase,
    SingleEvalResult,
)
ALL_LANGUAGES = ["bn", "de", "en", "es", "fr", "ja", "ru", "sw", "te", "th", "zh"]
LATIN_LANGUAGES = ["de", "en", "es", "fr", "sw"]
NON_LATIN_LANGUAGES = ["bn", "ja", "ru", "te", "th", "zh"]

LANG_TO_FPATH = {
    "bn": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_bn.tsv",
    "de": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_de.tsv",
    "en": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_en.tsv",
    "es": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_es.tsv",
    "fr": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_fr.tsv",
    "ja": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_ja.tsv",
    "ru": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_ru.tsv",
    "sw": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_sw.tsv",
    "te": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_te.tsv",
    "th": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_th.tsv",
    "zh": "https://openaipublic.blob.core.windows.net/simple-evals/mgsm_zh.tsv",
}
LANG_TO_INSTRUCTIONS = {
    "en": """Solve this math problem. Give the reasoning steps before giving the final answer on the last line by itself in the format of "Answer:". Do not add anything other than the integer answer after "Answer:".

{input}""",
    "bn": """এই গণিতের সমস্যাটি সমাধান করুন। চূড়ান্ত উত্তর দেওয়ার আগে যুক্তিসম্পন্ন পদক্ষেপ প্রদান করুন। চূড়ান্ত উত্তরটি একক সংখ্যা হিসাবে "উত্তর:" এর পরে শেষ লাইনে দিন। "উত্তর:" এর পরে অন্য কিছু যুক্ত করবেন না।.

{input}""",
    "de": """Löse dieses Mathematikproblem. Gib die Schritte zur Begründung an, bevor du die endgültige Antwort in der letzten Zeile alleine im Format "Antwort:" gibst. Füge nichts anderes als die ganzzahlige Antwort nach "Antwort:" hinzu.

{input}""",
    "es": """Resuelve este problema matemático. Proporciona los pasos de razonamiento antes de dar la respuesta final en la última línea por sí misma en el formato de "Respuesta:". No añadas nada más que la respuesta entera después de "Respuesta:".

{input}""",
    "fr": """Résolvez ce problème de mathématiques. Donnez les étapes de raisonnement avant de fournir la réponse finale sur la dernière ligne elle-même dans le format de "Réponse:". N'ajoutez rien d'autre que la réponse entière après "Réponse:".

{input}""",
    "ja": """の数学の問題を解いてください。最終的な答えを出す前に、解答の推論過程を記述してください。そして最後の行には "答え:" の形式で答えを記述し、その後には整数の答え以外何も追加しないでください。

{input}""",
    "ru": """Решите эту математическую задачу. Объясните шаги рассуждения перед тем, как дать окончательный ответ в последней строке сам по себе в формате "Ответ:". Не добавляйте ничего, кроме целочисленного ответа после "Ответ:".

{input}""",
    "sw": """Suluhisha tatizo hili la hesabu. Toa hatua za mantiki kabla ya kutoa jibu la mwisho kwenye mstari wa mwisho peke yake katika muundo wa "Jibu:". Usiongeze chochote kingine isipokuwa jibu la integer baada ya "Jibu:".

{input}""",
    "te": """ఈ గణిత సమస్యను పరిష్కరించండి. చివరి సమాధానాన్ని ఇవ్వదానికి ముందు తర్కాత్మక అదుగులను ఇవ్వండి. చివరి పంక్తిలో మాత్రమే 'సమాధానం:' అనే ఆకారంలో చివరి సమాధానాద్ని ఇవ్వండి సమాధానం: తర్వాత పూర్ణాంక సమాధానానికి తప్పించి ఎదేనా చేర్చవద్దు.

{input}""",
    "th": """แก้ปัญหาคณิตศาสตร์นี้ ให้ให้ขั้นตอนการใช้เหตุผลก่อนที่จะให้คำตอบสุดท้ายในบรรทัดสุดท้ายโดยอยู่ในรูปแบบ "คำตอบ:" ไม่ควรเพิ่มอะไรนอกจากคำตอบที่เป็นจำนวนเต็มหลังจาก "คำตอบ:"

{input}""",
    "zh": """解决这个数学问题。在最后一行给出答案前，请提供推理步骤。最后一行应该以 "答案: " 的形式独立给出答案。在 "答案:" 后不要添加除整数答案之外的任何内容。

{input}""",
}
LANG_TO_ANSWER_PREFIX = {
    "en": "Answer",
    "bn": "উত্তর",
    "de": "Antwort",
    "es": "Respuesta",
    "fr": "Réponse",
    "ja": "答え",
    "ru": "Ответ",
    "sw": "Jibu",
    "te": "సమాధానం",
    "th": "คำตอบ",
    "zh": "答案",
}
def parse_answer(answer: str, answer_prefix: str) -> str:
    if answer_prefix not in answer:
        return ""

    answer_text = answer.split(answer_prefix)[-1].strip()

    # find all the numbers (including decimals) in the string
    numbers = re.findall(r"\d+\.?\d*", answer_text.replace(",", ""))

    # return the last number (removing a trailing decimal point if present),
    # or an empty string if there were no numbers
    return numbers[-1].rstrip(".") if numbers else ""
def score_mgsm(target: str, prediction: str) -> bool:
    if "." in prediction:
        prediction = prediction.rstrip("0").rstrip(".")

    target = target.replace(",", "")
    prediction = prediction.replace(",", "")

    return target == prediction
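# Example of the two helpers together (illustrative values):
#   parse_answer("Step 1: ... Answer: 1,234.", "Answer") -> "1234"
#   score_mgsm("1234", "1234") -> True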
def get_lang_examples(lang: str) -> list[dict[str, str]]:
    fpath = LANG_TO_FPATH[lang]
    examples = []
    with urllib.request.urlopen(fpath) as f:
        for line in f.read().decode("utf-8").splitlines():
            inputs, targets = line.strip().split("\t")
            if "." in targets:
                raise ValueError(f"targets {targets} contains a decimal point.")
            # targets = int(targets.replace(",", ""))
            examples.append({"inputs": inputs, "targets": targets, "lang": lang})
    return examples
def get_all_examples() -> list[dict[str, str]]:
    examples = []
    for lang in ALL_LANGUAGES:
        # Note: currently restricted to English only.
        if lang != "en":
            continue
        examples += get_lang_examples(lang)
    return examples
class MGSMEval(Eval):
    def __init__(
        self,
        num_examples_per_lang: int = 250,  # restrict to a subset of the data for debugging
        num_threads: int = 64,
        languages: Optional[list[str]] = ALL_LANGUAGES,
    ):
        if languages is None:
            languages = ALL_LANGUAGES
        else:
            for language in languages:
                if language not in ALL_LANGUAGES:
                    raise ValueError(
                        f"language {language} is not a valid language. "
                        f"It should be one in {ALL_LANGUAGES}"
                    )
        self._languages = languages
        self._num_examples_per_lang = num_examples_per_lang
        self._num_threads = num_threads

        examples = []
        for lang in self._languages:
            lang_examples = get_lang_examples(lang)
            examples.extend(lang_examples[: self._num_examples_per_lang])
        self.examples = examples
    def __call__(self, sampler: SamplerBase) -> EvalResult:
        def fn(example: dict[str, str]):
            language = example["lang"]
            latin_language = (
                "group_latin" if language in LATIN_LANGUAGES else "group_non_latin"
            )
            correct_answer = example["targets"]
            instruction = LANG_TO_INSTRUCTIONS[language]

            prompt_messages = [
                sampler._pack_message(
                    content=instruction.format(input=example["inputs"]), role="user"
                )
            ]
            try:
                response_text = sampler(prompt_messages)
            except Exception:
                response_text = ""

            answer_prefix = LANG_TO_ANSWER_PREFIX[language]
            extracted_answer = parse_answer(response_text, answer_prefix)

            score = score_mgsm(correct_answer, extracted_answer)
            html = common.jinja_env.from_string(HTML_JINJA).render(
                prompt_messages=prompt_messages,
                next_message=dict(content=response_text, role="assistant"),
                score=score,
                correct_answer=correct_answer,
                extracted_answer=extracted_answer,
            )
            convo = prompt_messages + [dict(content=response_text, role="assistant")]
            return SingleEvalResult(
                html=html,
                score=score,
                convo=convo,
                metrics={language: score, latin_language: score},
            )

        results = common.map_with_progress(
            fn, self.examples, num_threads=self._num_threads
        )
        return common.aggregate_results(results, default_stats=("mean", "std"))
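For local experimentation, a minimal sketch of driving `MGSMEval` directly. The `ConstantSampler` below is a hypothetical stand-in for the `SamplerBase` implementations in `sglang.test.simple_eval_common`, which talk to a running server; the sketch assumes `EvalResult` exposes `score` and `metrics` as in upstream simple-evals:

```
from sglang.test.simple_eval_mgsm import MGSMEval

class ConstantSampler:
    """Hypothetical stand-in for a SamplerBase implementation."""

    def _pack_message(self, content, role):
        return {"content": content, "role": role}

    def __call__(self, messages):
        # Always answers 0, so the resulting score should be near zero.
        return "Answer: 0"

# Two English examples on one thread: just enough to exercise the pipeline.
result = MGSMEval(num_examples_per_lang=2, num_threads=1, languages=["en"])(
    ConstantSampler()
)
print(result.score, result.metrics)
```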
@@ -7,7 +7,7 @@ suites = {
     "minimal": [
         "test_chunked_prefill.py",
         "test_embedding_openai_server.py",
-        "test_eval_accuracy.py",
+        "test_eval_accuracy_mini.py",
         "test_large_max_new_tokens.py",
         "test_openai_server.py",
         "test_skip_tokenizer_init.py",
......
@@ -10,34 +10,41 @@ from sglang.test.test_utils import (
 )


-class TestAccuracy(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
+class TestChunkedPrefill(unittest.TestCase):
+    def run_mmlu(self, disable_radix_cache):
+        other_args = ["--chunked-prefill-size", "32"]
+        if disable_radix_cache:
+            other_args += ["--disable-radix-cache"]
+
+        model = DEFAULT_MODEL_NAME_FOR_TEST
+        base_url = DEFAULT_URL_FOR_TEST
+        process = popen_launch_server(
+            model,
+            base_url,
             timeout=300,
-            other_args=["--chunked-prefill-size", "32"],
+            other_args=other_args,
         )

-    @classmethod
-    def tearDownClass(cls):
-        kill_child_process(cls.process.pid)
-
-    def test_mmlu(self):
         args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
+            base_url=base_url,
+            model=model,
             eval_name="mmlu",
-            num_examples=20,
-            num_threads=20,
+            num_examples=32,
+            num_threads=32,
         )

-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.5
+        try:
+            metrics = run_eval(args)
+            assert metrics["score"] >= 0.6
+        finally:
+            kill_child_process(process.pid)
+
+    def test_chunked_prefill(self):
+        self.run_mmlu(disable_radix_cache=False)
+
+    def test_chunked_prefill_without_radix_cache(self):
+        self.run_mmlu(disable_radix_cache=True)


 if __name__ == "__main__":
......
import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_child_process
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestEvalAccuracyLarge(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = "http://127.0.0.1:7157"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=300,
            other_args=["--log-level-http", "warning"],
        )

    @classmethod
    def tearDownClass(cls):
        kill_child_process(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=None,
            num_threads=2048,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.70

    def test_human_eval(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="humaneval",
            num_examples=None,
            num_threads=2048,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.65

    def test_mgsm_en(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=None,
            num_threads=2048,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.85


if __name__ == "__main__":
    unittest.main()
@@ -10,7 +10,7 @@ from sglang.test.test_utils import (
 )


-class TestAccuracy(unittest.TestCase):
+class TestEvalAccuracyMini(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -27,12 +27,12 @@ class TestAccuracy(unittest.TestCase):
             base_url=self.base_url,
             model=self.model,
             eval_name="mmlu",
-            num_examples=20,
-            num_threads=20,
+            num_examples=32,
+            num_threads=32,
         )

         metrics = run_eval(args)
-        assert metrics["score"] >= 0.5
+        assert metrics["score"] >= 0.6


 if __name__ == "__main__":
......
+import os
 import unittest
 from types import SimpleNamespace
@@ -55,21 +56,30 @@ class TestServingThroughput(unittest.TestCase):
         kill_child_process(process.pid)

         assert res["completed"] == num_prompts
+        return res

     def test_default(self):
-        self.run_test(
+        res = self.run_test(
             disable_radix_cache=False,
             disable_flashinfer=False,
             chunked_prefill_size=-1,
         )

+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            # A100 performance
+            assert res["output_throughput"] >= 1300
+
     def test_default_without_radix_cache(self):
-        self.run_test(
+        res = self.run_test(
             disable_radix_cache=True,
             disable_flashinfer=False,
             chunked_prefill_size=-1,
         )

+        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+            # A100 performance
+            assert res["output_throughput"] >= 1400
+
     def test_default_without_flashinfer(self):
         self.run_test(
             disable_radix_cache=False,
......
@@ -10,7 +10,7 @@ from sglang.test.test_utils import (
 )


-class TestAccuracy(unittest.TestCase):
+class TestTorchCompile(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -29,12 +29,12 @@ class TestAccuracy(unittest.TestCase):
             base_url=self.base_url,
             model=self.model,
             eval_name="mmlu",
-            num_examples=20,
-            num_threads=20,
+            num_examples=32,
+            num_threads=32,
         )

         metrics = run_eval(args)
-        assert metrics["score"] >= 0.5
+        assert metrics["score"] >= 0.6


 if __name__ == "__main__":
......