Unverified Commit 0c1c72a0 authored by Lianmin Zheng, committed by GitHub

Fix accuracy test (#1051)

parent 41598e0d
@@ -16,6 +16,8 @@ from sglang.test.simple_eval_common import (

 def run_eval(args):
+    set_ulimit()
+
     if "OPENAI_API_KEY" not in os.environ:
         os.environ["OPENAI_API_KEY"] = "EMPTY"

@@ -117,7 +119,6 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
-    set_ulimit()
     args = parser.parse_args()

     run_eval(args)
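Moving set_ulimit() from the __main__ block into run_eval() means the limit is raised even when run_eval is imported and called programmatically, as the CI accuracy tests below do, not only when the script is run from the command line. As a rough sketch of what such a helper typically does (assuming it raises the soft RLIMIT_NOFILE via the standard resource module; the actual helper lives in sglang's utilities and may differ in details):

    import resource

    def set_ulimit(target_soft_limit: int = 65535) -> None:
        # Raise the soft limit on open file descriptors so that thousands of
        # concurrent eval threads/connections do not hit "Too many open files".
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        if soft < target_soft_limit:
            try:
                resource.setrlimit(resource.RLIMIT_NOFILE, (target_soft_limit, hard))
            except ValueError as e:
                # The hard limit may be lower than the target on some systems.
                print(f"Failed to raise RLIMIT_NOFILE: {e}")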
@@ -6,21 +6,15 @@ Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de
 https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
 """

-import json
-import logging
-import multiprocessing
 import random
 import re
-from collections import Counter, defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from io import BytesIO
-from typing import Any, Dict, List, Tuple
+from typing import Dict, List

-import blobfile as bf
 import tqdm

 try:
-    from human_eval.data import HUMAN_EVAL, read_problems
+    from human_eval.data import read_problems
     from human_eval.evaluation import estimate_pass_at_k
     from human_eval.execution import check_correctness  # , unsafe_execute
 except (ImportError, ModuleNotFoundError):
...
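The HumanEval module now imports only what it actually uses from OpenAI's human-eval package. For reference, a minimal, self-contained sketch of how those three helpers fit together; this illustrates the upstream human-eval API rather than sglang's own eval loop, and the dummy completion below is only a placeholder:

    from human_eval.data import read_problems
    from human_eval.evaluation import estimate_pass_at_k
    from human_eval.execution import check_correctness

    problems = read_problems()  # dict: task_id -> problem spec (prompt, tests, ...)
    task_id, problem = next(iter(problems.items()))
    completion = "    return 0\n"  # dummy model completion for the prompt

    # Run the completion against the problem's unit tests in a subprocess;
    # result["passed"] is True if all tests pass within the timeout.
    result = check_correctness(problem, completion, timeout=10.0)

    # pass@k from per-task sample/correct counts (here: 1 sample, 0 or 1 correct).
    pass_at_1 = estimate_pass_at_k(
        num_samples=[1], num_correct=[int(result["passed"])], k=1
    ).mean()
    print(task_id, result["passed"], pass_at_1)

Note that the upstream human-eval package ships with the sandboxed exec call disabled by default; it has to be enabled per its README before check_correctness reports any passes.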
@@ -32,12 +32,12 @@ class TestEvalAccuracyLarge(unittest.TestCase):
             base_url=self.base_url,
             model=self.model,
             eval_name="mmlu",
-            num_examples=None,
-            num_threads=2048,
+            num_examples=3000,
+            num_threads=1024,
         )

         metrics = run_eval(args)
-        assert metrics["score"] >= 0.70
+        assert metrics["score"] >= 0.71, f"{metrics}"

     def test_human_eval(self):
         args = SimpleNamespace(
@@ -45,11 +45,11 @@ class TestEvalAccuracyLarge(unittest.TestCase):
             model=self.model,
             eval_name="humaneval",
             num_examples=None,
-            num_threads=2048,
+            num_threads=1024,
         )

         metrics = run_eval(args)
-        assert metrics["score"] >= 0.65
+        assert metrics["score"] >= 0.65, f"{metrics}"

     def test_mgsm_en(self):
         args = SimpleNamespace(
@@ -57,11 +57,11 @@ class TestEvalAccuracyLarge(unittest.TestCase):
             model=self.model,
             eval_name="mgsm_en",
             num_examples=None,
-            num_threads=2048,
+            num_threads=1024,
         )

         metrics = run_eval(args)
-        assert metrics["score"] >= 0.85, f"{metrics}"

 if __name__ == "__main__":
...
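With the new values, the MMLU check samples 3000 examples with 1024 client threads and requires a score of at least 0.71, printing the full metrics dict on failure. The same eval can be driven outside the unittest; a sketch, assuming run_eval is importable from sglang.test.run_eval as these tests do, a server is already listening on base_url, and the model name below is only a placeholder:

    from types import SimpleNamespace
    from sglang.test.run_eval import run_eval  # assumed import path, as in this diff

    args = SimpleNamespace(
        base_url="http://127.0.0.1:30000",              # assumed local sglang server
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder model name
        eval_name="mmlu",
        num_examples=3000,
        num_threads=1024,
    )
    metrics = run_eval(args)
    print(metrics["score"])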
@@ -66,8 +66,8 @@ class TestServingThroughput(unittest.TestCase):
         )

         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 performance
-            assert res["output_throughput"] >= 1300
+            # A100 (PCIE) performance
+            assert res["output_throughput"] >= 1400

     def test_default_without_radix_cache(self):
         res = self.run_test(
@@ -77,8 +77,8 @@ class TestServingThroughput(unittest.TestCase):
         )

         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 performance
-            assert res["output_throughput"] >= 1400
+            # A100 (PCIE) performance
+            assert res["output_throughput"] >= 1450

     def test_default_without_flashinfer(self):
         self.run_test(
...