Unverified Commit c877292c authored by Lianmin Zheng's avatar Lianmin Zheng Committed by GitHub
Browse files

Re-organize CI tests (#1052)

parent 0c1c72a0
...@@ -45,8 +45,7 @@ jobs: ...@@ -45,8 +45,7 @@ jobs:
cd test/srt cd test/srt
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
- name: Benchmark Serving Throughput (w/o FlashInfer) - name: Benchmark Serving Throughput (w/ ChunkedPrefill)
run: | run: |
cd test/srt cd test/srt
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_flashinfer python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_with_chunked_prefill
...@@ -54,7 +54,7 @@ class BaseToolCache: ...@@ -54,7 +54,7 @@ class BaseToolCache:
return val return val
def init_value(self, key): def init_value(self, key):
raise NotImplementedError raise NotImplementedError()
def get_cache_hit_rate(self): def get_cache_hit_rate(self):
if self.metrics["total"] == 0: if self.metrics["total"] == 0:
......
...@@ -410,6 +410,7 @@ class ModelTpServer: ...@@ -410,6 +410,7 @@ class ModelTpServer:
# Print stats # Print stats
if self.tp_rank == 0: if self.tp_rank == 0:
if isinstance(self.tree_cache, RadixCache):
self.tree_cache_metrics["total"] += ( self.tree_cache_metrics["total"] += (
adder.log_input_tokens + adder.log_hit_tokens adder.log_input_tokens + adder.log_hit_tokens
) / 10**9 ) / 10**9
...@@ -417,6 +418,8 @@ class ModelTpServer: ...@@ -417,6 +418,8 @@ class ModelTpServer:
tree_cache_hit_rate = ( tree_cache_hit_rate = (
self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"] self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
) )
else:
tree_cache_hit_rate = 0.0
logger.info( logger.info(
f"[gpu={self.gpu_id}] Prefill batch. " f"[gpu={self.gpu_id}] Prefill batch. "
f"#new-seq: {len(can_run_list)}, " f"#new-seq: {len(can_run_list)}, "
......
...@@ -68,7 +68,7 @@ class ChunkCache(BasePrefixCache): ...@@ -68,7 +68,7 @@ class ChunkCache(BasePrefixCache):
req.last_node = entry req.last_node = entry
def insert(self): def insert(self):
raise NotImplementedError raise NotImplementedError()
def evict(self, num_tokens: int, evict_callback: Callable): def evict(self, num_tokens: int, evict_callback: Callable):
pass pass
......
...@@ -447,6 +447,15 @@ def _wait_and_warmup(server_args, pipe_finish_writer): ...@@ -447,6 +447,15 @@ def _wait_and_warmup(server_args, pipe_finish_writer):
print(f"Initialization failed. warmup error: {last_traceback}", flush=True) print(f"Initialization failed. warmup error: {last_traceback}", flush=True)
sys.exit(1) sys.exit(1)
# Print warnings here
if server_args.disable_radix_cache and server_args.chunked_prefill_size is not None:
logger.warning(
"You set both `--disable-radix-cache` and `--chunked-prefill-size`. "
"This combination is an experimental feature and we noticed it can lead to "
"wrong generation results. If you want to use chunked prefill, it is recommended "
"not using `--disable-radix-cache`."
)
logger.info("The server is fired up and ready to roll!") logger.info("The server is fired up and ready to roll!")
if pipe_finish_writer is not None: if pipe_finish_writer is not None:
pipe_finish_writer.send("init ok") pipe_finish_writer.send("init ok")
......
...@@ -12,6 +12,7 @@ suites = { ...@@ -12,6 +12,7 @@ suites = {
"test_openai_server.py", "test_openai_server.py",
"test_skip_tokenizer_init.py", "test_skip_tokenizer_init.py",
"test_torch_compile.py", "test_torch_compile.py",
"test_triton_attn_backend.py",
"test_vision_openai_server.py", "test_vision_openai_server.py",
"test_large_max_new_tokens.py", "test_large_max_new_tokens.py",
"models/test_generation_models.py", "models/test_generation_models.py",
......
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_child_process
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_URL_FOR_TEST,
popen_launch_server,
)
class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
    """Large accuracy evals (MMLU, HumanEval, MGSM-EN) against a server
    launched with a small chunked-prefill size (256) to exercise the
    chunked-prefill code path.
    """

    @classmethod
    def setUpClass(cls):
        # Launch one shared server for all eval methods in this class.
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = "http://127.0.0.1:7157"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=300,
            other_args=["--log-level-http", "warning", "--chunked-prefill-size", "256"],
        )

    @classmethod
    def tearDownClass(cls):
        # Tear down the shared server process tree.
        kill_child_process(cls.process.pid)

    def _run_eval_and_check(self, eval_name, num_examples, min_score):
        # Run a single benchmark against the shared server and enforce a
        # minimum accuracy floor; the full metrics dict is shown on failure.
        eval_args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name=eval_name,
            num_examples=num_examples,
            num_threads=1024,
        )
        metrics = run_eval(eval_args)
        assert metrics["score"] >= min_score, f"{metrics}"

    def test_mmlu(self):
        self._run_eval_and_check("mmlu", 3000, 0.71)

    def test_human_eval(self):
        self._run_eval_and_check("humaneval", None, 0.65)

    def test_mgsm_en(self):
        self._run_eval_and_check("mgsm_en", None, 0.85)


if __name__ == "__main__":
    unittest.main()
...@@ -3,6 +3,7 @@ import unittest ...@@ -3,6 +3,7 @@ import unittest
from types import SimpleNamespace from types import SimpleNamespace
from sglang.bench_serving import run_benchmark from sglang.bench_serving import run_benchmark
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import kill_child_process from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
...@@ -60,9 +61,9 @@ class TestServingThroughput(unittest.TestCase): ...@@ -60,9 +61,9 @@ class TestServingThroughput(unittest.TestCase):
def test_default(self): def test_default(self):
res = self.run_test( res = self.run_test(
disable_radix_cache=False, disable_radix_cache=ServerArgs.disable_radix_cache,
disable_flashinfer=False, disable_flashinfer=ServerArgs.disable_flashinfer,
chunked_prefill_size=-1, chunked_prefill_size=ServerArgs.chunked_prefill_size,
) )
if os.getenv("SGLANG_IS_IN_CI", "false") == "true": if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
...@@ -72,21 +73,25 @@ class TestServingThroughput(unittest.TestCase): ...@@ -72,21 +73,25 @@ class TestServingThroughput(unittest.TestCase):
def test_default_without_radix_cache(self): def test_default_without_radix_cache(self):
res = self.run_test( res = self.run_test(
disable_radix_cache=True, disable_radix_cache=True,
disable_flashinfer=False, disable_flashinfer=ServerArgs.disable_flashinfer,
chunked_prefill_size=-1, chunked_prefill_size=ServerArgs.chunked_prefill_size,
) )
if os.getenv("SGLANG_IS_IN_CI", "false") == "true": if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE) performance # A100 (PCIE) performance
assert res["output_throughput"] >= 1450 assert res["output_throughput"] >= 1450
def test_default_without_flashinfer(self): def test_default_with_chunked_prefill(self):
self.run_test( res = self.run_test(
disable_radix_cache=False, disable_radix_cache=ServerArgs.disable_radix_cache,
disable_flashinfer=True, disable_flashinfer=ServerArgs.disable_flashinfer,
chunked_prefill_size=-1, chunked_prefill_size=8192,
) )
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE) performance
assert res["output_throughput"] >= 1400
def test_all_cases(self): def test_all_cases(self):
for disable_radix_cache in [False, True]: for disable_radix_cache in [False, True]:
for disable_flashinfer in [False, True]: for disable_flashinfer in [False, True]:
......
import unittest
from types import SimpleNamespace
from sglang.srt.utils import kill_child_process
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_URL_FOR_TEST,
popen_launch_server,
)
class TestTritonAttnBackend(unittest.TestCase):
    """Smoke-test MMLU accuracy on a server launched with FlashInfer
    disabled, so the Triton attention backend is exercised.
    """

    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        # --disable-flashinfer selects the Triton attention kernels.
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=300, other_args=["--disable-flashinfer"]
        )

    @classmethod
    def tearDownClass(cls):
        # Kill the server process tree spawned in setUpClass.
        kill_child_process(cls.process.pid)

    def test_mmlu(self):
        # Small MMLU subset as a fast accuracy sanity check.
        eval_args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=32,
            num_threads=32,
        )
        result = run_eval(eval_args)
        assert result["score"] >= 0.6


if __name__ == "__main__":
    unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment