Unverified commit e86b1ccb, authored by Lianmin Zheng and committed by GitHub
Browse files

Enable chunked prefill by default (#1040)

parent 8d2d876f
......@@ -47,8 +47,8 @@ jobs:
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
timeout-minutes: 10
- name: Benchmark Serving Throughput (w/ ChunkedPrefill)
- name: Benchmark Serving Throughput (w/o ChunkedPrefill)
run: |
cd test/srt
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_with_chunked_prefill
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
timeout-minutes: 10
......@@ -49,7 +49,7 @@ class ServerArgs:
max_running_requests: Optional[int] = None
max_num_reqs: Optional[int] = None
max_total_tokens: Optional[int] = None
chunked_prefill_size: int = -1
chunked_prefill_size: int = 8192
max_prefill_tokens: int = 16384
schedule_policy: str = "lpm"
schedule_conservativeness: float = 1.0
......
......@@ -86,11 +86,11 @@ class TestServingThroughput(unittest.TestCase):
# A100 (PCIE) performance
assert res["output_throughput"] > 930
def test_default_with_chunked_prefill(self):
def test_default_without_chunked_prefill(self):
res = self.run_test(
disable_radix_cache=ServerArgs.disable_radix_cache,
disable_flashinfer=ServerArgs.disable_flashinfer,
chunked_prefill_size=8192,
chunked_prefill_size=-1,
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
......
......@@ -71,7 +71,7 @@ class TestServingThroughput(unittest.TestCase):
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE) performance
assert res["output_throughput"] >= 1400
assert res["output_throughput"] > 1400
def test_default_without_radix_cache(self):
res = self.run_test(
......@@ -82,18 +82,18 @@ class TestServingThroughput(unittest.TestCase):
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE) performance
assert res["output_throughput"] >= 1450
assert res["output_throughput"] > 1450
def test_default_with_chunked_prefill(self):
def test_default_without_chunked_prefill(self):
res = self.run_test(
disable_radix_cache=ServerArgs.disable_radix_cache,
disable_flashinfer=ServerArgs.disable_flashinfer,
chunked_prefill_size=8192,
chunked_prefill_size=-1,
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
# A100 (PCIE) performance
assert res["output_throughput"] >= 1400
assert res["output_throughput"] > 1400
def test_all_cases(self):
for disable_radix_cache in [False, True]:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment