Enable chunked prefill by default (#1040)

e86b1ccb · Lianmin Zheng · GitHub · 8d2d876f
Unverified commit e86b1ccb, authored Aug 14, 2024 by Lianmin Zheng; committed by GitHub on Aug 14, 2024.
4 changed files
--- a/.github/workflows/e2e-test.yml
+++ b/.github/workflows/e2e-test.yml
@@ -47,8 +47,8 @@ jobs:
        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
      timeout-minutes: 10

-    - name: Benchmark Serving Throughput (w/ ChunkedPrefill)
+    - name: Benchmark Serving Throughput (w/o ChunkedPrefill)
      run: |
        cd test/srt
-        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_with_chunked_prefill
+        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
      timeout-minutes: 10
--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -49,7 +49,7 @@ class ServerArgs:
    max_running_requests: Optional[int] = None
    max_num_reqs: Optional[int] = None
    max_total_tokens: Optional[int] = None
-    chunked_prefill_size: int = -1
+    chunked_prefill_size: int = 8192
    max_prefill_tokens: int = 16384
    schedule_policy: str = "lpm"
    schedule_conservativeness: float = 1.0

--- a/test/srt/test_moe_serving_throughput.py
+++ b/test/srt/test_moe_serving_throughput.py
@@ -86,11 +86,11 @@ class TestServingThroughput(unittest.TestCase):
            # A100 (PCIE) performance
            assert res["output_throughput"] > 930

-    def test_default_with_chunked_prefill(self):
+    def test_default_without_chunked_prefill(self):
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            disable_flashinfer=ServerArgs.disable_flashinfer,
-            chunked_prefill_size=8192,
+            chunked_prefill_size=-1,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":

--- a/test/srt/test_serving_throughput.py
+++ b/test/srt/test_serving_throughput.py
@@ -71,7 +71,7 @@ class TestServingThroughput(unittest.TestCase):

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE) performance
-            assert res["output_throughput"] >= 1400
+            assert res["output_throughput"] > 1400

    def test_default_without_radix_cache(self):
        res = self.run_test(
@@ -82,18 +82,18 @@ class TestServingThroughput(unittest.TestCase):

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE) performance
-            assert res["output_throughput"] >= 1450
+            assert res["output_throughput"] > 1450

-    def test_default_with_chunked_prefill(self):
+    def test_default_without_chunked_prefill(self):
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            disable_flashinfer=ServerArgs.disable_flashinfer,
-            chunked_prefill_size=8192,
+            chunked_prefill_size=-1,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE) performance
-            assert res["output_throughput"] >= 1400
+            assert res["output_throughput"] > 1400

    def test_all_cases(self):
        for disable_radix_cache in [False, True]: