Unverified Commit 254fd130 authored by Lianmin Zheng's avatar Lianmin Zheng Committed by GitHub
Browse files

[CI] Split test cases in CI for better load balancing (#2180)

parent 538fa0ae
......@@ -6,3 +6,7 @@ pip install --upgrade pip
pip install -e "python[all]"
pip install transformers==4.45.2 sentence_transformers accelerate peft
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
# for compiling eagle kernels
pip install cutex
# for compiling xgrammar kernels
pip install cuda-python nvidia-cuda-nvrtc-cu12
......@@ -17,7 +17,8 @@ suites = {
"test_json_constrained.py",
"test_large_max_new_tokens.py",
"test_metrics.py",
"test_non_overlap_scheduler.py",
"test_no_chunked_prefill.py",
"test_no_overlap_scheduler.py",
"test_openai_server.py",
"test_pytorch_sampling_backend.py",
"test_radix_attention.py",
......
......@@ -4,12 +4,7 @@ python3 -m unittest test_chunked_prefill.TestChunkedPrefill.test_mixed_chunked_p
import unittest
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
run_bench_serving,
run_mmlu_test,
run_mulit_request_test,
)
from sglang.test.test_utils import run_mmlu_test, run_mulit_request_test
class TestChunkedPrefill(unittest.TestCase):
......@@ -25,21 +20,6 @@ class TestChunkedPrefill(unittest.TestCase):
def test_mixed_chunked_prefill_without_radix_cache(self):
run_mmlu_test(disable_radix_cache=True, enable_mixed_chunk=True)
def test_no_chunked_prefill(self):
run_mmlu_test(
disable_radix_cache=False, enable_mixed_chunk=False, chunked_prefill_size=-1
)
def test_no_chunked_prefill_without_radix_cache(self):
res = run_bench_serving(
model=DEFAULT_MODEL_NAME_FOR_TEST,
num_prompts=10,
request_rate=float("inf"),
other_server_args=["--disable-radix-cache", "--chunked-prefill-size", "-1"],
)
assert res["completed"] == 10
def test_mixed_chunked_prefill_multi_requests(self):
run_mulit_request_test(
enable_mixed_chunk=True,
......
import unittest
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
run_bench_serving,
run_mmlu_test,
)
class TestNoChunkedPrefill(unittest.TestCase):
    """Server behavior checks with chunked prefill disabled (size -1)."""

    def test_no_chunked_prefill(self):
        """Run the MMLU accuracy test with chunked prefill turned off."""
        run_mmlu_test(
            disable_radix_cache=False,
            enable_mixed_chunk=False,
            chunked_prefill_size=-1,
        )

    def test_no_chunked_prefill_without_radix_cache(self):
        """Serving benchmark with both radix cache and chunked prefill disabled.

        Verifies that every submitted request completes.
        """
        expected_completed = 10
        result = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=expected_completed,
            request_rate=float("inf"),
            other_server_args=[
                "--disable-radix-cache",
                "--chunked-prefill-size",
                "-1",
            ],
        )
        assert result["completed"] == expected_completed
# Allow running this test file directly: `python3 test_no_chunked_prefill.py`.
if __name__ == "__main__":
    unittest.main()
......@@ -211,7 +211,7 @@ class TestSRTEndpoint(unittest.TestCase):
diff = np.abs(output_logprobs - output_logprobs_score)
max_diff = np.max(diff)
self.assertLess(max_diff, 0.2)
self.assertLess(max_diff, 0.25)
def test_get_server_info(self):
response = requests.get(self.base_url + "/get_server_info")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment