[CI] Reorganize the 8 gpu tests (#6192)

03227c5f · Lianmin Zheng · GitHub · 01bdbf7f · 03227c5f · 03227c5f
Unverified commit 03227c5f, authored May 11, 2025 by Lianmin Zheng; committed by GitHub on May 11, 2025.
6 changed files
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -92,7 +92,7 @@ jobs:
  unittest-test-backend-8-gpu:
    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
        github.event.pull_request.draft == false
-    needs: [unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu]
+    needs: [unit-test-frontend, unit-test-backend-2-gpu]
    runs-on: 8-gpu-runner
    steps:
      - name: Checkout code
@@ -271,24 +271,6 @@ jobs:
          cd test/srt
          python3 test_moe_eval_accuracy_large.py

-  unit-test-backend-pd:
-    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
-        github.event.pull_request.draft == false
-    runs-on: 8-gpu-runner
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Install dependencies
-        run: |
-          bash scripts/ci_install_dependency.sh
-
-      - name: Run test
-        timeout-minutes: 10
-        run: |
-          cd test/srt
-          python3 -m unittest test_disaggregation.TestDisaggregationMooncake.test_gsm8k
-
  finish:
    if: always()
    needs: [

--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -305,6 +305,12 @@ class ServerArgs:
        if self.grammar_backend is None:
            self.grammar_backend = "xgrammar"

+        if self.pp_size > 1:
+            self.disable_overlap_schedule = True
+            logger.warning(
+                "Overlap scheduler is disabled because of using pipeline parallelism."
+            )
+
        # Data parallelism attention
        if self.enable_dp_attention:
            self.schedule_conservativeness = self.schedule_conservativeness * 0.3

--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
@@ -5,25 +5,22 @@ set -euxo pipefail
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 bash "${SCRIPT_DIR}/killall_sglang.sh"

+# Update pip
+pip install --upgrade pip
+
 # Clean up existing installations
-pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm || true
+pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm
 pip cache purge
 rm -rf /root/.cache/flashinfer
 rm -rf /usr/local/lib/python3.10/dist-packages/flashinfer*
 rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*

-# Update pip
-pip install --upgrade pip
-
-# Install sgl-kernel
-pip install sgl-kernel==0.1.2.post1 --no-cache-dir
-
 # Install the main package
 pip install -e "python[all]"

 # Install additional dependencies
 pip install torch_memory_saver
-pip install transformers==4.51.0 sentence_transformers accelerate peft pandas datasets timm torchaudio==2.6.0
+pip install transformers==4.51.0 timm torchaudio==2.6.0 sentence_transformers accelerate peft pandas datasets mooncake-transfer-engine

 # For compiling xgrammar kernels
 pip install cuda-python nvidia-cuda-nvrtc-cu12

--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -85,9 +85,6 @@ suites = {
        TestFile("test_w8a8_quantization.py", 46),
        TestFile("models/lora/test_lora_cuda_graph.py", 250),
    ],
-    "per-commit-pd": [
-        TestFile("test_disaggregation.py", 90),
-    ],
    "per-commit-2-gpu": [
        TestFile("models/lora/test_lora_tp.py", 116),
        TestFile("test_data_parallelism.py", 73),
@@ -105,6 +102,7 @@ suites = {
        # TestFile("test_deepep_low_latency.py", 50),
        # TestFile("test_moe_deepep_eval_accuracy_large.py", 250),
        TestFile("test_local_attn.py", 250),
+        TestFile("test_disaggregation.py", 90),
        TestFile("test_full_deepseek_v3.py", 250),
        TestFile("test_pp_single_node.py", 150),
    ],

--- a/test/srt/test_disaggregation.py
+++ b/test/srt/test_disaggregation.py
 import subprocess
-import threading
 import time
 import unittest
 from types import SimpleNamespace

 import requests
-import torch

 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k

--- a/test/srt/test_pp_single_node.py
+++ b/test/srt/test_pp_single_node.py
@@ -9,13 +9,10 @@ import time
 import unittest
 from types import SimpleNamespace

-import requests
-
 from sglang.bench_one_batch_server import BenchArgs as OneBatchBenchArgs
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import kill_process_tree
 from sglang.test.few_shot_gsm8k import run_eval
-from sglang.test.runners import DEFAULT_PROMPTS
 from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -28,17 +25,16 @@ from sglang.test.test_utils import (
 class TestPPAccuracy(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
-        # These config helps find a leak.
-        os.environ["SGLANG_IS_IN_CI"] = "1"
        cls.base_url = "http://127.0.0.1:23333"
        cls.process = popen_launch_server(
            DEFAULT_MODEL_NAME_FOR_TEST,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
+                "--tp-size",
+                2,
                "--pp-size",
                4,
-                "--disable-overlap-schedule",
                "--chunked-prefill-size",
                256,
            ],
@@ -66,49 +62,6 @@ class TestPPAccuracy(unittest.TestCase):
        time.sleep(5)


-# class TestPPAccuracyFlashInfer(unittest.TestCase):
-#     @classmethod
-#     def setUpClass(cls):
-#         # These config helps find a leak.
-#         os.environ["SGLANG_IS_IN_CI"] = "1"
-#         cls.base_url = "http://127.0.0.1:23333"
-#         cls.process = popen_launch_server(
-#             DEFAULT_MODEL_NAME_FOR_TEST,
-#             cls.base_url,
-#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-#             other_args=[
-#                 "--pp-size",
-#                 4,
-#                 "--disable-overlap-schedule",
-#                 "--attention-backend",
-#                 "flashinfer",
-#                 "--chunked-prefill-size",
-#                 256,
-#             ],
-#         )
-#
-#     @classmethod
-#     def tearDownClass(cls):
-#         kill_process_tree(cls.process.pid)
-#
-#     def test_gsm8k(self):
-#         args = SimpleNamespace(
-#             num_shots=5,
-#             data_path=None,
-#             num_questions=200,
-#             max_new_tokens=512,
-#             parallel=128,
-#             host="http://127.0.0.1",
-#             port=int(self.base_url.split(":")[-1]),
-#         )
-#         metrics = run_eval(args)
-#         print(f"{metrics=}")
-#
-#         self.assertGreater(metrics["accuracy"], 0.75)
-#         # Wait a little bit so that the memory check happens.
-#         time.sleep(5)
-
-
 class TestFixedBugs(unittest.TestCase):
    def test_chunked_prefill_with_small_bs(self):
        model = DEFAULT_MODEL_NAME_FOR_TEST
@@ -124,7 +77,6 @@ class TestFixedBugs(unittest.TestCase):
            2,
            "--pp-size",
            2,
-            "--disable-overlap-schedule",
            "--chunked-prefill",
            256,
            "--max-running-requests",