chore: unskip several multi-gpu tests and enable sglang multi-gpu tests (#7443)

Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

chore: unskip several multi-gpu tests and enable sglang multi-gpu tests (#7443)
Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
9681225a · Dmitry Tokarev · GitHub · a800515d · 9681225a · 9681225a
Unverified commit 9681225a, authored Mar 17, 2026 by Dmitry Tokarev; committed by GitHub on Mar 17, 2026.
6 changed files
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -222,7 +222,9 @@ jobs:
      cpu_only_test_markers: 'pre_merge and sglang and gpu_0'
      run_single_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' }}
      single_gpu_test_markers: 'pre_merge and sglang and gpu_1'
-      run_multi_gpu_tests: false  # all sglang multi-GPU tests are currently skipped; re-enable when fixed
+      run_multi_gpu_tests: ${{ needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true' }}
+      multi_gpu_test_markers: 'pre_merge and sglang and (gpu_2 or gpu_4)'
+      multi_gpu_test_timeout_minutes: 60
    secrets: inherit
  # ============================================================================

--- a/tests/fault_tolerance/cancellation/test_sglang.py
+++ b/tests/fault_tolerance/cancellation/test_sglang.py
@@ -33,7 +33,6 @@ pytestmark = [
    pytest.mark.sglang,
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
-    pytest.mark.nightly,
    pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
 ]
@@ -187,6 +186,7 @@ class DynamoWorkerProcess(ManagedProcess):
 @pytest.mark.timeout(160)  # 3x average
 @pytest.mark.gpu_1
 @pytest.mark.skip(reason="DYN-2265")
+@pytest.mark.nightly
 def test_request_cancellation_sglang_aggregated(
    request, runtime_services_dynamic_ports, predownload_models
 ):
@@ -292,8 +292,9 @@ def test_request_cancellation_sglang_aggregated(
                logger.info(f"{description} detected successfully")
-@pytest.mark.timeout(185)  # 3x average
+@pytest.mark.timeout(300)  # 3x average
 @pytest.mark.gpu_2
+@pytest.mark.pre_merge
 def test_request_cancellation_sglang_decode_cancel(
    request, runtime_services_dynamic_ports, predownload_models
 ):

--- a/tests/router/test_router_e2e_with_sglang.py
+++ b/tests/router/test_router_e2e_with_sglang.py
@@ -425,7 +425,9 @@ def test_router_decisions_sglang_multiple_workers(
 @pytest.mark.pre_merge
 @pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
 @pytest.mark.timeout(600)  # 10 min max (multi-GPU + DP startup variance)
-@pytest.mark.skip(reason="DYN-2265")
+@pytest.mark.skip(
+    reason="DYN-2265"
+)  # Currently fails probably due to SGLang startup issues when multiple workers on same GPU; re-enable when fixed
 def test_router_decisions_sglang_dp(
    request,
    runtime_services_dynamic_ports,

--- a/tests/serve/lora_utils.py
+++ b/tests/serve/lora_utils.py
@@ -237,6 +237,12 @@ class MinioService:
            f"Downloading LoRA {self.config.lora_repo} to {self._temp_download_dir}"
        )
+        # Run with HF_HUB_OFFLINE unset so the download works even when
+        # the predownload_models fixture has already enabled offline mode.
+        # This only affects the subprocess env; the parent process is unchanged.
+        env = os.environ.copy()
+        env.pop("HF_HUB_OFFLINE", None)
        result = subprocess.run(
            [
                "huggingface-cli",
@@ -249,6 +255,7 @@ class MinioService:
            ],
            capture_output=True,
            text=True,
+            env=env,
        )
        if result.returncode != 0:

--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -76,7 +76,6 @@ sglang_configs = {
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
-            pytest.mark.skip(reason="DYN-2265"),
        ],
        model="Qwen/Qwen3-0.6B",
        env={},
@@ -127,7 +126,6 @@ sglang_configs = {
        marks=[
            pytest.mark.gpu_2,
            pytest.mark.pre_merge,
-            pytest.mark.skip(reason="DYN-2265"),
        ],
        model="Qwen/Qwen3-0.6B",
        env={
@@ -137,7 +135,7 @@ sglang_configs = {
        request_payloads=[
            chat_payload_default(
                expected_log=[
-                    r"ZMQ listener .* received batch with \d+ events \(seq=\d+(?:, [^)]*)?\)",
+                    r"ZMQ listener .* received batch with \d+ events \(engine_seq=\d+(?:, [^)]*)?\)",
                    r"Event processor for worker_id \d+ processing event: Stored\(",
                    r"Selected worker: worker_type=\w+, worker_id=\d+ dp_rank=.*?, logit: ",
                ]

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -904,7 +904,6 @@ def test_lora_aggregated(
 @pytest.mark.timeout(600)
 @pytest.mark.pre_merge
 @pytest.mark.parametrize("num_system_ports", [2], indirect=True)
-@pytest.mark.skip(reason="DYN-2265")
 def test_lora_aggregated_router(
    request,
    runtime_services_dynamic_ports,