test: stabilize nightly — skip engine-init failures, convert xfails to skips,...

test: stabilize nightly — skip engine-init failures, convert xfails to skips, fix http URL validation regression (#8443) Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

test: stabilize nightly — skip engine-init failures, convert xfails to skips,...
test: stabilize nightly — skip engine-init failures, convert xfails to skips, fix http URL validation regression (#8443) Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com> Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
cc583b2f · Dmitry Tokarev · GitHub · 9514236c · cc583b2f · cc583b2f
Unverified Commit cc583b2f authored Apr 21, 2026 by Dmitry Tokarev Committed by GitHub Apr 21, 2026
9 changed files
--- a/tests/fault_tolerance/cancellation/test_trtllm.py
+++ b/tests/fault_tolerance/cancellation/test_trtllm.py
@@ -40,7 +40,7 @@ pytestmark = [
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
    pytest.mark.nightly,
    pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
-    pytest.mark.xfail(reason="Cancellation is temporarily disabled", strict=True),
+    pytest.mark.skip(reason="Cancellation is temporarily disabled"),
 ]


@@ -473,7 +473,7 @@ def test_request_cancellation_trtllm_prefill_cancel(
                )


-@pytest.mark.xfail(reason="Test fails only on CI", strict=False)
+@pytest.mark.skip(reason="Test fails only on CI")
 @pytest.mark.timeout(195)  # 3x average
 def test_request_cancellation_trtllm_kv_transfer_cancel(
    request, runtime_services_dynamic_ports, predownload_models

--- a/tests/fault_tolerance/cancellation/test_vllm.py
+++ b/tests/fault_tolerance/cancellation/test_vllm.py
@@ -396,6 +396,7 @@ def test_request_cancellation_vllm_decode_cancel(
                )


+@pytest.mark.skip(reason="Nightly CI failure: OPS-4448")
 @pytest.mark.timeout(150)  # 3x average
 @pytest.mark.nightly
 @pytest.mark.gpu_2

--- a/tests/fault_tolerance/migration/test_sglang.py
+++ b/tests/fault_tolerance/migration/test_sglang.py
@@ -235,20 +235,18 @@ def test_request_migration_sglang_aggregated(
        stream: True for streaming, False for non-streaming
    """

-    # TODO(<LINEAR-ID>): Flaky on NATS transport — first-token delay routinely
-    # exceeds the 6s threshold in utils.validate_response. Other parameter
-    # combinations (including the TCP variant) are stable.
+    # OPS-4446: first-token delay routinely exceeds the 6s threshold in
+    # utils.validate_response for this parameter combination. Originally only
+    # the NATS variant failed; once the NATS skip landed, the TCP variant
+    # started failing the same way (it is now the first to bear the cold-start cost).
    if (
        migration_limit == 3
        and migration_max_seq_len is None
        and immediate_kill is True
        and request_api == "chat"
        and stream is True
-        and request.getfixturevalue("request_plane") == "nats"
    ):
-        pytest.skip(
-            "Flaky on NATS transport: first-token delay > 6s threshold. OPS-4446"
-        )
+        pytest.skip("Flaky: first-token delay > 6s threshold. OPS-4446")

    # Step 1: Start the frontend
    with DynamoFrontendProcess(

--- a/tests/fault_tolerance/migration/test_vllm.py
+++ b/tests/fault_tolerance/migration/test_vllm.py
@@ -271,7 +271,7 @@ def test_request_migration_vllm_aggregated(
                )


-@pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported")
+@pytest.mark.skip(reason="Prefill migration not yet supported")
 @pytest.mark.timeout(350)  # 3x average
 @pytest.mark.nightly
 def test_request_migration_vllm_prefill(
@@ -346,8 +346,7 @@ def test_request_migration_vllm_prefill(
                    )


-@pytest.mark.xfail(
-    strict=False,
+@pytest.mark.skip(
    reason=(
        "Migration reuses the same request_id for vLLM, but the prefill worker's "
        "KV cache still holds the request due to delay_free_blocks in disaggregated mode. "
@@ -430,8 +429,7 @@ def test_request_migration_vllm_kv_transfer(
                    )


-@pytest.mark.xfail(
-    strict=False,
+@pytest.mark.skip(
    reason=(
        "Migration reuses the same request_id for vLLM, but the prefill worker's "
        "KV cache still holds the request due to delay_free_blocks in disaggregated mode. "

--- a/tests/gpu_memory_service/test_quiesce_resume.py
+++ b/tests/gpu_memory_service/test_quiesce_resume.py
@@ -131,6 +131,7 @@ def test_gms_basic_quiesce_resume_sglang(
 # ---------------------------------------------------------------------------


+@pytest.mark.skip(reason="Nightly CI failure: https://linear.app/nvidia/issue/OPS-4450")
 @pytest.mark.trtllm
 @pytest.mark.e2e
 @pytest.mark.gpu_1
@@ -177,6 +178,7 @@ def test_gms_basic_quiesce_resume_trtllm(
        )


+@pytest.mark.skip(reason="Nightly CI failure: https://linear.app/nvidia/issue/OPS-4450")
 @pytest.mark.trtllm
 @pytest.mark.e2e
 @pytest.mark.gpu_1

--- a/tests/gpu_memory_service/test_shadow_failover.py
+++ b/tests/gpu_memory_service/test_shadow_failover.py
@@ -302,6 +302,7 @@ def _trtllm_quiesce(
    return ws


+@pytest.mark.skip(reason="Nightly CI failure: https://linear.app/nvidia/issue/OPS-4450")
 @pytest.mark.trtllm
 @pytest.mark.e2e
 @pytest.mark.gpu_1

--- a/tests/serve/multimodal_profiles/vllm.py
+++ b/tests/serve/multimodal_profiles/vllm.py
@@ -29,12 +29,24 @@ VLLM_MULTIMODAL_PROFILES: list[MultimodalModelProfile] = [
                profiled_vram_gib=9.6,
            ),
            "e_pd": TopologyConfig(
-                marks=[pytest.mark.pre_merge],
+                marks=[
+                    pytest.mark.skip(
+                        reason="vLLM engine core init fails on disagg e_pd. "
+                        "https://linear.app/nvidia/issue/OPS-4445"
+                    ),
+                    pytest.mark.pre_merge,
+                ],
                timeout_s=340,
                single_gpu=True,
            ),
            "epd": TopologyConfig(
-                marks=[pytest.mark.pre_merge],
+                marks=[
+                    pytest.mark.skip(
+                        reason="vLLM engine core init fails on disagg epd. "
+                        "https://linear.app/nvidia/issue/OPS-4445"
+                    ),
+                    pytest.mark.pre_merge,
+                ],
                timeout_s=300,
                single_gpu=True,
            ),
@@ -56,7 +68,13 @@ VLLM_MULTIMODAL_PROFILES: list[MultimodalModelProfile] = [
                delayed_start=60,
            ),
            "epd": TopologyConfig(
-                marks=[pytest.mark.pre_merge],
+                marks=[
+                    pytest.mark.skip(
+                        reason="vLLM engine core init fails on disagg epd. "
+                        "https://linear.app/nvidia/issue/OPS-4445"
+                    ),
+                    pytest.mark.pre_merge,
+                ],
                timeout_s=600,
                delayed_start=60,
                single_gpu=True,

--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -138,6 +138,9 @@ trtllm_configs = {
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
        marks=[
+            pytest.mark.skip(
+                reason="Nightly CI failure: https://linear.app/nvidia/issue/OPS-4450"
+            ),
            pytest.mark.gpu_1,  # 1 GPU(s) used, peak 6.6 GiB
            pytest.mark.pre_merge,
            pytest.mark.trtllm,

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -422,6 +422,7 @@ vllm_configs = {
        ],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
+        env={"DYN_MM_ALLOW_INTERNAL": "1"},
        delayed_start=0,
        timeout=360,
        request_payloads=[
@@ -471,6 +472,7 @@ vllm_configs = {
            "--dyn-tool-call-parser",
            "hermes",
        ],
+        env={"DYN_MM_ALLOW_INTERNAL": "1"},
        delayed_start=0,
        timeout=600,
        request_payloads=[