test: remove xfail from tcp tests after automatic port assignments (#4916)

44f57a22 · Biswa Panda · GitHub · aa1bc3c5 · 44f57a22 · 44f57a22
Unverified commit 44f57a22, authored Dec 11, 2025 by Biswa Panda; committed by GitHub Dec 11, 2025
6 changed files
--- a/tests/fault_tolerance/cancellation/test_sglang.py
+++ b/tests/fault_tolerance/cancellation/test_sglang.py
@@ -33,6 +33,7 @@ pytestmark = [
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
+    pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
 ]
@@ -185,7 +186,6 @@ class DynamoWorkerProcess(ManagedProcess):
 @pytest.mark.timeout(160)  # 3x average
 @pytest.mark.gpu_1
 @pytest.mark.xfail(strict=False)
-@pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True)
 def test_request_cancellation_sglang_aggregated(
    request, runtime_services_dynamic_ports, predownload_models
 ):
@@ -293,17 +293,6 @@ def test_request_cancellation_sglang_aggregated(
 @pytest.mark.timeout(185)  # 3x average
 @pytest.mark.gpu_2
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_request_cancellation_sglang_decode_cancel(
    request, runtime_services_dynamic_ports, predownload_models
 ):

--- a/tests/fault_tolerance/cancellation/test_trtllm.py
+++ b/tests/fault_tolerance/cancellation/test_trtllm.py
@@ -36,6 +36,7 @@ pytestmark = [
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
+    pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
 ]
@@ -164,7 +165,6 @@ class DynamoWorkerProcess(ManagedProcess):
 @pytest.mark.timeout(140)  # 3x average
-@pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True)
 def test_request_cancellation_trtllm_aggregated(
    request, runtime_services_dynamic_ports, predownload_models
 ):
@@ -252,17 +252,6 @@ def test_request_cancellation_trtllm_aggregated(
 @pytest.mark.timeout(350)  # 3x average
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_request_cancellation_trtllm_decode_cancel(
    request, runtime_services_dynamic_ports, predownload_models
 ):
@@ -347,17 +336,6 @@ def test_request_cancellation_trtllm_decode_cancel(
 @pytest.mark.timeout(350)  # 3x average
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_request_cancellation_trtllm_prefill_cancel(
    request, runtime_services_dynamic_ports, predownload_models
 ):
@@ -450,7 +428,6 @@ def test_request_cancellation_trtllm_prefill_cancel(
 @pytest.mark.timeout(350)  # 3x average
-@pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True)
 @pytest.mark.xfail(
    reason="May fail due to unknown reason with TRT-LLM or backend implementation",
    strict=False,

--- a/tests/fault_tolerance/cancellation/test_vllm.py
+++ b/tests/fault_tolerance/cancellation/test_vllm.py
@@ -34,6 +34,7 @@ pytestmark = [
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
+    pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
 ]
@@ -166,7 +167,6 @@ class DynamoWorkerProcess(ManagedProcess):
 @pytest.mark.timeout(110)  # 3x average
-@pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True)
 def test_request_cancellation_vllm_aggregated(
    request, runtime_services_dynamic_ports, predownload_models
 ):
@@ -248,17 +248,6 @@ def test_request_cancellation_vllm_aggregated(
 @pytest.mark.timeout(150)  # 3x average
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_request_cancellation_vllm_decode_cancel(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):
@@ -340,17 +329,6 @@ def test_request_cancellation_vllm_decode_cancel(
 @pytest.mark.timeout(150)  # 3x average
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_request_cancellation_vllm_prefill_cancel(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):

--- a/tests/fault_tolerance/migration/test_sglang.py
+++ b/tests/fault_tolerance/migration/test_sglang.py
@@ -38,6 +38,7 @@ pytestmark = [
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
+    pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
 ]
@@ -139,17 +140,6 @@ class DynamoWorkerProcess(ManagedProcess):
 @pytest.mark.timeout(235)  # 3x average
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_request_migration_sglang_worker_failure(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):
@@ -216,17 +206,6 @@ def test_request_migration_sglang_worker_failure(
 @pytest.mark.timeout(235)  # 3x average
 @pytest.mark.skip(reason="SGLang graceful shutdown not yet implemented")
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_request_migration_sglang_graceful_shutdown(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):
@@ -296,17 +275,6 @@ def test_request_migration_sglang_graceful_shutdown(
 @pytest.mark.timeout(135)  # 3x average
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_no_request_migration_sglang_worker_failure(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):
@@ -391,17 +359,6 @@ def test_no_request_migration_sglang_worker_failure(
 @pytest.mark.timeout(135)  # 3x average
 @pytest.mark.skip(reason="SGLang graceful shutdown not yet implemented")
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_no_request_migration_sglang_graceful_shutdown(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):

--- a/tests/fault_tolerance/migration/test_trtllm.py
+++ b/tests/fault_tolerance/migration/test_trtllm.py
@@ -38,6 +38,7 @@ pytestmark = [
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
+    pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
 ]
@@ -137,17 +138,6 @@ class DynamoWorkerProcess(ManagedProcess):
 @pytest.mark.timeout(290)  # 3x average
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_request_migration_trtllm_worker_failure(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):
@@ -201,17 +191,6 @@ def test_request_migration_trtllm_worker_failure(
 @pytest.mark.skip(reason="TRT-LLM graceful shutdown not yet implemented")
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_request_migration_trtllm_graceful_shutdown(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):
@@ -269,17 +248,6 @@ def test_request_migration_trtllm_graceful_shutdown(
 @pytest.mark.timeout(185)  # 3x average
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_no_request_migration_trtllm_worker_failure(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):
@@ -357,17 +325,6 @@ def test_no_request_migration_trtllm_worker_failure(
 @pytest.mark.skip(reason="TRT-LLM graceful shutdown not yet implemented")
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_no_request_migration_trtllm_graceful_shutdown(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):

--- a/tests/fault_tolerance/migration/test_vllm.py
+++ b/tests/fault_tolerance/migration/test_vllm.py
@@ -38,6 +38,7 @@ pytestmark = [
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
+    pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
 ]
@@ -147,17 +148,6 @@ class DynamoWorkerProcess(ManagedProcess):
 @pytest.mark.timeout(290)  # 3x average
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_request_migration_vllm_worker_failure(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):
@@ -211,17 +201,6 @@ def test_request_migration_vllm_worker_failure(
 @pytest.mark.timeout(280)  # 3x average
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_request_migration_vllm_graceful_shutdown(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):
@@ -279,17 +258,6 @@ def test_request_migration_vllm_graceful_shutdown(
 @pytest.mark.timeout(150)  # 3x average
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_no_request_migration_vllm_worker_failure(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):
@@ -361,17 +329,6 @@ def test_no_request_migration_vllm_worker_failure(
 @pytest.mark.timeout(140)  # 3x average
-@pytest.mark.parametrize(
-    "request_plane",
-    [
-        "nats",
-        pytest.param(
-            "tcp",
-            marks=pytest.mark.xfail(reason="Multi-worker TCP unstable", strict=False),
-        ),
-    ],
-    indirect=True,
-)
 def test_no_request_migration_vllm_graceful_shutdown(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):