ci: Skip broken etcd_ha tests until fixed to unblock unrelated PRs (#4198)

560bb2fc · Ryan McCormick · GitHub · 93ada899 · 560bb2fc · 560bb2fc
Unverified commit 560bb2fc, authored Nov 07, 2025 by Ryan McCormick; committed by GitHub on Nov 08, 2025
3 changed files
--- a/tests/fault_tolerance/etcd_ha/test_sglang.py
+++ b/tests/fault_tolerance/etcd_ha/test_sglang.py
@@ -149,6 +149,7 @@ class DynamoWorkerProcess(ManagedProcess):
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_ha_failover_sglang_aggregated(request, predownload_models):
    """
    Test ETCD High Availability with leader failover using SGLang.
@@ -209,6 +210,7 @@ def test_etcd_ha_failover_sglang_aggregated(request, predownload_models):
 @pytest.mark.gpu_2
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_ha_failover_sglang_disaggregated(
    request, predownload_models, set_ucx_tls_no_mm
 ):
@@ -277,6 +279,7 @@ def test_etcd_ha_failover_sglang_disaggregated(
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_non_ha_shutdown_sglang_aggregated(request, predownload_models):
    """
    Test that frontend and worker shut down when single ETCD node is terminated using SGLang.
@@ -333,6 +336,7 @@ def test_etcd_non_ha_shutdown_sglang_aggregated(request, predownload_models):
 @pytest.mark.gpu_2
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_non_ha_shutdown_sglang_disaggregated(
    request, predownload_models, set_ucx_tls_no_mm
 ):

--- a/tests/fault_tolerance/etcd_ha/test_trtllm.py
+++ b/tests/fault_tolerance/etcd_ha/test_trtllm.py
@@ -135,6 +135,7 @@ class DynamoWorkerProcess(ManagedProcess):
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_ha_failover_trtllm_aggregated(request, predownload_models):
    """
    Test ETCD High Availability with leader failover for TRT-LLM in aggregated mode.
@@ -195,6 +196,7 @@ def test_etcd_ha_failover_trtllm_aggregated(request, predownload_models):
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_ha_failover_trtllm_disaggregated(
    request, predownload_models, set_ucx_tls_no_mm
 ):
@@ -262,6 +264,7 @@ def test_etcd_ha_failover_trtllm_disaggregated(
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_non_ha_shutdown_trtllm_aggregated(request, predownload_models):
    """
    Test that frontend and worker shut down when single ETCD node is terminated for TRT-LLM in aggregated mode.
@@ -321,6 +324,7 @@ def test_etcd_non_ha_shutdown_trtllm_aggregated(request, predownload_models):
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_non_ha_shutdown_trtllm_disaggregated(
    request, predownload_models, set_ucx_tls_no_mm
 ):

--- a/tests/fault_tolerance/etcd_ha/test_vllm.py
+++ b/tests/fault_tolerance/etcd_ha/test_vllm.py
@@ -117,6 +117,7 @@ class DynamoWorkerProcess(ManagedProcess):
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_ha_failover_vllm_aggregated(request, predownload_models):
    """
    Test ETCD High Availability with leader failover.
@@ -175,6 +176,7 @@ def test_etcd_ha_failover_vllm_aggregated(request, predownload_models):
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_ha_failover_vllm_disaggregated(
    request, predownload_models, set_ucx_tls_no_mm
 ):
@@ -239,6 +241,7 @@ def test_etcd_ha_failover_vllm_disaggregated(
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_non_ha_shutdown_vllm_aggregated(request, predownload_models):
    """
    Test that frontend and worker shut down when single ETCD node is terminated.
@@ -293,6 +296,7 @@ def test_etcd_non_ha_shutdown_vllm_aggregated(request, predownload_models):
 @pytest.mark.gpu_1
 @pytest.mark.e2e
 @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
+@pytest.mark.skip(reason="Broken, temporarily disabled")
 def test_etcd_non_ha_shutdown_vllm_disaggregated(
    request, predownload_models, set_ucx_tls_no_mm
 ):