chore: Move some fault tolerance tests to nightly (#6636)

Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>

chore: Move some fault tolerance tests to nightly (#6636)
Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
Commit f9d57094 (unverified) · authored by Dmitry Tokarev on Feb 26, 2026 · committed by GitHub on Feb 26, 2026 · also referenced: a671dbd4
6 changed files
--- a/tests/fault_tolerance/cancellation/test_sglang.py
+++ b/tests/fault_tolerance/cancellation/test_sglang.py
@@ -33,7 +33,7 @@ pytestmark = [
    pytest.mark.sglang,
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
-    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
+    pytest.mark.nightly,
    pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
 ]


--- a/tests/fault_tolerance/cancellation/test_trtllm.py
+++ b/tests/fault_tolerance/cancellation/test_trtllm.py
@@ -36,7 +36,7 @@ pytestmark = [
    pytest.mark.gpu_1,
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
-    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
+    pytest.mark.nightly,
    pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
    pytest.mark.xfail(reason="Cancellation is temporarily disabled", strict=True),
 ]

--- a/tests/fault_tolerance/cancellation/test_vllm.py
+++ b/tests/fault_tolerance/cancellation/test_vllm.py
@@ -35,7 +35,6 @@ pytestmark = [
    pytest.mark.gpu_1,
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
-    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
    pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
 ]

@@ -179,6 +178,7 @@ class DynamoWorkerProcess(ManagedProcess):


 @pytest.mark.timeout(110)  # 3x average
+@pytest.mark.post_merge
 def test_request_cancellation_vllm_aggregated(
    request, runtime_services_dynamic_ports, predownload_models
 ):
@@ -260,6 +260,7 @@ def test_request_cancellation_vllm_aggregated(


 @pytest.mark.timeout(150)  # 3x average
+@pytest.mark.nightly
 def test_request_cancellation_vllm_decode_cancel(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):
@@ -341,6 +342,7 @@ def test_request_cancellation_vllm_decode_cancel(


 @pytest.mark.timeout(150)  # 3x average
+@pytest.mark.nightly
 def test_request_cancellation_vllm_prefill_cancel(
    request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
 ):

--- a/tests/fault_tolerance/migration/test_sglang.py
+++ b/tests/fault_tolerance/migration/test_sglang.py
@@ -31,7 +31,6 @@ pytestmark = [
    pytest.mark.gpu_1,
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
-    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
    pytest.mark.parametrize(
        "migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"]
    ),
@@ -211,6 +210,7 @@ class DynamoWorkerProcess(ManagedProcess):


 @pytest.mark.timeout(230)  # 3x average
+@pytest.mark.post_merge
 def test_request_migration_sglang_aggregated(
    request,
    runtime_services_dynamic_ports,
@@ -262,6 +262,7 @@ def test_request_migration_sglang_aggregated(
 @pytest.mark.skip(reason="Cannot reliably migrate at Prefill that finish < 1 ms")
 @pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported")
 @pytest.mark.timeout(230)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_sglang_prefill(
    request,
    runtime_services_dynamic_ports,
@@ -330,6 +331,7 @@ def test_request_migration_sglang_prefill(

 @pytest.mark.skip(reason="KV cache transfer may fail")
 @pytest.mark.timeout(230)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_sglang_kv_transfer(
    request,
    runtime_services_dynamic_ports,
@@ -397,6 +399,7 @@ def test_request_migration_sglang_kv_transfer(


 @pytest.mark.timeout(230)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_sglang_decode(
    request,
    runtime_services_dynamic_ports,

--- a/tests/fault_tolerance/migration/test_trtllm.py
+++ b/tests/fault_tolerance/migration/test_trtllm.py
@@ -31,7 +31,6 @@ pytestmark = [
    pytest.mark.gpu_1,
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
-    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
    pytest.mark.parametrize(
        "migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"]
    ),
@@ -198,7 +197,8 @@ class DynamoWorkerProcess(ManagedProcess):
        return False


 @pytest.mark.timeout(290)  # 3x average
+@pytest.mark.post_merge
 def test_request_migration_trtllm_aggregated(
    request,
    runtime_services_dynamic_ports,
@@ -249,6 +249,7 @@ def test_request_migration_trtllm_aggregated(

 @pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported")
 @pytest.mark.timeout(350)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_trtllm_prefill(
    request,
    runtime_services_dynamic_ports,
@@ -317,6 +318,7 @@ def test_request_migration_trtllm_prefill(

 @pytest.mark.skip(reason="Decode worker can get stuck downloading kv cache")
 @pytest.mark.timeout(350)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_trtllm_kv_transfer(
    request,
    runtime_services_dynamic_ports,
@@ -384,6 +386,7 @@ def test_request_migration_trtllm_kv_transfer(


 @pytest.mark.timeout(350)  # 3x average
+@pytest.mark.post_merge
 def test_request_migration_trtllm_decode(
    request,
    runtime_services_dynamic_ports,

--- a/tests/fault_tolerance/migration/test_vllm.py
+++ b/tests/fault_tolerance/migration/test_vllm.py
@@ -32,7 +32,6 @@ pytestmark = [
    pytest.mark.gpu_1,
    pytest.mark.e2e,
    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
-    pytest.mark.post_merge,  # post_merge to pinpoint failure commit
    pytest.mark.parametrize(
        "migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"]
    ),
@@ -208,6 +207,7 @@ class DynamoWorkerProcess(ManagedProcess):


 @pytest.mark.timeout(290)  # 3x average
+@pytest.mark.post_merge
 def test_request_migration_vllm_aggregated(
    request,
    runtime_services_dynamic_ports,
@@ -258,6 +258,7 @@ def test_request_migration_vllm_aggregated(

 @pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported")
 @pytest.mark.timeout(350)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_vllm_prefill(
    request,
    runtime_services_dynamic_ports,
@@ -335,6 +336,7 @@ def test_request_migration_vllm_prefill(
    ),
 )
 @pytest.mark.timeout(350)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_vllm_kv_transfer(
    request,
    runtime_services_dynamic_ports,
@@ -412,6 +414,7 @@ def test_request_migration_vllm_kv_transfer(
    ),
 )
 @pytest.mark.timeout(350)  # 3x average
+@pytest.mark.nightly
 def test_request_migration_vllm_decode(
    request,
    runtime_services_dynamic_ports,