Unverified commit f9d57094, authored by Dmitry Tokarev, committed by GitHub
Browse files

chore: Move some fault tolerance tests to nightly (#6636)


Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
parent a671dbd4
...@@ -33,7 +33,7 @@ pytestmark = [ ...@@ -33,7 +33,7 @@ pytestmark = [
pytest.mark.sglang, pytest.mark.sglang,
pytest.mark.e2e, pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit pytest.mark.nightly,
pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True), pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
] ]
......
...@@ -36,7 +36,7 @@ pytestmark = [ ...@@ -36,7 +36,7 @@ pytestmark = [
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.e2e, pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit pytest.mark.nightly,
pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True), pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
pytest.mark.xfail(reason="Cancellation is temporarily disabled", strict=True), pytest.mark.xfail(reason="Cancellation is temporarily disabled", strict=True),
] ]
......
...@@ -35,7 +35,6 @@ pytestmark = [ ...@@ -35,7 +35,6 @@ pytestmark = [
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.e2e, pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit
pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True), pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
] ]
...@@ -179,6 +178,7 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -179,6 +178,7 @@ class DynamoWorkerProcess(ManagedProcess):
@pytest.mark.timeout(110) # 3x average @pytest.mark.timeout(110) # 3x average
@pytest.mark.post_merge
def test_request_cancellation_vllm_aggregated( def test_request_cancellation_vllm_aggregated(
request, runtime_services_dynamic_ports, predownload_models request, runtime_services_dynamic_ports, predownload_models
): ):
...@@ -260,6 +260,7 @@ def test_request_cancellation_vllm_aggregated( ...@@ -260,6 +260,7 @@ def test_request_cancellation_vllm_aggregated(
@pytest.mark.timeout(150) # 3x average @pytest.mark.timeout(150) # 3x average
@pytest.mark.nightly
def test_request_cancellation_vllm_decode_cancel( def test_request_cancellation_vllm_decode_cancel(
request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
): ):
...@@ -341,6 +342,7 @@ def test_request_cancellation_vllm_decode_cancel( ...@@ -341,6 +342,7 @@ def test_request_cancellation_vllm_decode_cancel(
@pytest.mark.timeout(150) # 3x average @pytest.mark.timeout(150) # 3x average
@pytest.mark.nightly
def test_request_cancellation_vllm_prefill_cancel( def test_request_cancellation_vllm_prefill_cancel(
request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
): ):
......
...@@ -31,7 +31,6 @@ pytestmark = [ ...@@ -31,7 +31,6 @@ pytestmark = [
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.e2e, pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit
pytest.mark.parametrize( pytest.mark.parametrize(
"migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"] "migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"]
), ),
...@@ -211,6 +210,7 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -211,6 +210,7 @@ class DynamoWorkerProcess(ManagedProcess):
@pytest.mark.timeout(230) # 3x average @pytest.mark.timeout(230) # 3x average
@pytest.mark.post_merge
def test_request_migration_sglang_aggregated( def test_request_migration_sglang_aggregated(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
...@@ -262,6 +262,7 @@ def test_request_migration_sglang_aggregated( ...@@ -262,6 +262,7 @@ def test_request_migration_sglang_aggregated(
@pytest.mark.skip(reason="Cannot reliably migrate at Prefill that finish < 1 ms") @pytest.mark.skip(reason="Cannot reliably migrate at Prefill that finish < 1 ms")
@pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported") @pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported")
@pytest.mark.timeout(230) # 3x average @pytest.mark.timeout(230) # 3x average
@pytest.mark.nightly
def test_request_migration_sglang_prefill( def test_request_migration_sglang_prefill(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
...@@ -330,6 +331,7 @@ def test_request_migration_sglang_prefill( ...@@ -330,6 +331,7 @@ def test_request_migration_sglang_prefill(
@pytest.mark.skip(reason="KV cache transfer may fail") @pytest.mark.skip(reason="KV cache transfer may fail")
@pytest.mark.timeout(230) # 3x average @pytest.mark.timeout(230) # 3x average
@pytest.mark.nightly
def test_request_migration_sglang_kv_transfer( def test_request_migration_sglang_kv_transfer(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
...@@ -397,6 +399,7 @@ def test_request_migration_sglang_kv_transfer( ...@@ -397,6 +399,7 @@ def test_request_migration_sglang_kv_transfer(
@pytest.mark.timeout(230) # 3x average @pytest.mark.timeout(230) # 3x average
@pytest.mark.nightly
def test_request_migration_sglang_decode( def test_request_migration_sglang_decode(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
......
...@@ -31,7 +31,6 @@ pytestmark = [ ...@@ -31,7 +31,6 @@ pytestmark = [
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.e2e, pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit
pytest.mark.parametrize( pytest.mark.parametrize(
"migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"] "migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"]
), ),
...@@ -198,7 +197,8 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -198,7 +197,8 @@ class DynamoWorkerProcess(ManagedProcess):
return False return False
@pytest.mark.timeout(290) # 3x average @pytest.mark.timeout(290)
@pytest.mark.post_merge # 3x average
def test_request_migration_trtllm_aggregated( def test_request_migration_trtllm_aggregated(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
...@@ -249,6 +249,7 @@ def test_request_migration_trtllm_aggregated( ...@@ -249,6 +249,7 @@ def test_request_migration_trtllm_aggregated(
@pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported") @pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported")
@pytest.mark.timeout(350) # 3x average @pytest.mark.timeout(350) # 3x average
@pytest.mark.nightly
def test_request_migration_trtllm_prefill( def test_request_migration_trtllm_prefill(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
...@@ -317,6 +318,7 @@ def test_request_migration_trtllm_prefill( ...@@ -317,6 +318,7 @@ def test_request_migration_trtllm_prefill(
@pytest.mark.skip(reason="Decode worker can get stuck downloading kv cache") @pytest.mark.skip(reason="Decode worker can get stuck downloading kv cache")
@pytest.mark.timeout(350) # 3x average @pytest.mark.timeout(350) # 3x average
@pytest.mark.nightly
def test_request_migration_trtllm_kv_transfer( def test_request_migration_trtllm_kv_transfer(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
...@@ -384,6 +386,7 @@ def test_request_migration_trtllm_kv_transfer( ...@@ -384,6 +386,7 @@ def test_request_migration_trtllm_kv_transfer(
@pytest.mark.timeout(350) # 3x average @pytest.mark.timeout(350) # 3x average
@pytest.mark.post_merge
def test_request_migration_trtllm_decode( def test_request_migration_trtllm_decode(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
......
...@@ -32,7 +32,6 @@ pytestmark = [ ...@@ -32,7 +32,6 @@ pytestmark = [
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.e2e, pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit
pytest.mark.parametrize( pytest.mark.parametrize(
"migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"] "migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"]
), ),
...@@ -208,6 +207,7 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -208,6 +207,7 @@ class DynamoWorkerProcess(ManagedProcess):
@pytest.mark.timeout(290) # 3x average @pytest.mark.timeout(290) # 3x average
@pytest.mark.post_merge
def test_request_migration_vllm_aggregated( def test_request_migration_vllm_aggregated(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
...@@ -258,6 +258,7 @@ def test_request_migration_vllm_aggregated( ...@@ -258,6 +258,7 @@ def test_request_migration_vllm_aggregated(
@pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported") @pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported")
@pytest.mark.timeout(350) # 3x average @pytest.mark.timeout(350) # 3x average
@pytest.mark.nightly
def test_request_migration_vllm_prefill( def test_request_migration_vllm_prefill(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
...@@ -335,6 +336,7 @@ def test_request_migration_vllm_prefill( ...@@ -335,6 +336,7 @@ def test_request_migration_vllm_prefill(
), ),
) )
@pytest.mark.timeout(350) # 3x average @pytest.mark.timeout(350) # 3x average
@pytest.mark.nightly
def test_request_migration_vllm_kv_transfer( def test_request_migration_vllm_kv_transfer(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
...@@ -412,6 +414,7 @@ def test_request_migration_vllm_kv_transfer( ...@@ -412,6 +414,7 @@ def test_request_migration_vllm_kv_transfer(
), ),
) )
@pytest.mark.timeout(350) # 3x average @pytest.mark.timeout(350) # 3x average
@pytest.mark.nightly
def test_request_migration_vllm_decode( def test_request_migration_vllm_decode(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment