Unverified commit f9d57094, authored by Dmitry Tokarev, committed by GitHub
Browse files

chore: Move some fault tolerance tests to nightly (#6636)


Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
parent a671dbd4
......@@ -33,7 +33,7 @@ pytestmark = [
pytest.mark.sglang,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit
pytest.mark.nightly,
pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
]
......
......@@ -36,7 +36,7 @@ pytestmark = [
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit
pytest.mark.nightly,
pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
pytest.mark.xfail(reason="Cancellation is temporarily disabled", strict=True),
]
......
......@@ -35,7 +35,6 @@ pytestmark = [
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit
pytest.mark.parametrize("request_plane", ["nats", "tcp"], indirect=True),
]
......@@ -179,6 +178,7 @@ class DynamoWorkerProcess(ManagedProcess):
@pytest.mark.timeout(110) # 3x average
@pytest.mark.post_merge
def test_request_cancellation_vllm_aggregated(
request, runtime_services_dynamic_ports, predownload_models
):
......@@ -260,6 +260,7 @@ def test_request_cancellation_vllm_aggregated(
@pytest.mark.timeout(150) # 3x average
@pytest.mark.nightly
def test_request_cancellation_vllm_decode_cancel(
request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
):
......@@ -341,6 +342,7 @@ def test_request_cancellation_vllm_decode_cancel(
@pytest.mark.timeout(150) # 3x average
@pytest.mark.nightly
def test_request_cancellation_vllm_prefill_cancel(
request, runtime_services_dynamic_ports, set_ucx_tls_no_mm, predownload_models
):
......
......@@ -31,7 +31,6 @@ pytestmark = [
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit
pytest.mark.parametrize(
"migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"]
),
......@@ -211,6 +210,7 @@ class DynamoWorkerProcess(ManagedProcess):
@pytest.mark.timeout(230) # 3x average
@pytest.mark.post_merge
def test_request_migration_sglang_aggregated(
request,
runtime_services_dynamic_ports,
......@@ -262,6 +262,7 @@ def test_request_migration_sglang_aggregated(
@pytest.mark.skip(reason="Cannot reliably migrate at Prefill that finish < 1 ms")
@pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported")
@pytest.mark.timeout(230) # 3x average
@pytest.mark.nightly
def test_request_migration_sglang_prefill(
request,
runtime_services_dynamic_ports,
......@@ -330,6 +331,7 @@ def test_request_migration_sglang_prefill(
@pytest.mark.skip(reason="KV cache transfer may fail")
@pytest.mark.timeout(230) # 3x average
@pytest.mark.nightly
def test_request_migration_sglang_kv_transfer(
request,
runtime_services_dynamic_ports,
......@@ -397,6 +399,7 @@ def test_request_migration_sglang_kv_transfer(
@pytest.mark.timeout(230) # 3x average
@pytest.mark.nightly
def test_request_migration_sglang_decode(
request,
runtime_services_dynamic_ports,
......
......@@ -31,7 +31,6 @@ pytestmark = [
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit
pytest.mark.parametrize(
"migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"]
),
......@@ -198,7 +197,8 @@ class DynamoWorkerProcess(ManagedProcess):
return False
@pytest.mark.timeout(290) # 3x average
@pytest.mark.timeout(290)
@pytest.mark.post_merge # 3x average
def test_request_migration_trtllm_aggregated(
request,
runtime_services_dynamic_ports,
......@@ -249,6 +249,7 @@ def test_request_migration_trtllm_aggregated(
@pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported")
@pytest.mark.timeout(350) # 3x average
@pytest.mark.nightly
def test_request_migration_trtllm_prefill(
request,
runtime_services_dynamic_ports,
......@@ -317,6 +318,7 @@ def test_request_migration_trtllm_prefill(
@pytest.mark.skip(reason="Decode worker can get stuck downloading kv cache")
@pytest.mark.timeout(350) # 3x average
@pytest.mark.nightly
def test_request_migration_trtllm_kv_transfer(
request,
runtime_services_dynamic_ports,
......@@ -384,6 +386,7 @@ def test_request_migration_trtllm_kv_transfer(
@pytest.mark.timeout(350) # 3x average
@pytest.mark.post_merge
def test_request_migration_trtllm_decode(
request,
runtime_services_dynamic_ports,
......
......@@ -32,7 +32,6 @@ pytestmark = [
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.post_merge, # post_merge to pinpoint failure commit
pytest.mark.parametrize(
"migration_limit", [3, 0], ids=["migration_enabled", "migration_disabled"]
),
......@@ -208,6 +207,7 @@ class DynamoWorkerProcess(ManagedProcess):
@pytest.mark.timeout(290) # 3x average
@pytest.mark.post_merge
def test_request_migration_vllm_aggregated(
request,
runtime_services_dynamic_ports,
......@@ -258,6 +258,7 @@ def test_request_migration_vllm_aggregated(
@pytest.mark.xfail(strict=False, reason="Prefill migration not yet supported")
@pytest.mark.timeout(350) # 3x average
@pytest.mark.nightly
def test_request_migration_vllm_prefill(
request,
runtime_services_dynamic_ports,
......@@ -335,6 +336,7 @@ def test_request_migration_vllm_prefill(
),
)
@pytest.mark.timeout(350) # 3x average
@pytest.mark.nightly
def test_request_migration_vllm_kv_transfer(
request,
runtime_services_dynamic_ports,
......@@ -412,6 +414,7 @@ def test_request_migration_vllm_kv_transfer(
),
)
@pytest.mark.timeout(350) # 3x average
@pytest.mark.nightly
def test_request_migration_vllm_decode(
request,
runtime_services_dynamic_ports,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment