Unverified commit d66e6755, authored by Jacky, committed by GitHub
Browse files

fix: Extra requests received by workers during FT tests (#4780)


Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>
parent 2472aa4a
......@@ -25,7 +25,7 @@ pytestmark = [
pytest.mark.sglang,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.nightly,
pytest.mark.post_merge, # post_merge to pinpoint failure commit
]
......@@ -92,6 +92,11 @@ class DynamoWorkerProcess(ManagedProcess):
# Set debug logging environment
env = os.environ.copy()
env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
env["DYN_SYSTEM_PORT"] = port
......
......@@ -26,7 +26,7 @@ pytestmark = [
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.nightly,
pytest.mark.post_merge, # post_merge to pinpoint failure commit
]
......@@ -88,6 +88,11 @@ class DynamoWorkerProcess(ManagedProcess):
# Set debug logging environment
env = os.environ.copy()
env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
env["DYN_SYSTEM_PORT"] = port
......
......@@ -25,7 +25,7 @@ pytestmark = [
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.nightly,
pytest.mark.post_merge, # post_merge to pinpoint failure commit
]
......@@ -68,6 +68,11 @@ class DynamoWorkerProcess(ManagedProcess):
# Set debug logging environment
env = os.environ.copy()
env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
env["DYN_SYSTEM_PORT"] = port
......
......@@ -29,6 +29,11 @@ class DynamoFrontendProcess(ManagedProcess):
# Set debug logging environment
env = os.environ.copy()
env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
# Unset DYN_SYSTEM_PORT - frontend doesn't use system metrics server
env.pop("DYN_SYSTEM_PORT", None)
......
......@@ -28,7 +28,7 @@ pytestmark = [
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.pre_merge, # can be moved to nightly once stable for a week
pytest.mark.post_merge, # post_merge to pinpoint failure commit
]
......@@ -59,6 +59,11 @@ class DynamoWorkerProcess(ManagedProcess):
# Set debug logging environment
env = os.environ.copy()
env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
env["DYN_SYSTEM_PORT"] = f"808{worker_id[-1]}"
......@@ -109,10 +114,6 @@ class DynamoWorkerProcess(ManagedProcess):
@pytest.mark.timeout(235) # 3x average
@pytest.mark.xfail(
reason="For some reason both replicas received the request where only one should",
strict=False,
)
def test_request_migration_sglang_worker_failure(
request, runtime_services, predownload_models, set_ucx_tls_no_mm
):
......@@ -205,10 +206,6 @@ def test_request_migration_sglang_graceful_shutdown(
@pytest.mark.timeout(135) # 3x average
@pytest.mark.xfail(
reason="For some reason both replicas received the request where only one should",
strict=False,
)
def test_no_request_migration_sglang_worker_failure(
request, runtime_services, predownload_models, set_ucx_tls_no_mm
):
......
......@@ -28,7 +28,7 @@ pytestmark = [
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.pre_merge, # can be moved to nightly once stable for a week
pytest.mark.post_merge, # post_merge to pinpoint failure commit
]
......@@ -57,6 +57,11 @@ class DynamoWorkerProcess(ManagedProcess):
# Set debug logging environment
env = os.environ.copy()
env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
env["DYN_SYSTEM_PORT"] = f"808{worker_id[-1]}"
......@@ -105,10 +110,6 @@ class DynamoWorkerProcess(ManagedProcess):
@pytest.mark.timeout(290) # 3x average
@pytest.mark.xfail(
reason="For some reason both replicas received the request where only one should",
strict=False,
)
def test_request_migration_trtllm_worker_failure(
request, runtime_services, predownload_models, set_ucx_tls_no_mm
):
......@@ -201,10 +202,6 @@ def test_request_migration_trtllm_graceful_shutdown(
@pytest.mark.timeout(185) # 3x average
@pytest.mark.xfail(
reason="For some reason both replicas received the request where only one should",
strict=False,
)
def test_no_request_migration_trtllm_worker_failure(
request, runtime_services, predownload_models, set_ucx_tls_no_mm
):
......
......@@ -28,7 +28,7 @@ pytestmark = [
pytest.mark.gpu_1,
pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.nightly,
pytest.mark.post_merge, # post_merge to pinpoint failure commit
]
......@@ -59,6 +59,11 @@ class DynamoWorkerProcess(ManagedProcess):
env["VLLM_NIXL_SIDE_CHANNEL_PORT"] = f"560{worker_id[-1]}"
env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
env["DYN_SYSTEM_PORT"] = f"808{worker_id[-1]}"
......
......@@ -25,6 +25,11 @@ class DynamoFrontendProcess(ManagedProcess):
# Unset DYN_SYSTEM_PORT - frontend doesn't use system metrics server
env = os.environ.copy()
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env.pop("DYN_SYSTEM_PORT", None)
log_dir = f"{request.node.name}_frontend"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment