Unverified commit d66e6755, authored by Jacky, committed by GitHub
Browse files

fix: Extra requests received by workers during FT tests (#4780)


Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>
parent 2472aa4a
...@@ -25,7 +25,7 @@ pytestmark = [ ...@@ -25,7 +25,7 @@ pytestmark = [
pytest.mark.sglang, pytest.mark.sglang,
pytest.mark.e2e, pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.nightly, pytest.mark.post_merge, # post_merge to pinpoint failure commit
] ]
...@@ -92,6 +92,11 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -92,6 +92,11 @@ class DynamoWorkerProcess(ManagedProcess):
# Set debug logging environment # Set debug logging environment
env = os.environ.copy() env = os.environ.copy()
env["DYN_LOG"] = "debug" env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]' env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
env["DYN_SYSTEM_PORT"] = port env["DYN_SYSTEM_PORT"] = port
......
...@@ -26,7 +26,7 @@ pytestmark = [ ...@@ -26,7 +26,7 @@ pytestmark = [
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.e2e, pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.nightly, pytest.mark.post_merge, # post_merge to pinpoint failure commit
] ]
...@@ -88,6 +88,11 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -88,6 +88,11 @@ class DynamoWorkerProcess(ManagedProcess):
# Set debug logging environment # Set debug logging environment
env = os.environ.copy() env = os.environ.copy()
env["DYN_LOG"] = "debug" env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]' env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
env["DYN_SYSTEM_PORT"] = port env["DYN_SYSTEM_PORT"] = port
......
...@@ -25,7 +25,7 @@ pytestmark = [ ...@@ -25,7 +25,7 @@ pytestmark = [
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.e2e, pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.nightly, pytest.mark.post_merge, # post_merge to pinpoint failure commit
] ]
...@@ -68,6 +68,11 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -68,6 +68,11 @@ class DynamoWorkerProcess(ManagedProcess):
# Set debug logging environment # Set debug logging environment
env = os.environ.copy() env = os.environ.copy()
env["DYN_LOG"] = "debug" env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]' env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
env["DYN_SYSTEM_PORT"] = port env["DYN_SYSTEM_PORT"] = port
......
...@@ -29,6 +29,11 @@ class DynamoFrontendProcess(ManagedProcess): ...@@ -29,6 +29,11 @@ class DynamoFrontendProcess(ManagedProcess):
# Set debug logging environment # Set debug logging environment
env = os.environ.copy() env = os.environ.copy()
env["DYN_LOG"] = "debug" env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
# Unset DYN_SYSTEM_PORT - frontend doesn't use system metrics server # Unset DYN_SYSTEM_PORT - frontend doesn't use system metrics server
env.pop("DYN_SYSTEM_PORT", None) env.pop("DYN_SYSTEM_PORT", None)
......
...@@ -28,7 +28,7 @@ pytestmark = [ ...@@ -28,7 +28,7 @@ pytestmark = [
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.e2e, pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.pre_merge, # can be moved to nightly once stable for a week pytest.mark.post_merge, # post_merge to pinpoint failure commit
] ]
...@@ -59,6 +59,11 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -59,6 +59,11 @@ class DynamoWorkerProcess(ManagedProcess):
# Set debug logging environment # Set debug logging environment
env = os.environ.copy() env = os.environ.copy()
env["DYN_LOG"] = "debug" env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]' env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
env["DYN_SYSTEM_PORT"] = f"808{worker_id[-1]}" env["DYN_SYSTEM_PORT"] = f"808{worker_id[-1]}"
...@@ -109,10 +114,6 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -109,10 +114,6 @@ class DynamoWorkerProcess(ManagedProcess):
@pytest.mark.timeout(235) # 3x average @pytest.mark.timeout(235) # 3x average
@pytest.mark.xfail(
reason="For some reason both replicas received the request where only one should",
strict=False,
)
def test_request_migration_sglang_worker_failure( def test_request_migration_sglang_worker_failure(
request, runtime_services, predownload_models, set_ucx_tls_no_mm request, runtime_services, predownload_models, set_ucx_tls_no_mm
): ):
...@@ -205,10 +206,6 @@ def test_request_migration_sglang_graceful_shutdown( ...@@ -205,10 +206,6 @@ def test_request_migration_sglang_graceful_shutdown(
@pytest.mark.timeout(135) # 3x average @pytest.mark.timeout(135) # 3x average
@pytest.mark.xfail(
reason="For some reason both replicas received the request where only one should",
strict=False,
)
def test_no_request_migration_sglang_worker_failure( def test_no_request_migration_sglang_worker_failure(
request, runtime_services, predownload_models, set_ucx_tls_no_mm request, runtime_services, predownload_models, set_ucx_tls_no_mm
): ):
......
...@@ -28,7 +28,7 @@ pytestmark = [ ...@@ -28,7 +28,7 @@ pytestmark = [
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.e2e, pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.pre_merge, # can be moved to nightly once stable for a week pytest.mark.post_merge, # post_merge to pinpoint failure commit
] ]
...@@ -57,6 +57,11 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -57,6 +57,11 @@ class DynamoWorkerProcess(ManagedProcess):
# Set debug logging environment # Set debug logging environment
env = os.environ.copy() env = os.environ.copy()
env["DYN_LOG"] = "debug" env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]' env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
env["DYN_SYSTEM_PORT"] = f"808{worker_id[-1]}" env["DYN_SYSTEM_PORT"] = f"808{worker_id[-1]}"
...@@ -105,10 +110,6 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -105,10 +110,6 @@ class DynamoWorkerProcess(ManagedProcess):
@pytest.mark.timeout(290) # 3x average @pytest.mark.timeout(290) # 3x average
@pytest.mark.xfail(
reason="For some reason both replicas received the request where only one should",
strict=False,
)
def test_request_migration_trtllm_worker_failure( def test_request_migration_trtllm_worker_failure(
request, runtime_services, predownload_models, set_ucx_tls_no_mm request, runtime_services, predownload_models, set_ucx_tls_no_mm
): ):
...@@ -201,10 +202,6 @@ def test_request_migration_trtllm_graceful_shutdown( ...@@ -201,10 +202,6 @@ def test_request_migration_trtllm_graceful_shutdown(
@pytest.mark.timeout(185) # 3x average @pytest.mark.timeout(185) # 3x average
@pytest.mark.xfail(
reason="For some reason both replicas received the request where only one should",
strict=False,
)
def test_no_request_migration_trtllm_worker_failure( def test_no_request_migration_trtllm_worker_failure(
request, runtime_services, predownload_models, set_ucx_tls_no_mm request, runtime_services, predownload_models, set_ucx_tls_no_mm
): ):
......
...@@ -28,7 +28,7 @@ pytestmark = [ ...@@ -28,7 +28,7 @@ pytestmark = [
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.e2e, pytest.mark.e2e,
pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME), pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
pytest.mark.nightly, pytest.mark.post_merge, # post_merge to pinpoint failure commit
] ]
...@@ -59,6 +59,11 @@ class DynamoWorkerProcess(ManagedProcess): ...@@ -59,6 +59,11 @@ class DynamoWorkerProcess(ManagedProcess):
env["VLLM_NIXL_SIDE_CHANNEL_PORT"] = f"560{worker_id[-1]}" env["VLLM_NIXL_SIDE_CHANNEL_PORT"] = f"560{worker_id[-1]}"
env["DYN_LOG"] = "debug" env["DYN_LOG"] = "debug"
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]' env["DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS"] = '["generate"]'
env["DYN_SYSTEM_PORT"] = f"808{worker_id[-1]}" env["DYN_SYSTEM_PORT"] = f"808{worker_id[-1]}"
......
...@@ -25,6 +25,11 @@ class DynamoFrontendProcess(ManagedProcess): ...@@ -25,6 +25,11 @@ class DynamoFrontendProcess(ManagedProcess):
# Unset DYN_SYSTEM_PORT - frontend doesn't use system metrics server # Unset DYN_SYSTEM_PORT - frontend doesn't use system metrics server
env = os.environ.copy() env = os.environ.copy()
# Disable canary health check - these tests expect full control over requests
# sent to the workers where canary health check intermittently sends dummy
# requests to workers interfering with the test process which may cause
# intermittent failures
env["DYN_HEALTH_CHECK_ENABLED"] = "false"
env.pop("DYN_SYSTEM_PORT", None) env.pop("DYN_SYSTEM_PORT", None)
log_dir = f"{request.node.name}_frontend" log_dir = f"{request.node.name}_frontend"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment