Unverified commit 2472aa4a, authored by Keiven C and committed by GitHub
Browse files

test: add timeout markers to pytest.mark.gpu_1 serve tests (#4768)


Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent a9078ed0
......@@ -37,6 +37,9 @@ sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join(
WORKSPACE_DIR, "examples/backends/sglang"
)
# SGLang test configurations
# NOTE: pytest.mark.gpu_1 tests take ~167s (2m 47s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
sglang_configs = {
"aggregated": SGLangConfig(
# Uses backend agg.sh (with metrics enabled) for testing standard
......@@ -44,7 +47,11 @@ sglang_configs = {
name="aggregated",
directory=sglang_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(120), # 3x measured time (39s)
],
model="Qwen/Qwen3-0.6B",
env={},
models_port=8000,
......@@ -120,7 +127,12 @@ sglang_configs = {
name="template_verification",
directory=SERVE_TEST_DIR, # special directory for test-specific scripts
script_name="template_verifier.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.nightly],
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.nightly,
pytest.mark.timeout(60), # 3x measured time (20s)
],
model="Qwen/Qwen3-0.6B",
env={},
models_port=8000,
......@@ -163,10 +175,14 @@ sglang_configs = {
name="embedding_agg",
directory=sglang_dir,
script_name="agg_embed.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.nightly],
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.nightly,
pytest.mark.timeout(90), # 3x measured time (29s)
],
model="Qwen/Qwen3-Embedding-4B",
delayed_start=0,
timeout=180,
models_port=8000,
request_payloads=[
# Test default payload with multiple inputs
......@@ -196,7 +212,12 @@ sglang_configs = {
name="completions_only",
directory=sglang_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1],
marks=[
pytest.mark.gpu_1,
pytest.mark.timeout(
160
), # Total test timeout: 2x measured average (79.36s)
],
model="deepseek-ai/deepseek-llm-7b-base",
script_args=[
"--model-path",
......
......@@ -34,13 +34,20 @@ trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join(
WORKSPACE_DIR, "examples/backends/trtllm"
)
# trtllm test configurations
# TensorRT-LLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
trtllm_configs = {
"aggregated": TRTLLMConfig(
name="aggregated",
directory=trtllm_dir,
script_name="agg_metrics.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.trtllm,
pytest.mark.timeout(140), # 3x measured time (44.66s)
],
model="Qwen/Qwen3-0.6B",
models_port=8000,
request_payloads=[
......@@ -65,7 +72,12 @@ trtllm_configs = {
name="disaggregated_same_gpu",
directory=trtllm_dir,
script_name="disagg_same_gpu.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.trtllm,
pytest.mark.timeout(320), # 3x measured time (103.66s)
],
model="Qwen/Qwen3-0.6B",
models_port=8000,
request_payloads=[
......@@ -79,7 +91,12 @@ trtllm_configs = {
name="aggregated_router",
directory=trtllm_dir,
script_name="agg_router.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.trtllm,
pytest.mark.timeout(120), # 3x measured time (37.91s)
],
model="Qwen/Qwen3-0.6B",
models_port=8000,
request_payloads=[
......@@ -121,7 +138,11 @@ trtllm_configs = {
name="completions_only",
directory=trtllm_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1, pytest.mark.trtllm],
marks=[
pytest.mark.gpu_1,
pytest.mark.trtllm,
pytest.mark.timeout(260), # 3x measured time (83.85s)
],
model="deepseek-ai/deepseek-llm-7b-base",
script_args=["--dyn-endpoint-types", "completions"],
env={
......@@ -156,6 +177,7 @@ def test_deployment(trtllm_config_test, request, runtime_services, predownload_m
@pytest.mark.e2e
@pytest.mark.gpu_1
@pytest.mark.trtllm
@pytest.mark.timeout(480) # 3x measured time (159.68s)
def test_chat_only_aggregated_with_test_logits_processor(
request, runtime_services, predownload_models, monkeypatch
):
......
......@@ -40,12 +40,18 @@ vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
# vLLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~5.5 minutes total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
vllm_configs = {
"aggregated": VLLMConfig(
name="aggregated",
directory=vllm_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(130), # 3x measured time (43s)
],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload_default(),
......@@ -57,7 +63,11 @@ vllm_configs = {
name="aggregated_lmcache",
directory=vllm_dir,
script_name="agg_lmcache.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(210), # 3x estimated time (70s)
],
model="Qwen/Qwen3-0.6B",
request_payloads=[
chat_payload_default(),
......@@ -70,7 +80,10 @@ vllm_configs = {
name="aggregated_lmcache_multiproc",
directory=vllm_dir,
script_name="agg_lmcache_multiproc.sh",
marks=[pytest.mark.gpu_1],
marks=[
pytest.mark.gpu_1,
pytest.mark.timeout(210), # 3x estimated time (70s)
],
model="Qwen/Qwen3-0.6B",
env={
"PROMETHEUS_MULTIPROC_DIR": f"/tmp/prometheus_multiproc_test_{os.getpid()}_{random.randint(0, 10000)}"
......@@ -86,7 +99,11 @@ vllm_configs = {
name="agg-request-plane-tcp",
directory=vllm_dir,
script_name="agg_request_planes.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(130), # 3x measured time (43s)
],
model="Qwen/Qwen3-0.6B",
script_args=["--tcp"],
request_payloads=[
......@@ -98,7 +115,11 @@ vllm_configs = {
name="agg-request-plane-http",
directory=vllm_dir,
script_name="agg_request_planes.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(130), # 3x measured time (43s)
],
model="Qwen/Qwen3-0.6B",
script_args=["--http"],
request_payloads=[
......@@ -416,7 +437,10 @@ vllm_configs = {
name="completions_only",
directory=vllm_dir,
script_name="agg.sh",
marks=[pytest.mark.gpu_1],
marks=[
pytest.mark.gpu_1,
pytest.mark.timeout(180), # 3x estimated time (60s) for 7B model
],
model="deepseek-ai/deepseek-llm-7b-base",
script_args=[
"--model",
......
......@@ -448,9 +448,16 @@ class ManagedProcess:
elapsed = time.time() - start_time
self._logger.error(
"FAILED: Check URL: %s (attempts=%d, elapsed=%.1fs)", url, attempt, elapsed
"TIMEOUT: Check URL: %s failed after %.1fs (attempts=%d, timeout=%.1fs)",
url,
elapsed,
attempt,
timeout,
)
raise RuntimeError(
"TIMEOUT: Check URL: %s failed after %.1fs (timeout=%.1fs)"
% (url, elapsed, timeout)
)
raise RuntimeError("FAILED: Check URL: %s" % url)
def _check_funcs(self, timeout):
elapsed = 0.0
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment