test: add timeout markers to pytest.mark.gpu_1 serve tests (#4768)

Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>

test: add timeout markers to pytest.mark.gpu_1 serve tests (#4768)
Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
2472aa4a · Keiven C · GitHub · a9078ed0 · 2472aa4a · 2472aa4a
Unverified commit 2472aa4a: authored Dec 05, 2025 by Keiven C; committed by GitHub on Dec 05, 2025
4 changed files
--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -37,6 +37,9 @@ sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/sglang"
 )
+# SGLang test configurations
+# NOTE: pytest.mark.gpu_1 tests take ~167s (2m 47s) total to run sequentially (with models pre-cached)
+# TODO: Parallelize these tests to reduce total execution time
 sglang_configs = {
    "aggregated": SGLangConfig(
        # Uses backend agg.sh (with metrics enabled) for testing standard
@@ -44,7 +47,11 @@ sglang_configs = {
        name="aggregated",
        directory=sglang_dir,
        script_name="agg.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(120),  # 3x measured time (39s)
+        ],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
@@ -120,7 +127,12 @@ sglang_configs = {
        name="template_verification",
        directory=SERVE_TEST_DIR,  # special directory for test-specific scripts
        script_name="template_verifier.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.nightly],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.nightly,
+            pytest.mark.timeout(60),  # 3x measured time (20s)
+        ],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
@@ -163,10 +175,14 @@ sglang_configs = {
        name="embedding_agg",
        directory=sglang_dir,
        script_name="agg_embed.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.nightly],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.nightly,
+            pytest.mark.timeout(90),  # 3x measured time (29s)
+        ],
        model="Qwen/Qwen3-Embedding-4B",
        delayed_start=0,
-        timeout=180,
        models_port=8000,
        request_payloads=[
            # Test default payload with multiple inputs
@@ -196,7 +212,12 @@ sglang_configs = {
        name="completions_only",
        directory=sglang_dir,
        script_name="agg.sh",
-        marks=[pytest.mark.gpu_1],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.timeout(
+                160
+            ),  # Total test timeout: 2x measured average (79.36s)
+        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=[
            "--model-path",

--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -34,13 +34,20 @@ trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join(
    WORKSPACE_DIR, "examples/backends/trtllm"
 )
-# trtllm test configurations
+# TensorRT-LLM test configurations
+# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
+# TODO: Parallelize these tests to reduce total execution time
 trtllm_configs = {
    "aggregated": TRTLLMConfig(
        name="aggregated",
        directory=trtllm_dir,
        script_name="agg_metrics.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.trtllm,
+            pytest.mark.timeout(140),  # 3x measured time (44.66s)
+        ],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
@@ -65,7 +72,12 @@ trtllm_configs = {
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.trtllm,
+            pytest.mark.timeout(320),  # 3x measured time (103.66s)
+        ],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
@@ -79,7 +91,12 @@ trtllm_configs = {
        name="aggregated_router",
        directory=trtllm_dir,
        script_name="agg_router.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.trtllm,
+            pytest.mark.timeout(120),  # 3x measured time (37.91s)
+        ],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
@@ -121,7 +138,11 @@ trtllm_configs = {
        name="completions_only",
        directory=trtllm_dir,
        script_name="agg.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.trtllm],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.trtllm,
+            pytest.mark.timeout(260),  # 3x measured time (83.85s)
+        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=["--dyn-endpoint-types", "completions"],
        env={
@@ -156,6 +177,7 @@ def test_deployment(trtllm_config_test, request, runtime_services, predownload_m
 @pytest.mark.e2e
 @pytest.mark.gpu_1
 @pytest.mark.trtllm
+@pytest.mark.timeout(480)  # 3x measured time (159.68s)
 def test_chat_only_aggregated_with_test_logits_processor(
    request, runtime_services, predownload_models, monkeypatch
 ):

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -40,12 +40,18 @@ vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
 # vLLM test configurations
+# NOTE: pytest.mark.gpu_1 tests take ~5.5 minutes total to run sequentially (with models pre-cached)
+# TODO: Parallelize these tests to reduce total execution time
 vllm_configs = {
    "aggregated": VLLMConfig(
        name="aggregated",
        directory=vllm_dir,
        script_name="agg.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(130),  # 3x measured time (43s)
+        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
@@ -57,7 +63,11 @@ vllm_configs = {
        name="aggregated_lmcache",
        directory=vllm_dir,
        script_name="agg_lmcache.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(210),  # 3x estimated time (70s)
+        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
@@ -70,7 +80,10 @@ vllm_configs = {
        name="aggregated_lmcache_multiproc",
        directory=vllm_dir,
        script_name="agg_lmcache_multiproc.sh",
-        marks=[pytest.mark.gpu_1],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.timeout(210),  # 3x estimated time (70s)
+        ],
        model="Qwen/Qwen3-0.6B",
        env={
            "PROMETHEUS_MULTIPROC_DIR": f"/tmp/prometheus_multiproc_test_{os.getpid()}_{random.randint(0, 10000)}"
@@ -86,7 +99,11 @@ vllm_configs = {
        name="agg-request-plane-tcp",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(130),  # 3x measured time (43s)
+        ],
        model="Qwen/Qwen3-0.6B",
        script_args=["--tcp"],
        request_payloads=[
@@ -98,7 +115,11 @@ vllm_configs = {
        name="agg-request-plane-http",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(130),  # 3x measured time (43s)
+        ],
        model="Qwen/Qwen3-0.6B",
        script_args=["--http"],
        request_payloads=[
@@ -416,7 +437,10 @@ vllm_configs = {
        name="completions_only",
        directory=vllm_dir,
        script_name="agg.sh",
-        marks=[pytest.mark.gpu_1],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.timeout(180),  # 3x estimated time (60s) for 7B model
+        ],
        model="deepseek-ai/deepseek-llm-7b-base",
        script_args=[
            "--model",

--- a/tests/utils/managed_process.py
+++ b/tests/utils/managed_process.py
@@ -448,9 +448,16 @@ class ManagedProcess:
            elapsed = time.time() - start_time
        self._logger.error(
-            "FAILED: Check URL: %s (attempts=%d, elapsed=%.1fs)", url, attempt, elapsed
+            "TIMEOUT: Check URL: %s failed after %.1fs (attempts=%d, timeout=%.1fs)",
+            url,
+            elapsed,
+            attempt,
+            timeout,
+        )
+        raise RuntimeError(
+            "TIMEOUT: Check URL: %s failed after %.1fs (timeout=%.1fs)"
+            % (url, elapsed, timeout)
        )
-        raise RuntimeError("FAILED: Check URL: %s" % url)
    def _check_funcs(self, timeout):
        elapsed = 0.0