Unverified commit 2472aa4a, authored by Keiven C, committed by GitHub
Browse files

test: add timeout markers to pytest.mark.gpu_1 serve tests (#4768)


Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent a9078ed0
...@@ -37,6 +37,9 @@ sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join( ...@@ -37,6 +37,9 @@ sglang_dir = os.environ.get("SGLANG_DIR") or os.path.join(
WORKSPACE_DIR, "examples/backends/sglang" WORKSPACE_DIR, "examples/backends/sglang"
) )
# SGLang test configurations
# NOTE: pytest.mark.gpu_1 tests take ~167s (2m 47s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
sglang_configs = { sglang_configs = {
"aggregated": SGLangConfig( "aggregated": SGLangConfig(
# Uses backend agg.sh (with metrics enabled) for testing standard # Uses backend agg.sh (with metrics enabled) for testing standard
...@@ -44,7 +47,11 @@ sglang_configs = { ...@@ -44,7 +47,11 @@ sglang_configs = {
name="aggregated", name="aggregated",
directory=sglang_dir, directory=sglang_dir,
script_name="agg.sh", script_name="agg.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge], marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(120), # 3x measured time (39s)
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={}, env={},
models_port=8000, models_port=8000,
...@@ -120,7 +127,12 @@ sglang_configs = { ...@@ -120,7 +127,12 @@ sglang_configs = {
name="template_verification", name="template_verification",
directory=SERVE_TEST_DIR, # special directory for test-specific scripts directory=SERVE_TEST_DIR, # special directory for test-specific scripts
script_name="template_verifier.sh", script_name="template_verifier.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.nightly], marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.nightly,
pytest.mark.timeout(60), # 3x measured time (20s)
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={}, env={},
models_port=8000, models_port=8000,
...@@ -163,10 +175,14 @@ sglang_configs = { ...@@ -163,10 +175,14 @@ sglang_configs = {
name="embedding_agg", name="embedding_agg",
directory=sglang_dir, directory=sglang_dir,
script_name="agg_embed.sh", script_name="agg_embed.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.nightly], marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.nightly,
pytest.mark.timeout(90), # 3x measured time (29s)
],
model="Qwen/Qwen3-Embedding-4B", model="Qwen/Qwen3-Embedding-4B",
delayed_start=0, delayed_start=0,
timeout=180,
models_port=8000, models_port=8000,
request_payloads=[ request_payloads=[
# Test default payload with multiple inputs # Test default payload with multiple inputs
...@@ -196,7 +212,12 @@ sglang_configs = { ...@@ -196,7 +212,12 @@ sglang_configs = {
name="completions_only", name="completions_only",
directory=sglang_dir, directory=sglang_dir,
script_name="agg.sh", script_name="agg.sh",
marks=[pytest.mark.gpu_1], marks=[
pytest.mark.gpu_1,
pytest.mark.timeout(
160
), # Total test timeout: 2x measured average (79.36s)
],
model="deepseek-ai/deepseek-llm-7b-base", model="deepseek-ai/deepseek-llm-7b-base",
script_args=[ script_args=[
"--model-path", "--model-path",
......
...@@ -34,13 +34,20 @@ trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join( ...@@ -34,13 +34,20 @@ trtllm_dir = os.environ.get("TRTLLM_DIR") or os.path.join(
WORKSPACE_DIR, "examples/backends/trtllm" WORKSPACE_DIR, "examples/backends/trtllm"
) )
# trtllm test configurations # TensorRT-LLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~442s (7m 22s) total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
trtllm_configs = { trtllm_configs = {
"aggregated": TRTLLMConfig( "aggregated": TRTLLMConfig(
name="aggregated", name="aggregated",
directory=trtllm_dir, directory=trtllm_dir,
script_name="agg_metrics.sh", script_name="agg_metrics.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm], marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.trtllm,
pytest.mark.timeout(140), # 3x measured time (44.66s)
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
models_port=8000, models_port=8000,
request_payloads=[ request_payloads=[
...@@ -65,7 +72,12 @@ trtllm_configs = { ...@@ -65,7 +72,12 @@ trtllm_configs = {
name="disaggregated_same_gpu", name="disaggregated_same_gpu",
directory=trtllm_dir, directory=trtllm_dir,
script_name="disagg_same_gpu.sh", script_name="disagg_same_gpu.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm], marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.trtllm,
pytest.mark.timeout(320), # 3x measured time (103.66s)
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
models_port=8000, models_port=8000,
request_payloads=[ request_payloads=[
...@@ -79,7 +91,12 @@ trtllm_configs = { ...@@ -79,7 +91,12 @@ trtllm_configs = {
name="aggregated_router", name="aggregated_router",
directory=trtllm_dir, directory=trtllm_dir,
script_name="agg_router.sh", script_name="agg_router.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm], marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.trtllm,
pytest.mark.timeout(120), # 3x measured time (37.91s)
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
models_port=8000, models_port=8000,
request_payloads=[ request_payloads=[
...@@ -121,7 +138,11 @@ trtllm_configs = { ...@@ -121,7 +138,11 @@ trtllm_configs = {
name="completions_only", name="completions_only",
directory=trtllm_dir, directory=trtllm_dir,
script_name="agg.sh", script_name="agg.sh",
marks=[pytest.mark.gpu_1, pytest.mark.trtllm], marks=[
pytest.mark.gpu_1,
pytest.mark.trtllm,
pytest.mark.timeout(260), # 3x measured time (83.85s)
],
model="deepseek-ai/deepseek-llm-7b-base", model="deepseek-ai/deepseek-llm-7b-base",
script_args=["--dyn-endpoint-types", "completions"], script_args=["--dyn-endpoint-types", "completions"],
env={ env={
...@@ -156,6 +177,7 @@ def test_deployment(trtllm_config_test, request, runtime_services, predownload_m ...@@ -156,6 +177,7 @@ def test_deployment(trtllm_config_test, request, runtime_services, predownload_m
@pytest.mark.e2e @pytest.mark.e2e
@pytest.mark.gpu_1 @pytest.mark.gpu_1
@pytest.mark.trtllm @pytest.mark.trtllm
@pytest.mark.timeout(480) # 3x measured time (159.68s)
def test_chat_only_aggregated_with_test_logits_processor( def test_chat_only_aggregated_with_test_logits_processor(
request, runtime_services, predownload_models, monkeypatch request, runtime_services, predownload_models, monkeypatch
): ):
......
...@@ -40,12 +40,18 @@ vllm_dir = os.environ.get("VLLM_DIR") or os.path.join( ...@@ -40,12 +40,18 @@ vllm_dir = os.environ.get("VLLM_DIR") or os.path.join(
# vLLM test configurations # vLLM test configurations
# NOTE: pytest.mark.gpu_1 tests take ~5.5 minutes total to run sequentially (with models pre-cached)
# TODO: Parallelize these tests to reduce total execution time
vllm_configs = { vllm_configs = {
"aggregated": VLLMConfig( "aggregated": VLLMConfig(
name="aggregated", name="aggregated",
directory=vllm_dir, directory=vllm_dir,
script_name="agg.sh", script_name="agg.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge], marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(130), # 3x measured time (43s)
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
request_payloads=[ request_payloads=[
chat_payload_default(), chat_payload_default(),
...@@ -57,7 +63,11 @@ vllm_configs = { ...@@ -57,7 +63,11 @@ vllm_configs = {
name="aggregated_lmcache", name="aggregated_lmcache",
directory=vllm_dir, directory=vllm_dir,
script_name="agg_lmcache.sh", script_name="agg_lmcache.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge], marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(210), # 3x estimated time (70s)
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
request_payloads=[ request_payloads=[
chat_payload_default(), chat_payload_default(),
...@@ -70,7 +80,10 @@ vllm_configs = { ...@@ -70,7 +80,10 @@ vllm_configs = {
name="aggregated_lmcache_multiproc", name="aggregated_lmcache_multiproc",
directory=vllm_dir, directory=vllm_dir,
script_name="agg_lmcache_multiproc.sh", script_name="agg_lmcache_multiproc.sh",
marks=[pytest.mark.gpu_1], marks=[
pytest.mark.gpu_1,
pytest.mark.timeout(210), # 3x estimated time (70s)
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={ env={
"PROMETHEUS_MULTIPROC_DIR": f"/tmp/prometheus_multiproc_test_{os.getpid()}_{random.randint(0, 10000)}" "PROMETHEUS_MULTIPROC_DIR": f"/tmp/prometheus_multiproc_test_{os.getpid()}_{random.randint(0, 10000)}"
...@@ -86,7 +99,11 @@ vllm_configs = { ...@@ -86,7 +99,11 @@ vllm_configs = {
name="agg-request-plane-tcp", name="agg-request-plane-tcp",
directory=vllm_dir, directory=vllm_dir,
script_name="agg_request_planes.sh", script_name="agg_request_planes.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge], marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(130), # 3x measured time (43s)
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
script_args=["--tcp"], script_args=["--tcp"],
request_payloads=[ request_payloads=[
...@@ -98,7 +115,11 @@ vllm_configs = { ...@@ -98,7 +115,11 @@ vllm_configs = {
name="agg-request-plane-http", name="agg-request-plane-http",
directory=vllm_dir, directory=vllm_dir,
script_name="agg_request_planes.sh", script_name="agg_request_planes.sh",
marks=[pytest.mark.gpu_1, pytest.mark.pre_merge], marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(130), # 3x measured time (43s)
],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
script_args=["--http"], script_args=["--http"],
request_payloads=[ request_payloads=[
...@@ -416,7 +437,10 @@ vllm_configs = { ...@@ -416,7 +437,10 @@ vllm_configs = {
name="completions_only", name="completions_only",
directory=vllm_dir, directory=vllm_dir,
script_name="agg.sh", script_name="agg.sh",
marks=[pytest.mark.gpu_1], marks=[
pytest.mark.gpu_1,
pytest.mark.timeout(180), # 3x estimated time (60s) for 7B model
],
model="deepseek-ai/deepseek-llm-7b-base", model="deepseek-ai/deepseek-llm-7b-base",
script_args=[ script_args=[
"--model", "--model",
......
...@@ -448,9 +448,16 @@ class ManagedProcess: ...@@ -448,9 +448,16 @@ class ManagedProcess:
elapsed = time.time() - start_time elapsed = time.time() - start_time
self._logger.error( self._logger.error(
"FAILED: Check URL: %s (attempts=%d, elapsed=%.1fs)", url, attempt, elapsed "TIMEOUT: Check URL: %s failed after %.1fs (attempts=%d, timeout=%.1fs)",
url,
elapsed,
attempt,
timeout,
)
raise RuntimeError(
"TIMEOUT: Check URL: %s failed after %.1fs (timeout=%.1fs)"
% (url, elapsed, timeout)
) )
raise RuntimeError("FAILED: Check URL: %s" % url)
def _check_funcs(self, timeout): def _check_funcs(self, timeout):
elapsed = 0.0 elapsed = 0.0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment