fix(tests): re-enable vLLM LoRA tests and fix gpu_1 test flakiness (#8094)

Signed-off-by: Keiven Chang <keivenc@nvidia.com>

fix(tests): re-enable vLLM LoRA tests and fix gpu_1 test flakiness (#8094)
Signed-off-by: Keiven Chang <keivenc@nvidia.com>
326a702d · Keiven C · GitHub · a5b384f7 · 326a702d · 326a702d
Unverified Commit 326a702d authored Apr 14, 2026 by Keiven C Committed by GitHub Apr 14, 2026
8 changed files
--- a/examples/backends/vllm/launch/agg.sh
+++ b/examples/backends/vllm/launch/agg.sh
@@ -33,6 +33,9 @@ done
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
 MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
+# Default KV cache cap from profiling (2x safety over min=560 MB); ~3.8 GiB peak VRAM
+# Profiler/test framework overrides via env
+: "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:=1119388000}"
 GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"

--- a/examples/backends/vllm/launch/agg_lmcache.sh
+++ b/examples/backends/vllm/launch/agg_lmcache.sh
@@ -17,6 +17,9 @@ MODEL="Qwen/Qwen3-0.6B"
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
 MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
+# Default KV cache cap from profiling (2x safety over min=560 MB); ~3.8 GiB peak VRAM
+# Profiler/test framework overrides via env
+: "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:=1119388000}"
 GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"

--- a/examples/backends/vllm/launch/agg_lmcache_multiproc.sh
+++ b/examples/backends/vllm/launch/agg_lmcache_multiproc.sh
@@ -27,6 +27,9 @@ MODEL="Qwen/Qwen3-0.6B"
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
 MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
+# Default KV cache cap from profiling (2x safety over min=560 MB); ~3.8 GiB peak VRAM
+# Profiler/test framework overrides via env
+: "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:=1119388000}"
 GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"

--- a/examples/backends/vllm/launch/agg_multimodal.sh
+++ b/examples/backends/vllm/launch/agg_multimodal.sh
@@ -97,6 +97,9 @@ case "$MODEL_NAME" in
        MODEL_EXTRA_ARGS="--tensor-parallel-size=8" ;;
 esac
+# Default KV cache cap from profiling (2x safety over min=461 MB); ~9.6 GiB peak VRAM
+# Uses smallest profiled value across multimodal tests; profiler/test framework overrides via env
+: "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:=922354000}"
 GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
 # Start vLLM worker with vision model

--- a/examples/backends/vllm/launch/agg_request_planes.sh
+++ b/examples/backends/vllm/launch/agg_request_planes.sh
@@ -50,6 +50,9 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
 export DYN_REQUEST_PLANE=$REQUEST_PLANE
 echo "Using request plane mode: $REQUEST_PLANE"
+# Default KV cache cap from profiling (2x safety over min=560 MB); ~3.8 GiB peak VRAM
+# Profiler/test framework overrides via env
+: "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:=1119388000}"
 GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
 HTTP_PORT="${DYN_HTTP_PORT:-8000}"

--- a/examples/backends/vllm/launch/lora/agg_lora.sh
+++ b/examples/backends/vllm/launch/lora/agg_lora.sh
@@ -63,12 +63,15 @@ python -m dynamo.frontend &
 MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
 MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
+# Default KV cache cap from profiling (2x safety over min=471 MB); ~4.0 GiB peak VRAM
+# Profiler/test framework overrides via env
+: "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:=941712000}"
 GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
 DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
    python -m dynamo.vllm --model "$MODEL" --enforce-eager \
    --max-model-len "$MAX_MODEL_LEN" \
    --max-num-seqs "$MAX_CONCURRENT_SEQS" \
-    $GPU_MEM_ARGS & \
+    $GPU_MEM_ARGS \
    --enable-lora \
    --max-lora-rank 64 &

--- a/tests/serve/common.py
+++ b/tests/serve/common.py
@@ -53,6 +53,16 @@ def run_serve_deployment(
    if extra_env:
        merged_env.update(extra_env)
+    # In serial mode (no parallel scheduler), pass the marker's KV cache budget
+    # so the launch script's small default doesn't starve larger models.
+    # The parallel scheduler already sets this env var per-test.
+    if "_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES" not in os.environ:
+        kv_mark = request.node.get_closest_marker("requested_vllm_kv_cache_bytes")
+        if kv_mark:
+            merged_env.setdefault(
+                "_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES", str(int(kv_mark.args[0]))
+            )
    # Stagger engine startup under xdist to avoid vLLM profiling race
    # (vLLM bug #10643: concurrent profilers miscount each other's memory).
    worker_id = os.environ.get("PYTEST_XDIST_WORKER", "")

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -403,16 +403,12 @@ vllm_configs = {
        script_name="agg_multimodal.sh",
        marks=[
            pytest.mark.gpu_1,
-            pytest.mark.profiled_vram_gib(14.9),  # actual profiled peak with kv-bytes
+            pytest.mark.profiled_vram_gib(19.2),  # actual profiled peak with kv-bytes
            pytest.mark.requested_vllm_kv_cache_bytes(
-                922_354_000
+                4_318_854_000
-            ),  # KV cache cap (2x safety over min=461_176_832)
+            ),  # KV cache cap (2x safety over min=2_159_426_560)
-            pytest.mark.timeout(
+            pytest.mark.timeout(360),  # 7B model; L4 machines need more headroom
-                300
-            ),  # ~7x observed 42.7s; 7B model loads ~48s on CI (A10G/L4)
            pytest.mark.nightly,
-            # https://github.com/ai-dynamo/dynamo/issues/4501
-            pytest.mark.xfail(strict=False),
        ],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
@@ -511,6 +507,7 @@ vllm_configs = {
                    ],
                    "tool_choice": "required",
                    "max_tokens": 1024,
+                    "temperature": 0,
                },
                repeat_count=1,
                expected_response=[
@@ -821,9 +818,12 @@ def lora_chat_payload(
 @pytest.mark.e2e
 @pytest.mark.gpu_1
 @pytest.mark.model("Qwen/Qwen3-0.6B")
-@pytest.mark.timeout(600)
+@pytest.mark.profiled_vram_gib(4.0)  # actual nvidia-smi peak with kv-bytes cap
+@pytest.mark.requested_vllm_kv_cache_bytes(
+    941_712_000
+)  # 2x safety over min=470_855_680
+@pytest.mark.timeout(300)  # LoRA setup adds overhead; L4 machines are slower
 @pytest.mark.post_merge
-@pytest.mark.skip(reason="DYN-2260")
 def test_lora_aggregated(
    request,
    runtime_services_dynamic_ports,