Unverified commit 326a702d, authored by Keiven C and committed by GitHub
Browse files

fix(tests): re-enable vLLM LoRA tests and fix gpu_1 test flakiness (#8094)


Signed-off-by: Keiven Chang <keivenc@nvidia.com>
parent a5b384f7
...@@ -33,6 +33,9 @@ done ...@@ -33,6 +33,9 @@ done
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# Default KV cache cap from profiling (2x safety over min=560 MB); ~3.8 GiB peak VRAM
# Profiler/test framework overrides via env
: "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:=1119388000}"
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
......
...@@ -17,6 +17,9 @@ MODEL="Qwen/Qwen3-0.6B" ...@@ -17,6 +17,9 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# Default KV cache cap from profiling (2x safety over min=560 MB); ~3.8 GiB peak VRAM
# Profiler/test framework overrides via env
: "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:=1119388000}"
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
......
...@@ -27,6 +27,9 @@ MODEL="Qwen/Qwen3-0.6B" ...@@ -27,6 +27,9 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# Default KV cache cap from profiling (2x safety over min=560 MB); ~3.8 GiB peak VRAM
# Profiler/test framework overrides via env
: "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:=1119388000}"
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
......
...@@ -97,6 +97,9 @@ case "$MODEL_NAME" in ...@@ -97,6 +97,9 @@ case "$MODEL_NAME" in
MODEL_EXTRA_ARGS="--tensor-parallel-size=8" ;; MODEL_EXTRA_ARGS="--tensor-parallel-size=8" ;;
esac esac
# Default KV cache cap from profiling (2x safety over min=461 MB); ~9.6 GiB peak VRAM
# Uses smallest profiled value across multimodal tests; profiler/test framework overrides via env
: "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:=922354000}"
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
# Start vLLM worker with vision model # Start vLLM worker with vision model
......
...@@ -50,6 +50,9 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" ...@@ -50,6 +50,9 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
export DYN_REQUEST_PLANE=$REQUEST_PLANE export DYN_REQUEST_PLANE=$REQUEST_PLANE
echo "Using request plane mode: $REQUEST_PLANE" echo "Using request plane mode: $REQUEST_PLANE"
# Default KV cache cap from profiling (2x safety over min=560 MB); ~3.8 GiB peak VRAM
# Profiler/test framework overrides via env
: "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:=1119388000}"
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
HTTP_PORT="${DYN_HTTP_PORT:-8000}" HTTP_PORT="${DYN_HTTP_PORT:-8000}"
......
...@@ -63,12 +63,15 @@ python -m dynamo.frontend & ...@@ -63,12 +63,15 @@ python -m dynamo.frontend &
MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}" MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}" MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
# Default KV cache cap from profiling (2x safety over min=471 MB); ~4.0 GiB peak VRAM
# Profiler/test framework overrides via env
: "${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:=941712000}"
GPU_MEM_ARGS=$(build_vllm_gpu_mem_args) GPU_MEM_ARGS=$(build_vllm_gpu_mem_args)
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \ DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=${SYSTEM_PORT} \
python -m dynamo.vllm --model "$MODEL" --enforce-eager \ python -m dynamo.vllm --model "$MODEL" --enforce-eager \
--max-model-len "$MAX_MODEL_LEN" \ --max-model-len "$MAX_MODEL_LEN" \
--max-num-seqs "$MAX_CONCURRENT_SEQS" \ --max-num-seqs "$MAX_CONCURRENT_SEQS" \
$GPU_MEM_ARGS & \ $GPU_MEM_ARGS \
--enable-lora \ --enable-lora \
--max-lora-rank 64 & --max-lora-rank 64 &
......
...@@ -53,6 +53,16 @@ def run_serve_deployment( ...@@ -53,6 +53,16 @@ def run_serve_deployment(
if extra_env: if extra_env:
merged_env.update(extra_env) merged_env.update(extra_env)
# In serial mode (no parallel scheduler), pass the marker's KV cache budget
# so the launch script's small default doesn't starve larger models.
# The parallel scheduler already sets this env var per-test.
if "_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES" not in os.environ:
kv_mark = request.node.get_closest_marker("requested_vllm_kv_cache_bytes")
if kv_mark:
merged_env.setdefault(
"_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES", str(int(kv_mark.args[0]))
)
# Stagger engine startup under xdist to avoid vLLM profiling race # Stagger engine startup under xdist to avoid vLLM profiling race
# (vLLM bug #10643: concurrent profilers miscount each other's memory). # (vLLM bug #10643: concurrent profilers miscount each other's memory).
worker_id = os.environ.get("PYTEST_XDIST_WORKER", "") worker_id = os.environ.get("PYTEST_XDIST_WORKER", "")
......
...@@ -403,16 +403,12 @@ vllm_configs = { ...@@ -403,16 +403,12 @@ vllm_configs = {
script_name="agg_multimodal.sh", script_name="agg_multimodal.sh",
marks=[ marks=[
pytest.mark.gpu_1, pytest.mark.gpu_1,
pytest.mark.profiled_vram_gib(14.9), # actual profiled peak with kv-bytes pytest.mark.profiled_vram_gib(19.2), # actual profiled peak with kv-bytes
pytest.mark.requested_vllm_kv_cache_bytes( pytest.mark.requested_vllm_kv_cache_bytes(
922_354_000 4_318_854_000
), # KV cache cap (2x safety over min=461_176_832) ), # KV cache cap (2x safety over min=2_159_426_560)
pytest.mark.timeout( pytest.mark.timeout(360), # 7B model; L4 machines need more headroom
300
), # ~7x observed 42.7s; 7B model loads ~48s on CI (A10G/L4)
pytest.mark.nightly, pytest.mark.nightly,
# https://github.com/ai-dynamo/dynamo/issues/4501
pytest.mark.xfail(strict=False),
], ],
model="llava-hf/llava-1.5-7b-hf", model="llava-hf/llava-1.5-7b-hf",
script_args=["--model", "llava-hf/llava-1.5-7b-hf"], script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
...@@ -511,6 +507,7 @@ vllm_configs = { ...@@ -511,6 +507,7 @@ vllm_configs = {
], ],
"tool_choice": "required", "tool_choice": "required",
"max_tokens": 1024, "max_tokens": 1024,
"temperature": 0,
}, },
repeat_count=1, repeat_count=1,
expected_response=[ expected_response=[
...@@ -821,9 +818,12 @@ def lora_chat_payload( ...@@ -821,9 +818,12 @@ def lora_chat_payload(
@pytest.mark.e2e @pytest.mark.e2e
@pytest.mark.gpu_1 @pytest.mark.gpu_1
@pytest.mark.model("Qwen/Qwen3-0.6B") @pytest.mark.model("Qwen/Qwen3-0.6B")
@pytest.mark.timeout(600) @pytest.mark.profiled_vram_gib(4.0) # actual nvidia-smi peak with kv-bytes cap
@pytest.mark.requested_vllm_kv_cache_bytes(
941_712_000
) # 2x safety over min=470_855_680
@pytest.mark.timeout(300) # LoRA setup adds overhead; L4 machines are slower
@pytest.mark.post_merge @pytest.mark.post_merge
@pytest.mark.skip(reason="DYN-2260")
def test_lora_aggregated( def test_lora_aggregated(
request, request,
runtime_services_dynamic_ports, runtime_services_dynamic_ports,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment