Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
326a702d
Unverified
Commit
326a702d
authored
Apr 14, 2026
by
Keiven C
Committed by
GitHub
Apr 14, 2026
Browse files
fix(tests): re-enable vLLM LoRA tests and fix gpu_1 test flakiness (#8094)
Signed-off-by:
Keiven Chang
<
keivenc@nvidia.com
>
parent
a5b384f7
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
39 additions
and
11 deletions
+39
-11
examples/backends/vllm/launch/agg.sh
examples/backends/vllm/launch/agg.sh
+3
-0
examples/backends/vllm/launch/agg_lmcache.sh
examples/backends/vllm/launch/agg_lmcache.sh
+3
-0
examples/backends/vllm/launch/agg_lmcache_multiproc.sh
examples/backends/vllm/launch/agg_lmcache_multiproc.sh
+3
-0
examples/backends/vllm/launch/agg_multimodal.sh
examples/backends/vllm/launch/agg_multimodal.sh
+3
-0
examples/backends/vllm/launch/agg_request_planes.sh
examples/backends/vllm/launch/agg_request_planes.sh
+3
-0
examples/backends/vllm/launch/lora/agg_lora.sh
examples/backends/vllm/launch/lora/agg_lora.sh
+4
-1
tests/serve/common.py
tests/serve/common.py
+10
-0
tests/serve/test_vllm.py
tests/serve/test_vllm.py
+10
-10
No files found.
examples/backends/vllm/launch/agg.sh
View file @
326a702d
...
@@ -33,6 +33,9 @@ done
...
@@ -33,6 +33,9 @@ done
MAX_MODEL_LEN
=
"
${
MAX_MODEL_LEN
:-
4096
}
"
MAX_MODEL_LEN
=
"
${
MAX_MODEL_LEN
:-
4096
}
"
MAX_CONCURRENT_SEQS
=
"
${
MAX_CONCURRENT_SEQS
:-
2
}
"
MAX_CONCURRENT_SEQS
=
"
${
MAX_CONCURRENT_SEQS
:-
2
}
"
# Default KV cache cap from profiling (2x safety over min=560 MiB); ~3.8 GiB peak VRAM
# Profiler/test framework overrides via env
:
"
${
_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES
:
=1119388000
}
"
GPU_MEM_ARGS
=
$(
build_vllm_gpu_mem_args
)
GPU_MEM_ARGS
=
$(
build_vllm_gpu_mem_args
)
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
...
...
examples/backends/vllm/launch/agg_lmcache.sh
View file @
326a702d
...
@@ -17,6 +17,9 @@ MODEL="Qwen/Qwen3-0.6B"
...
@@ -17,6 +17,9 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN
=
"
${
MAX_MODEL_LEN
:-
4096
}
"
MAX_MODEL_LEN
=
"
${
MAX_MODEL_LEN
:-
4096
}
"
MAX_CONCURRENT_SEQS
=
"
${
MAX_CONCURRENT_SEQS
:-
2
}
"
MAX_CONCURRENT_SEQS
=
"
${
MAX_CONCURRENT_SEQS
:-
2
}
"
# Default KV cache cap from profiling (2x safety over min=560 MiB); ~3.8 GiB peak VRAM
# Profiler/test framework overrides via env
:
"
${
_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES
:
=1119388000
}
"
GPU_MEM_ARGS
=
$(
build_vllm_gpu_mem_args
)
GPU_MEM_ARGS
=
$(
build_vllm_gpu_mem_args
)
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
...
...
examples/backends/vllm/launch/agg_lmcache_multiproc.sh
View file @
326a702d
...
@@ -27,6 +27,9 @@ MODEL="Qwen/Qwen3-0.6B"
...
@@ -27,6 +27,9 @@ MODEL="Qwen/Qwen3-0.6B"
MAX_MODEL_LEN
=
"
${
MAX_MODEL_LEN
:-
4096
}
"
MAX_MODEL_LEN
=
"
${
MAX_MODEL_LEN
:-
4096
}
"
MAX_CONCURRENT_SEQS
=
"
${
MAX_CONCURRENT_SEQS
:-
2
}
"
MAX_CONCURRENT_SEQS
=
"
${
MAX_CONCURRENT_SEQS
:-
2
}
"
# Default KV cache cap from profiling (2x safety over min=560 MiB); ~3.8 GiB peak VRAM
# Profiler/test framework overrides via env
:
"
${
_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES
:
=1119388000
}
"
GPU_MEM_ARGS
=
$(
build_vllm_gpu_mem_args
)
GPU_MEM_ARGS
=
$(
build_vllm_gpu_mem_args
)
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
...
...
examples/backends/vllm/launch/agg_multimodal.sh
View file @
326a702d
...
@@ -97,6 +97,9 @@ case "$MODEL_NAME" in
...
@@ -97,6 +97,9 @@ case "$MODEL_NAME" in
MODEL_EXTRA_ARGS
=
"--tensor-parallel-size=8"
;;
MODEL_EXTRA_ARGS
=
"--tensor-parallel-size=8"
;;
esac
esac
# Default KV cache cap from profiling (2x safety over min=461 MiB); ~9.6 GiB peak VRAM
# Uses smallest profiled value across multimodal tests; profiler/test framework overrides via env
:
"
${
_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES
:
=922354000
}
"
GPU_MEM_ARGS
=
$(
build_vllm_gpu_mem_args
)
GPU_MEM_ARGS
=
$(
build_vllm_gpu_mem_args
)
# Start vLLM worker with vision model
# Start vLLM worker with vision model
...
...
examples/backends/vllm/launch/agg_request_planes.sh
View file @
326a702d
...
@@ -50,6 +50,9 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
...
@@ -50,6 +50,9 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
export
DYN_REQUEST_PLANE
=
$REQUEST_PLANE
export
DYN_REQUEST_PLANE
=
$REQUEST_PLANE
echo
"Using request plane mode:
$REQUEST_PLANE
"
echo
"Using request plane mode:
$REQUEST_PLANE
"
# Default KV cache cap from profiling (2x safety over min=560 MiB); ~3.8 GiB peak VRAM
# Profiler/test framework overrides via env
:
"
${
_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES
:
=1119388000
}
"
GPU_MEM_ARGS
=
$(
build_vllm_gpu_mem_args
)
GPU_MEM_ARGS
=
$(
build_vllm_gpu_mem_args
)
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
...
...
examples/backends/vllm/launch/lora/agg_lora.sh
View file @
326a702d
...
@@ -63,12 +63,15 @@ python -m dynamo.frontend &
...
@@ -63,12 +63,15 @@ python -m dynamo.frontend &
MAX_MODEL_LEN
=
"
${
MAX_MODEL_LEN
:-
4096
}
"
MAX_MODEL_LEN
=
"
${
MAX_MODEL_LEN
:-
4096
}
"
MAX_CONCURRENT_SEQS
=
"
${
MAX_CONCURRENT_SEQS
:-
2
}
"
MAX_CONCURRENT_SEQS
=
"
${
MAX_CONCURRENT_SEQS
:-
2
}
"
# Default KV cache cap from profiling (2x safety over min=471 MiB); ~4.0 GiB peak VRAM
# Profiler/test framework overrides via env
:
"
${
_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES
:
=941712000
}
"
GPU_MEM_ARGS
=
$(
build_vllm_gpu_mem_args
)
GPU_MEM_ARGS
=
$(
build_vllm_gpu_mem_args
)
DYN_SYSTEM_ENABLED
=
true
DYN_SYSTEM_PORT
=
${
SYSTEM_PORT
}
\
DYN_SYSTEM_ENABLED
=
true
DYN_SYSTEM_PORT
=
${
SYSTEM_PORT
}
\
python
-m
dynamo.vllm
--model
"
$MODEL
"
--enforce-eager
\
python
-m
dynamo.vllm
--model
"
$MODEL
"
--enforce-eager
\
--max-model-len
"
$MAX_MODEL_LEN
"
\
--max-model-len
"
$MAX_MODEL_LEN
"
\
--max-num-seqs
"
$MAX_CONCURRENT_SEQS
"
\
--max-num-seqs
"
$MAX_CONCURRENT_SEQS
"
\
$GPU_MEM_ARGS
&
\
$GPU_MEM_ARGS
\
--enable-lora
\
--enable-lora
\
--max-lora-rank
64 &
--max-lora-rank
64 &
...
...
tests/serve/common.py
View file @
326a702d
...
@@ -53,6 +53,16 @@ def run_serve_deployment(
...
@@ -53,6 +53,16 @@ def run_serve_deployment(
if
extra_env
:
if
extra_env
:
merged_env
.
update
(
extra_env
)
merged_env
.
update
(
extra_env
)
# In serial mode (no parallel scheduler), pass the marker's KV cache budget
# so the launch script's small default doesn't starve larger models.
# The parallel scheduler already sets this env var per-test.
if
"_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES"
not
in
os
.
environ
:
kv_mark
=
request
.
node
.
get_closest_marker
(
"requested_vllm_kv_cache_bytes"
)
if
kv_mark
:
merged_env
.
setdefault
(
"_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES"
,
str
(
int
(
kv_mark
.
args
[
0
]))
)
# Stagger engine startup under xdist to avoid vLLM profiling race
# Stagger engine startup under xdist to avoid vLLM profiling race
# (vLLM bug #10643: concurrent profilers miscount each other's memory).
# (vLLM bug #10643: concurrent profilers miscount each other's memory).
worker_id
=
os
.
environ
.
get
(
"PYTEST_XDIST_WORKER"
,
""
)
worker_id
=
os
.
environ
.
get
(
"PYTEST_XDIST_WORKER"
,
""
)
...
...
tests/serve/test_vllm.py
View file @
326a702d
...
@@ -403,16 +403,12 @@ vllm_configs = {
...
@@ -403,16 +403,12 @@ vllm_configs = {
script_name
=
"agg_multimodal.sh"
,
script_name
=
"agg_multimodal.sh"
,
marks
=
[
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
profiled_vram_gib
(
1
4.9
),
# actual profiled peak with kv-bytes
pytest
.
mark
.
profiled_vram_gib
(
1
9.2
),
# actual profiled peak with kv-bytes
pytest
.
mark
.
requested_vllm_kv_cache_bytes
(
pytest
.
mark
.
requested_vllm_kv_cache_bytes
(
922_354_000
4_318_854_000
),
# KV cache cap (2x safety over min=461_176_832)
),
# KV cache cap (2x safety over min=2_159_426_560)
pytest
.
mark
.
timeout
(
pytest
.
mark
.
timeout
(
360
),
# 7B model; L4 machines need more headroom
300
),
# ~7x observed 42.7s; 7B model loads ~48s on CI (A10G/L4)
pytest
.
mark
.
nightly
,
pytest
.
mark
.
nightly
,
# https://github.com/ai-dynamo/dynamo/issues/4501
pytest
.
mark
.
xfail
(
strict
=
False
),
],
],
model
=
"llava-hf/llava-1.5-7b-hf"
,
model
=
"llava-hf/llava-1.5-7b-hf"
,
script_args
=
[
"--model"
,
"llava-hf/llava-1.5-7b-hf"
],
script_args
=
[
"--model"
,
"llava-hf/llava-1.5-7b-hf"
],
...
@@ -511,6 +507,7 @@ vllm_configs = {
...
@@ -511,6 +507,7 @@ vllm_configs = {
],
],
"tool_choice"
:
"required"
,
"tool_choice"
:
"required"
,
"max_tokens"
:
1024
,
"max_tokens"
:
1024
,
"temperature"
:
0
,
},
},
repeat_count
=
1
,
repeat_count
=
1
,
expected_response
=
[
expected_response
=
[
...
@@ -821,9 +818,12 @@ def lora_chat_payload(
...
@@ -821,9 +818,12 @@ def lora_chat_payload(
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
model
(
"Qwen/Qwen3-0.6B"
)
@
pytest
.
mark
.
model
(
"Qwen/Qwen3-0.6B"
)
@
pytest
.
mark
.
timeout
(
600
)
@
pytest
.
mark
.
profiled_vram_gib
(
4.0
)
# actual nvidia-smi peak with kv-bytes cap
@
pytest
.
mark
.
requested_vllm_kv_cache_bytes
(
941_712_000
)
# 2x safety over min=470_855_680
@
pytest
.
mark
.
timeout
(
300
)
# LoRA setup adds overhead; L4 machines are slower
@
pytest
.
mark
.
post_merge
@
pytest
.
mark
.
post_merge
@
pytest
.
mark
.
skip
(
reason
=
"DYN-2260"
)
def
test_lora_aggregated
(
def
test_lora_aggregated
(
request
,
request
,
runtime_services_dynamic_ports
,
runtime_services_dynamic_ports
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment