Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
b1930a61
Unverified
Commit
b1930a61
authored
Feb 27, 2026
by
KrishnanPrash
Committed by
GitHub
Feb 27, 2026
Browse files
chore: migrate vllm e/p/d test from gpu_2 -> gpu_1 (#6638)
Signed-off-by:
Krishnan Prashanth
<
kprashanth@nvidia.com
>
parent
f1d5c95a
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
39 additions
and
9 deletions
+39
-9
examples/backends/vllm/launch/disagg_multimodal_epd.sh
examples/backends/vllm/launch/disagg_multimodal_epd.sh
+31
-4
tests/serve/test_vllm.py
tests/serve/test_vllm.py
+8
-5
No files found.
examples/backends/vllm/launch/disagg_multimodal_epd.sh
View file @
b1930a61
...
@@ -7,6 +7,16 @@ trap 'echo Cleaning up...; kill 0' EXIT
...
@@ -7,6 +7,16 @@ trap 'echo Cleaning up...; kill 0' EXIT
# Default values
# Default values
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
# --single-gpu: Packs all 3 workers (encode, prefill, decode) onto a single GPU.
# This is intended for functional testing with small models (e.g. 2B) where CI
# only has 1 GPU available. It reduces performance by:
# - Enabling --enforce-eager (disables torch.compile and CUDA graph capture)
# - Hardcoding P/D KV cache to 512 MB (skips all memory profiling)
# - Limiting --max-model-len to 4096 tokens on P/D workers
# - Limiting P/D workers to image=1,video=0,audio=0 (--limit-mm-per-prompt)
# - Using lower gpu-memory-utilization fractions to share the GPU
SINGLE_GPU
=
false
# Parse command line arguments
# Parse command line arguments
while
[[
$#
-gt
0
]]
;
do
while
[[
$#
-gt
0
]]
;
do
case
$1
in
case
$1
in
...
@@ -14,6 +24,10 @@ while [[ $# -gt 0 ]]; do
...
@@ -14,6 +24,10 @@ while [[ $# -gt 0 ]]; do
MODEL_NAME
=
$2
MODEL_NAME
=
$2
shift
2
shift
2
;;
;;
--single-gpu
)
SINGLE_GPU
=
true
shift
;;
-h
|
--help
)
-h
|
--help
)
echo
"Usage:
$0
[OPTIONS]"
echo
"Usage:
$0
[OPTIONS]"
echo
""
echo
""
...
@@ -22,12 +36,14 @@ while [[ $# -gt 0 ]]; do
...
@@ -22,12 +36,14 @@ while [[ $# -gt 0 ]]; do
echo
"Options:"
echo
"Options:"
echo
" --model <model_name> Specify the VLM model to use (default:
$MODEL_NAME
)"
echo
" --model <model_name> Specify the VLM model to use (default:
$MODEL_NAME
)"
echo
" LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates"
echo
" LLaVA 1.5 7B, Qwen2.5-VL, and Phi3V models have predefined templates"
echo
" --single-gpu Pack all 3 workers on 1 GPU (for small models, e.g. 2B)"
echo
" -h, --help Show this help message"
echo
" -h, --help Show this help message"
echo
""
echo
""
echo
"Examples:"
echo
"Examples:"
echo
"
$0
--model llava-hf/llava-1.5-7b-hf"
echo
"
$0
--model llava-hf/llava-1.5-7b-hf"
echo
"
$0
--model microsoft/Phi-3.5-vision-instruct"
echo
"
$0
--model microsoft/Phi-3.5-vision-instruct"
echo
"
$0
--model Qwen/Qwen2.5-VL-7B-Instruct"
echo
"
$0
--model Qwen/Qwen2.5-VL-7B-Instruct"
echo
"
$0
--model Qwen/Qwen3-VL-2B-Instruct --single-gpu"
echo
""
echo
""
exit
0
exit
0
;;
;;
...
@@ -41,7 +57,7 @@ done
...
@@ -41,7 +57,7 @@ done
echo
"=================================================="
echo
"=================================================="
echo
"Disaggregated Multimodal Serving"
echo
"Disaggregated Multimodal Serving
(E + P + D)
"
echo
"=================================================="
echo
"=================================================="
echo
"Model:
$MODEL_NAME
"
echo
"Model:
$MODEL_NAME
"
echo
"=================================================="
echo
"=================================================="
...
@@ -53,6 +69,7 @@ echo "Starting frontend..."
...
@@ -53,6 +69,7 @@ echo "Starting frontend..."
python
-m
dynamo.frontend &
python
-m
dynamo.frontend &
EXTRA_ARGS
=
""
EXTRA_ARGS
=
""
PD_EXTRA_ARGS
=
""
# GPU assignments (override via environment variables)
# GPU assignments (override via environment variables)
DYN_ENCODE_WORKER_GPU
=
${
DYN_ENCODE_WORKER_GPU
:-
0
}
DYN_ENCODE_WORKER_GPU
=
${
DYN_ENCODE_WORKER_GPU
:-
0
}
...
@@ -64,6 +81,17 @@ DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
...
@@ -64,6 +81,17 @@ DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_PREFILL_GPU_MEM
=
${
DYN_PREFILL_GPU_MEM
:-
0
.9
}
DYN_PREFILL_GPU_MEM
=
${
DYN_PREFILL_GPU_MEM
:-
0
.9
}
DYN_DECODE_GPU_MEM
=
${
DYN_DECODE_GPU_MEM
:-
0
.9
}
DYN_DECODE_GPU_MEM
=
${
DYN_DECODE_GPU_MEM
:-
0
.9
}
# 512 MB KV cache per P/D worker. Setting --kv-cache-memory-bytes bypasses vLLM's
# memory profiling entirely (both language model and multimodal encoder), which avoids
# OOM during profiling when 3 workers share a GPU. 512 MB covers the
# minimum vLLM requires for max_model_len=4096 on Qwen3-VL-2B.
PD_KV_CACHE_BYTES
=
$((
512
*
1024
*
1024
))
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
EXTRA_ARGS
=
"--enforce-eager"
PD_EXTRA_ARGS
=
"--max-model-len 4096 --kv-cache-memory-bytes
$PD_KV_CACHE_BYTES
--limit-mm-per-prompt {
\"
image
\"
:1,
\"
video
\"
:0,
\"
audio
\"
:0}"
fi
# Start encode worker
# Start encode worker
echo
"Starting encode worker on GPU
$DYN_ENCODE_WORKER_GPU
(GPU mem:
$DYN_ENCODE_GPU_MEM
)..."
echo
"Starting encode worker on GPU
$DYN_ENCODE_WORKER_GPU
(GPU mem:
$DYN_ENCODE_GPU_MEM
)..."
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20097
CUDA_VISIBLE_DEVICES
=
$DYN_ENCODE_WORKER_GPU
python
-m
dynamo.vllm
--multimodal-encode-worker
--enable-multimodal
--model
$MODEL_NAME
--gpu-memory-utilization
$DYN_ENCODE_GPU_MEM
$EXTRA_ARGS
--kv-transfer-config
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}'
&
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20097
CUDA_VISIBLE_DEVICES
=
$DYN_ENCODE_WORKER_GPU
python
-m
dynamo.vllm
--multimodal-encode-worker
--enable-multimodal
--model
$MODEL_NAME
--gpu-memory-utilization
$DYN_ENCODE_GPU_MEM
$EXTRA_ARGS
--kv-transfer-config
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}'
&
...
@@ -71,12 +99,12 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=$DYN_ENCODE_WORKER_GPU py
...
@@ -71,12 +99,12 @@ VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=$DYN_ENCODE_WORKER_GPU py
# Start prefill worker (also handles encode routing via --route-to-encoder)
# Start prefill worker (also handles encode routing via --route-to-encoder)
echo
"Starting prefill worker on GPU
$DYN_PREFILL_WORKER_GPU
(GPU mem:
$DYN_PREFILL_GPU_MEM
)..."
echo
"Starting prefill worker on GPU
$DYN_PREFILL_WORKER_GPU
(GPU mem:
$DYN_PREFILL_GPU_MEM
)..."
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20098
\
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20098
\
CUDA_VISIBLE_DEVICES
=
$DYN_PREFILL_WORKER_GPU
python
-m
dynamo.vllm
--multimodal-worker
--route-to-encoder
--disaggregation-mode
prefill
--enable-multimodal
--enable-mm-embeds
--model
$MODEL_NAME
--gpu-memory-utilization
$DYN_PREFILL_GPU_MEM
$EXTRA_ARGS
--kv-transfer-config
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}'
&
CUDA_VISIBLE_DEVICES
=
$DYN_PREFILL_WORKER_GPU
python
-m
dynamo.vllm
--multimodal-worker
--route-to-encoder
--disaggregation-mode
prefill
--enable-multimodal
--enable-mm-embeds
--model
$MODEL_NAME
--gpu-memory-utilization
$DYN_PREFILL_GPU_MEM
$EXTRA_ARGS
$PD_EXTRA_ARGS
--kv-transfer-config
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}'
&
# Start decode worker
# Start decode worker
echo
"Starting decode worker on GPU
$DYN_DECODE_WORKER_GPU
(GPU mem:
$DYN_DECODE_GPU_MEM
)..."
echo
"Starting decode worker on GPU
$DYN_DECODE_WORKER_GPU
(GPU mem:
$DYN_DECODE_GPU_MEM
)..."
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20099
\
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20099
\
CUDA_VISIBLE_DEVICES
=
$DYN_DECODE_WORKER_GPU
python
-m
dynamo.vllm
--multimodal-decode-worker
--enable-multimodal
--enable-mm-embeds
--model
$MODEL_NAME
--gpu-memory-utilization
$DYN_DECODE_GPU_MEM
$EXTRA_ARGS
--kv-transfer-config
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'
&
CUDA_VISIBLE_DEVICES
=
$DYN_DECODE_WORKER_GPU
python
-m
dynamo.vllm
--multimodal-decode-worker
--enable-multimodal
--enable-mm-embeds
--model
$MODEL_NAME
--gpu-memory-utilization
$DYN_DECODE_GPU_MEM
$EXTRA_ARGS
$PD_EXTRA_ARGS
--kv-transfer-config
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'
&
echo
"=================================================="
echo
"=================================================="
echo
"All components started. Waiting for initialization..."
echo
"All components started. Waiting for initialization..."
...
@@ -84,4 +112,3 @@ echo "=================================================="
...
@@ -84,4 +112,3 @@ echo "=================================================="
# Wait for all background processes to complete
# Wait for all background processes to complete
wait
wait
tests/serve/test_vllm.py
View file @
b1930a61
...
@@ -276,6 +276,7 @@ vllm_configs = {
...
@@ -276,6 +276,7 @@ vllm_configs = {
completion_payload_default
(),
completion_payload_default
(),
],
],
),
),
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements
"multimodal_disagg_qwen3vl_2b_e_pd"
:
VLLMConfig
(
"multimodal_disagg_qwen3vl_2b_e_pd"
:
VLLMConfig
(
name
=
"multimodal_disagg_qwen3vl_2b_e_pd"
,
name
=
"multimodal_disagg_qwen3vl_2b_e_pd"
,
directory
=
vllm_dir
,
directory
=
vllm_dir
,
...
@@ -335,20 +336,22 @@ vllm_configs = {
...
@@ -335,20 +336,22 @@ vllm_configs = {
)
)
],
],
),
),
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements
"multimodal_disagg_qwen3vl_2b_epd"
:
VLLMConfig
(
"multimodal_disagg_qwen3vl_2b_epd"
:
VLLMConfig
(
name
=
"multimodal_disagg_qwen3vl_2b_epd"
,
name
=
"multimodal_disagg_qwen3vl_2b_epd"
,
directory
=
vllm_dir
,
directory
=
vllm_dir
,
script_name
=
"disagg_multimodal_epd.sh"
,
script_name
=
"disagg_multimodal_epd.sh"
,
marks
=
[
pytest
.
mark
.
gpu_
2
,
pytest
.
mark
.
pre_merge
],
marks
=
[
pytest
.
mark
.
gpu_
1
,
pytest
.
mark
.
pre_merge
],
model
=
"Qwen/Qwen3-VL-2B-Instruct"
,
model
=
"Qwen/Qwen3-VL-2B-Instruct"
,
script_args
=
[
"--model"
,
"Qwen/Qwen3-VL-2B-Instruct"
],
script_args
=
[
"--model"
,
"Qwen/Qwen3-VL-2B-Instruct"
,
"--single-gpu"
],
timeout
=
360
,
env
=
{
env
=
{
"DYN_ENCODE_WORKER_GPU"
:
"0"
,
"DYN_ENCODE_WORKER_GPU"
:
"0"
,
"DYN_PREFILL_WORKER_GPU"
:
"0"
,
"DYN_PREFILL_WORKER_GPU"
:
"0"
,
"DYN_DECODE_WORKER_GPU"
:
"
1
"
,
"DYN_DECODE_WORKER_GPU"
:
"
0
"
,
"DYN_ENCODE_GPU_MEM"
:
"0.
4
"
,
"DYN_ENCODE_GPU_MEM"
:
"0.
1
"
,
"DYN_PREFILL_GPU_MEM"
:
"0.4"
,
"DYN_PREFILL_GPU_MEM"
:
"0.4"
,
"DYN_DECODE_GPU_MEM"
:
"0.
85
"
,
"DYN_DECODE_GPU_MEM"
:
"0.
4
"
,
},
},
request_payloads
=
[
request_payloads
=
[
chat_payload
(
chat_payload
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment