Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
dacb2980
Unverified
Commit
dacb2980
authored
Apr 03, 2026
by
Ryan McCormick
Committed by
GitHub
Apr 03, 2026
Browse files
chore(multimodal): Cleanup multimodal docs and consolidate launch scripts (#7845)
parent
2075eb67
Changes
8
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
119 additions
and
713 deletions
+119
-713
docs/features/multimodal/multimodal-vllm.md
docs/features/multimodal/multimodal-vllm.md
+54
-341
examples/backends/vllm/launch/agg_multimodal.sh
examples/backends/vllm/launch/agg_multimodal.sh
+34
-6
examples/backends/vllm/launch/disagg_multimodal_e_pd.sh
examples/backends/vllm/launch/disagg_multimodal_e_pd.sh
+3
-0
examples/backends/vllm/launch/disagg_multimodal_epd.sh
examples/backends/vllm/launch/disagg_multimodal_epd.sh
+25
-9
examples/backends/vllm/launch/disagg_multimodal_llama.sh
examples/backends/vllm/launch/disagg_multimodal_llama.sh
+0
-112
examples/backends/vllm/launch/video_agg.sh
examples/backends/vllm/launch/video_agg.sh
+0
-96
examples/backends/vllm/launch/video_disagg.sh
examples/backends/vllm/launch/video_disagg.sh
+0
-138
tests/serve/test_vllm.py
tests/serve/test_vllm.py
+3
-11
No files found.
docs/features/multimodal/multimodal-vllm.md
View file @
dacb2980
This diff is collapsed.
Click to expand it.
examples/backends/vllm/launch/agg_multimodal.sh
View file @
dacb2980
...
@@ -2,11 +2,11 @@
...
@@ -2,11 +2,11 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
#
#
# Aggregated multimodal serving with standard Dynamo preprocessing
# Aggregated multimodal
image/video
serving with standard Dynamo preprocessing
#
#
# Architecture: Single-worker PD (Prefill-Decode)
# Architecture: Single-worker PD (Prefill-Decode)
# - Frontend: Rust OpenAIPreprocessor
handles image URLs (HTTP and data:// base64)
# - Frontend: Rust OpenAIPreprocessor
forwards multimodal requests
# - Worker: Standard vLLM worker with
vision
model support
# - Worker: Standard vLLM worker with
multimodal
model support
#
#
# For EPD (Encode-Prefill-Decode) architecture with dedicated encoding worker,
# For EPD (Encode-Prefill-Decode) architecture with dedicated encoding worker,
# see agg_multimodal_epd.sh
# see agg_multimodal_epd.sh
...
@@ -19,7 +19,7 @@ source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
...
@@ -19,7 +19,7 @@ source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
source
"
$SCRIPT_DIR
/../../../common/launch_utils.sh"
source
"
$SCRIPT_DIR
/../../../common/launch_utils.sh"
# Default values
# Default values
MODEL_NAME
=
"Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
MODEL_NAME
=
"
${
DYN_MODEL_NAME
:-
Qwen
/Qwen3-VL-30B-A3B-Instruct-FP8
}
"
# Parse command line arguments
# Parse command line arguments
# Extra arguments are passed through to the vLLM worker
# Extra arguments are passed through to the vLLM worker
...
@@ -48,13 +48,41 @@ while [[ $# -gt 0 ]]; do
...
@@ -48,13 +48,41 @@ while [[ $# -gt 0 ]]; do
done
done
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
print_launch_banner
--multimodal
"Launching Aggregated Multimodal Serving"
"
$MODEL_NAME
"
"
$HTTP_PORT
"
# Use TCP transport (instead of default NATS)
# Use TCP transport (instead of default NATS)
# TCP is preferred for multimodal workloads because it overcomes:
# TCP is preferred for multimodal workloads because it overcomes:
# - NATS default 1MB max payload limit (multimodal base64 images can exceed this)
# - NATS default 1MB max payload limit (multimodal base64 images can exceed this)
export
DYN_REQUEST_PLANE
=
tcp
export
DYN_REQUEST_PLANE
=
tcp
print_launch_banner
--no-curl
"Launching Aggregated Multimodal Serving"
"
$MODEL_NAME
"
"
$HTTP_PORT
"
\
"Backend: dynamo.vllm --enable-multimodal"
\
"Media: image_url and video_url (model support dependent)"
print_curl_footer
<<
CURL
curl http://localhost:
${
HTTP_PORT
}
/v1/chat/completions
\\
-H 'Content-Type: application/json'
\\
-d '{
"model": "
${
MODEL_NAME
}
",
"messages": [{"role": "user", "content": [
{"type": "text", "text": "Describe the image"},
{"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/300px-PNG_transparency_demonstration_1.png"}}
]}],
"max_tokens": 50
}'
# For video-capable models such as Qwen/Qwen3-VL-2B-Instruct:
curl http://localhost:
${
HTTP_PORT
}
/v1/chat/completions
\\
-H 'Content-Type: application/json'
\\
-d '{
"model": "Qwen/Qwen3-VL-2B-Instruct",
"messages": [{"role": "user", "content": [
{"type": "text", "text": "Describe the video in detail"},
{"type": "video_url", "video_url": {"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"}}
]}],
"max_tokens": 128
}'
CURL
# Start frontend with Rust OpenAIPreprocessor
# Start frontend with Rust OpenAIPreprocessor
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python
-m
dynamo.frontend &
python
-m
dynamo.frontend &
...
@@ -65,7 +93,7 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
...
@@ -65,7 +93,7 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
MODEL_EXTRA_ARGS
=
""
MODEL_EXTRA_ARGS
=
""
case
"
$MODEL_NAME
"
in
case
"
$MODEL_NAME
"
in
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
)
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8
)
MAX_MODEL_LEN
=
"
${
MAX_MODEL_LEN
:-
108960
}
"
MAX_MODEL_LEN
=
"108960"
MODEL_EXTRA_ARGS
=
"--tensor-parallel-size=8"
;;
MODEL_EXTRA_ARGS
=
"--tensor-parallel-size=8"
;;
esac
esac
...
...
examples/backends/vllm/launch/disagg_multimodal_e_pd.sh
View file @
dacb2980
...
@@ -7,6 +7,9 @@ trap 'echo Cleaning up...; kill 0' EXIT
...
@@ -7,6 +7,9 @@ trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR
=
"
$(
dirname
"
$(
readlink
-f
"
$0
"
)
"
)
"
SCRIPT_DIR
=
"
$(
dirname
"
$(
readlink
-f
"
$0
"
)
"
)
"
source
"
$SCRIPT_DIR
/../../../common/launch_utils.sh"
source
"
$SCRIPT_DIR
/../../../common/launch_utils.sh"
# Use TCP transport for multimodal workloads (base64 images can exceed NATS 1MB limit)
export
DYN_REQUEST_PLANE
=
tcp
# Default values
# Default values
MODEL_NAME
=
"Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
MODEL_NAME
=
"Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
SINGLE_GPU
=
false
SINGLE_GPU
=
false
...
...
examples/backends/vllm/launch/disagg_multimodal_epd.sh
View file @
dacb2980
...
@@ -8,6 +8,9 @@ SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
...
@@ -8,6 +8,9 @@ SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source
"
$SCRIPT_DIR
/../../../common/gpu_utils.sh"
source
"
$SCRIPT_DIR
/../../../common/gpu_utils.sh"
source
"
$SCRIPT_DIR
/../../../common/launch_utils.sh"
source
"
$SCRIPT_DIR
/../../../common/launch_utils.sh"
# Use TCP transport for multimodal workloads (base64 images can exceed NATS 1MB limit)
export
DYN_REQUEST_PLANE
=
tcp
# Default values
# Default values
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
MODEL_NAME
=
"llava-hf/llava-1.5-7b-hf"
...
@@ -17,7 +20,7 @@ MODEL_NAME="llava-hf/llava-1.5-7b-hf"
...
@@ -17,7 +20,7 @@ MODEL_NAME="llava-hf/llava-1.5-7b-hf"
# - Enabling --enforce-eager (disables torch.compile and CUDA graph capture)
# - Enabling --enforce-eager (disables torch.compile and CUDA graph capture)
# - Hardcoding P/D KV cache to 512 MB (skips all memory profiling)
# - Hardcoding P/D KV cache to 512 MB (skips all memory profiling)
# - Limiting --max-model-len to 4096 tokens on P/D workers
# - Limiting --max-model-len to 4096 tokens on P/D workers
# - Limiting P/D workers to image=
1
,video=
0
,audio=0 (--limit-mm-per-prompt)
# - Limiting P/D workers to image=
3
,video=
3
,audio=0 (--limit-mm-per-prompt)
# - Using lower gpu-memory-utilization fractions to share the GPU
# - Using lower gpu-memory-utilization fractions to share the GPU
SINGLE_GPU
=
false
SINGLE_GPU
=
false
...
@@ -77,10 +80,17 @@ python -m dynamo.frontend &
...
@@ -77,10 +80,17 @@ python -m dynamo.frontend &
EXTRA_ARGS
=
""
EXTRA_ARGS
=
""
PD_EXTRA_ARGS
=
""
PD_EXTRA_ARGS
=
""
# GPU assignments (override via environment variables)
# GPU assignments (override via environment variables).
DYN_ENCODE_WORKER_GPU
=
${
DYN_ENCODE_WORKER_GPU
:-
0
}
# In single-GPU mode all 3 workers default to GPU 0.
DYN_PREFILL_WORKER_GPU
=
${
DYN_PREFILL_WORKER_GPU
:-
1
}
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
DYN_DECODE_WORKER_GPU
=
${
DYN_DECODE_WORKER_GPU
:-
2
}
DYN_ENCODE_WORKER_GPU
=
${
DYN_ENCODE_WORKER_GPU
:-
0
}
DYN_PREFILL_WORKER_GPU
=
${
DYN_PREFILL_WORKER_GPU
:-
0
}
DYN_DECODE_WORKER_GPU
=
${
DYN_DECODE_WORKER_GPU
:-
0
}
else
DYN_ENCODE_WORKER_GPU
=
${
DYN_ENCODE_WORKER_GPU
:-
0
}
DYN_PREFILL_WORKER_GPU
=
${
DYN_PREFILL_WORKER_GPU
:-
1
}
DYN_DECODE_WORKER_GPU
=
${
DYN_DECODE_WORKER_GPU
:-
2
}
fi
# GPU memory utilization for workers.
# GPU memory utilization for workers.
# NOTE: --kv-cache-memory-bytes (set below for P/D workers) overrides
# NOTE: --kv-cache-memory-bytes (set below for P/D workers) overrides
...
@@ -93,9 +103,15 @@ if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" ]]; then
...
@@ -93,9 +103,15 @@ if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" ]]; then
echo
"WARNING: _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE is set but has no effect here because"
>
&2
echo
"WARNING: _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE is set but has no effect here because"
>
&2
echo
" --kv-cache-memory-bytes overrides --gpu-memory-utilization in vLLM."
>
&2
echo
" --kv-cache-memory-bytes overrides --gpu-memory-utilization in vLLM."
>
&2
fi
fi
DYN_ENCODE_GPU_MEM
=
${
DYN_ENCODE_GPU_MEM
:-
0
.9
}
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
DYN_PREFILL_GPU_MEM
=
${
DYN_PREFILL_GPU_MEM
:-
0
.9
}
DYN_ENCODE_GPU_MEM
=
${
DYN_ENCODE_GPU_MEM
:-
0
.1
}
DYN_DECODE_GPU_MEM
=
${
DYN_DECODE_GPU_MEM
:-
0
.9
}
DYN_PREFILL_GPU_MEM
=
${
DYN_PREFILL_GPU_MEM
:-
0
.4
}
DYN_DECODE_GPU_MEM
=
${
DYN_DECODE_GPU_MEM
:-
0
.4
}
else
DYN_ENCODE_GPU_MEM
=
${
DYN_ENCODE_GPU_MEM
:-
0
.9
}
DYN_PREFILL_GPU_MEM
=
${
DYN_PREFILL_GPU_MEM
:-
0
.9
}
DYN_DECODE_GPU_MEM
=
${
DYN_DECODE_GPU_MEM
:-
0
.9
}
fi
# 512 MB KV cache per P/D worker. Setting --kv-cache-memory-bytes bypasses vLLM's
# 512 MB KV cache per P/D worker. Setting --kv-cache-memory-bytes bypasses vLLM's
# memory profiling entirely (both language model and multimodal encoder), which avoids
# memory profiling entirely (both language model and multimodal encoder), which avoids
...
@@ -105,7 +121,7 @@ PD_KV_CACHE_BYTES=$((512 * 1024 * 1024))
...
@@ -105,7 +121,7 @@ PD_KV_CACHE_BYTES=$((512 * 1024 * 1024))
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
EXTRA_ARGS
=
"--enforce-eager"
EXTRA_ARGS
=
"--enforce-eager"
PD_EXTRA_ARGS
=
"--max-model-len 4096 --kv-cache-memory-bytes
$PD_KV_CACHE_BYTES
--limit-mm-per-prompt {
\"
image
\"
:
1
,
\"
video
\"
:
0
,
\"
audio
\"
:0}"
PD_EXTRA_ARGS
=
"--max-model-len 4096 --kv-cache-memory-bytes
$PD_KV_CACHE_BYTES
--limit-mm-per-prompt {
\"
image
\"
:
3
,
\"
video
\"
:
3
,
\"
audio
\"
:0}"
fi
fi
# Start encode worker
# Start encode worker
...
...
examples/backends/vllm/launch/disagg_multimodal_llama.sh
deleted
100755 → 0
View file @
2075eb67
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set
-ex
SCRIPT_DIR
=
"
$(
dirname
"
$(
readlink
-f
"
$0
"
)
"
)
"
source
"
$SCRIPT_DIR
/../../../common/launch_utils.sh"
# Default values
HEAD_NODE
=
0
MODEL_NAME
=
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
EXTRA_ARGS
=()
# Parse command line arguments
while
[[
$#
-gt
0
]]
;
do
case
$1
in
--head-node
)
HEAD_NODE
=
1
shift
1
;;
--model
)
MODEL_NAME
=
$2
shift
2
;;
-h
|
--help
)
echo
"Usage:
$0
[OPTIONS]"
echo
""
echo
"Disaggregated multimodal serving with separate Prefill/Decode workers for Llama 4"
echo
""
echo
"Options:"
echo
" --head-node Run as head node. Head node will run the HTTP server, processor and prefill worker."
echo
" --model <model_name> Specify the VLM model to use (default:
$MODEL_NAME
)"
echo
" -h, --help Show this help message"
echo
""
echo
"Examples:"
echo
" # On head node:"
echo
"
$0
--head-node"
echo
""
echo
" # On worker node (requires NATS_SERVER and ETCD_ENDPOINTS pointing to head node):"
echo
"
$0
"
echo
""
exit
0
;;
*
)
EXTRA_ARGS+
=(
"
$1
"
)
shift
;;
esac
done
trap
'echo Cleaning up...; kill 0'
EXIT
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
if
[[
$HEAD_NODE
-eq
1
]]
;
then
print_launch_banner
--multimodal
"Launching Disaggregated Multimodal Llama 4 (Multi-Node)"
"
$MODEL_NAME
"
"
$HTTP_PORT
"
else
print_launch_banner
--no-curl
"Launching Disaggregated Multimodal Llama 4 (Multi-Node)"
"
$MODEL_NAME
"
"
$HTTP_PORT
"
fi
# Use TCP transport to avoid NATS payload limits for multimodal
export
DYN_REQUEST_PLANE
=
tcp
# Configure model-specific args
GPU_MEM
=
"0.80"
KV_BYTES
=
"
${
_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES
:-}
"
if
[[
-n
"
$KV_BYTES
"
]]
;
then
GPU_MEM_ARGS
=
"--kv-cache-memory-bytes
$KV_BYTES
--gpu-memory-utilization 0.01"
else
GPU_MEM_ARGS
=
"--gpu-memory-utilization
$GPU_MEM
"
fi
MODEL_SPECIFIC_ARGS
=
""
if
[[
"
$MODEL_NAME
"
==
"meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
]]
;
then
MODEL_SPECIFIC_ARGS
=
"--tensor-parallel-size=8 --max-model-len=208960
$GPU_MEM_ARGS
"
fi
if
[[
$HEAD_NODE
-eq
1
]]
;
then
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python
-m
dynamo.frontend &
# run processor (CPU-only to avoid competing for GPU memory with workers)
CUDA_VISIBLE_DEVICES
=
""
\
python
-m
dynamo.vllm
--route-to-encoder
--enable-multimodal
--model
$MODEL_NAME
&
# Prefill worker handles prompt processing and image encoding
# Uses all 8 GPUs for tensor-parallel
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
\
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20097
\
python
-m
dynamo.vllm
\
--enable-multimodal
\
--model
$MODEL_NAME
\
--disaggregation-mode
prefill
\
--kv-transfer-config
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
\
$MODEL_SPECIFIC_ARGS
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}'
\
"
${
EXTRA_ARGS
[@]
}
"
&
else
# run decode worker on non-head node
# Uses all 8 GPUs for tensor-parallel
CUDA_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
\
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20098
\
python
-m
dynamo.vllm
\
--enable-multimodal
\
--model
$MODEL_NAME
\
--kv-transfer-config
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
\
$MODEL_SPECIFIC_ARGS
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}'
\
"
${
EXTRA_ARGS
[@]
}
"
&
fi
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
examples/backends/vllm/launch/video_agg.sh
deleted
100755 → 0
View file @
2075eb67
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated video serving with standard Dynamo preprocessing and vLLM backend.
set
-euo
pipefail
cleanup
()
{
echo
"Cleaning up..."
local
pids
pids
=
"
$(
jobs
-pr
)
"
if
[[
-n
"
$pids
"
]]
;
then
kill
$pids
2>/dev/null
||
true
fi
}
trap
cleanup EXIT
SCRIPT_DIR
=
"
$(
cd
"
$(
dirname
"
${
BASH_SOURCE
[0]
}
"
)
"
&&
pwd
)
"
REPO_ROOT
=
"
$(
cd
"
$SCRIPT_DIR
/../../../.."
&&
pwd
)
"
source
"
$SCRIPT_DIR
/../../../common/gpu_utils.sh"
source
"
$SCRIPT_DIR
/../../../common/launch_utils.sh"
export
PYTHONPATH
=
"
${
REPO_ROOT
}
/components/src:
${
REPO_ROOT
}
/lib/bindings/python/src
${
PYTHONPATH
:+:
${
PYTHONPATH
}}
"
MODEL_NAME
=
"
${
DYN_MODEL_NAME
:-
Qwen
/Qwen3-VL-2B-Instruct
}
"
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
GPU_DEVICE
=
"
${
CUDA_VISIBLE_DEVICES
:-
0
}
"
MAX_MODEL_LEN
=
"
${
MAX_MODEL_LEN
:-
8192
}
"
MAX_NUM_SEQS
=
"
${
MAX_NUM_SEQS
:-
2
}
"
EXTRA_ARGS
=()
while
[[
$#
-gt
0
]]
;
do
case
$1
in
--model
)
MODEL_NAME
=
$2
shift
2
;;
-h
|
--help
)
cat
<<
USAGE
Usage:
$0
[OPTIONS] [-- EXTRA_VLLM_ARGS]
Options:
--model <model_name> Video-capable VLM to serve (default:
$MODEL_NAME
)
-h, --help Show this help message
Any arguments after '--' are passed through to the vLLM worker.
USAGE
exit
0
;;
--
)
shift
EXTRA_ARGS+
=(
"
$@
"
)
break
;;
*
)
EXTRA_ARGS+
=(
"
$1
"
)
shift
;;
esac
done
export
DYN_REQUEST_PLANE
=
tcp
GPU_MEM_ARGS
=
$(
build_gpu_mem_args vllm
)
print_launch_banner
--no-curl
"Launching Aggregated Video Serving"
"
$MODEL_NAME
"
"
$HTTP_PORT
"
\
"Backend: dynamo.vllm --enable-multimodal"
\
"Video path: Standard TokensPrompt multi_modal_data flow"
print_curl_footer
<<
CURL
curl http://localhost:
${
HTTP_PORT
}
/v1/chat/completions
\\
-H 'Content-Type: application/json'
\\
-d '{
"model": "
${
MODEL_NAME
}
",
"messages": [{"role": "user", "content": [
{"type": "text", "text": "Describe the video in detail"},
{"type": "video_url", "video_url": {"url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"}}
]}],
"max_tokens": 128
}'
CURL
python
-m
dynamo.frontend &
CUDA_VISIBLE_DEVICES
=
"
$GPU_DEVICE
"
\
python
-m
dynamo.vllm
\
--enable-multimodal
\
--model
"
$MODEL_NAME
"
\
--max-model-len
"
$MAX_MODEL_LEN
"
\
--max-num-seqs
"
$MAX_NUM_SEQS
"
\
$GPU_MEM_ARGS
\
"
${
EXTRA_ARGS
[@]
}
"
&
wait_any_exit
examples/backends/vllm/launch/video_disagg.sh
deleted
100755 → 0
View file @
2075eb67
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Disaggregated video serving with standard Dynamo preprocessing and vLLM backend.
set
-euo
pipefail
cleanup
()
{
echo
"Cleaning up..."
local
pids
pids
=
"
$(
jobs
-pr
)
"
if
[[
-n
"
$pids
"
]]
;
then
kill
$pids
2>/dev/null
||
true
fi
}
trap
cleanup EXIT
SCRIPT_DIR
=
"
$(
cd
"
$(
dirname
"
${
BASH_SOURCE
[0]
}
"
)
"
&&
pwd
)
"
REPO_ROOT
=
"
$(
cd
"
$SCRIPT_DIR
/../../../.."
&&
pwd
)
"
source
"
$SCRIPT_DIR
/../../../common/gpu_utils.sh"
source
"
$SCRIPT_DIR
/../../../common/launch_utils.sh"
export
PYTHONPATH
=
"
${
REPO_ROOT
}
/components/src:
${
REPO_ROOT
}
/lib/bindings/python/src
${
PYTHONPATH
:+:
${
PYTHONPATH
}}
"
MODEL_NAME
=
"
${
DYN_MODEL_NAME
:-
Qwen
/Qwen3-VL-2B-Instruct
}
"
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
SINGLE_GPU
=
false
EXTRA_ARGS
=()
while
[[
$#
-gt
0
]]
;
do
case
$1
in
--model
)
MODEL_NAME
=
$2
shift
2
;;
--single-gpu
)
SINGLE_GPU
=
true
shift
;;
-h
|
--help
)
cat
<<
USAGE
Usage:
$0
[OPTIONS] [-- EXTRA_VLLM_ARGS]
Options:
--model <model_name> Video-capable VLM to serve (default:
$MODEL_NAME
)
--single-gpu Run prefill and decode on one GPU for functional testing
-h, --help Show this help message
Any arguments after '--' are passed through to both vLLM workers.
USAGE
exit
0
;;
--
)
shift
EXTRA_ARGS+
=(
"
$@
"
)
break
;;
*
)
EXTRA_ARGS+
=(
"
$1
"
)
shift
;;
esac
done
export
DYN_REQUEST_PLANE
=
tcp
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
GPU_LABEL
=
"1 GPU"
PREFILL_GPU
=
"
${
DYN_PREFILL_WORKER_GPU
:-${
CUDA_VISIBLE_DEVICES
:-
0
}}
"
DECODE_GPU
=
"
${
DYN_DECODE_WORKER_GPU
:-${
CUDA_VISIBLE_DEVICES
:-
0
}}
"
MAX_MODEL_LEN
=
"
${
MAX_MODEL_LEN
:-
4096
}
"
PD_KV_CACHE_BYTES
=
$((
512
*
1024
*
1024
))
SHARED_GPU_FRACTION
=
$(
build_gpu_mem_args vllm
--workers-per-gpu
2
)
PREFILL_GPU_MEM
=
"
${
DYN_PREFILL_GPU_MEM
:-${
SHARED_GPU_FRACTION
:-
0
.45
}}
"
DECODE_GPU_MEM
=
"
${
DYN_DECODE_GPU_MEM
:-${
SHARED_GPU_FRACTION
:-
0
.45
}}
"
SHARED_ARGS
=(
--enforce-eager
--max-model-len
"
$MAX_MODEL_LEN
"
--kv-cache-memory-bytes
"
$PD_KV_CACHE_BYTES
"
--limit-mm-per-prompt
'{"image":1,"video":1,"audio":0}'
)
else
GPU_LABEL
=
"2 GPUs"
PREFILL_GPU
=
"
${
DYN_PREFILL_WORKER_GPU
:-
0
}
"
DECODE_GPU
=
"
${
DYN_DECODE_WORKER_GPU
:-
1
}
"
MAX_MODEL_LEN
=
"
${
MAX_MODEL_LEN
:-
8192
}
"
GPU_MEM_ARGS
=
$(
build_gpu_mem_args vllm
)
PREFILL_GPU_MEM
=
"
${
DYN_PREFILL_GPU_MEM
:-${
GPU_MEM_ARGS
:-
0
.9
}}
"
DECODE_GPU_MEM
=
"
${
DYN_DECODE_GPU_MEM
:-${
GPU_MEM_ARGS
:-
0
.9
}}
"
SHARED_ARGS
=(
--max-model-len
"
$MAX_MODEL_LEN
"
)
fi
print_launch_banner
--no-curl
"Launching Disaggregated Video Serving (
$GPU_LABEL
)"
"
$MODEL_NAME
"
"
$HTTP_PORT
"
\
"Backend: Prefill + decode workers via dynamo.vllm"
\
"Video path: Standard TokensPrompt multi_modal_data flow"
print_curl_footer
<<
CURL
curl http://localhost:
${
HTTP_PORT
}
/v1/chat/completions
\\
-H 'Content-Type: application/json'
\\
-d '{
"model": "
${
MODEL_NAME
}
",
"messages": [{"role": "user", "content": [
{"type": "text", "text": "Describe the video in detail"},
{"type": "video_url", "video_url": {"url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"}}
]}],
"max_tokens": 128
}'
CURL
python
-m
dynamo.frontend &
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20098
\
CUDA_VISIBLE_DEVICES
=
"
$PREFILL_GPU
"
\
python
-m
dynamo.vllm
\
--disaggregation-mode
prefill
\
--enable-multimodal
\
--model
"
$MODEL_NAME
"
\
--gpu-memory-utilization
"
$PREFILL_GPU_MEM
"
\
"
${
SHARED_ARGS
[@]
}
"
\
--kv-transfer-config
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}'
\
"
${
EXTRA_ARGS
[@]
}
"
&
VLLM_NIXL_SIDE_CHANNEL_PORT
=
20099
\
CUDA_VISIBLE_DEVICES
=
"
$DECODE_GPU
"
\
python
-m
dynamo.vllm
\
--disaggregation-mode
decode
\
--enable-multimodal
\
--model
"
$MODEL_NAME
"
\
--gpu-memory-utilization
"
$DECODE_GPU_MEM
"
\
"
${
SHARED_ARGS
[@]
}
"
\
--kv-transfer-config
'{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
\
--kv-events-config
'{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}'
\
"
${
EXTRA_ARGS
[@]
}
"
&
wait_any_exit
tests/serve/test_vllm.py
View file @
dacb2980
...
@@ -428,14 +428,6 @@ vllm_configs = {
...
@@ -428,14 +428,6 @@ vllm_configs = {
model
=
"Qwen/Qwen3-VL-2B-Instruct"
,
model
=
"Qwen/Qwen3-VL-2B-Instruct"
,
script_args
=
[
"--model"
,
"Qwen/Qwen3-VL-2B-Instruct"
,
"--single-gpu"
],
script_args
=
[
"--model"
,
"Qwen/Qwen3-VL-2B-Instruct"
,
"--single-gpu"
],
timeout
=
300
,
timeout
=
300
,
env
=
{
"DYN_ENCODE_WORKER_GPU"
:
"0"
,
"DYN_PREFILL_WORKER_GPU"
:
"0"
,
"DYN_DECODE_WORKER_GPU"
:
"0"
,
"DYN_ENCODE_GPU_MEM"
:
"0.1"
,
"DYN_PREFILL_GPU_MEM"
:
"0.4"
,
"DYN_DECODE_GPU_MEM"
:
"0.4"
,
},
request_payloads
=
[
request_payloads
=
[
chat_payload
(
chat_payload
(
[
[
...
@@ -536,11 +528,11 @@ vllm_configs = {
...
@@ -536,11 +528,11 @@ vllm_configs = {
),
),
],
],
),
),
# Video multimodal tests for CI us
ing
the
vLLM video launch scripts
.
# Video multimodal tests for CI us
e
the
canonical aggregated multimodal launcher
.
"multimodal_video_agg"
:
VLLMConfig
(
"multimodal_video_agg"
:
VLLMConfig
(
name
=
"multimodal_video_agg"
,
name
=
"multimodal_video_agg"
,
directory
=
vllm_dir
,
directory
=
vllm_dir
,
script_name
=
"
video_agg
.sh"
,
script_name
=
"
agg_multimodal
.sh"
,
marks
=
[
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
pre_merge
,
...
@@ -568,7 +560,7 @@ vllm_configs = {
...
@@ -568,7 +560,7 @@ vllm_configs = {
"multimodal_video_disagg"
:
VLLMConfig
(
"multimodal_video_disagg"
:
VLLMConfig
(
name
=
"multimodal_video_disagg"
,
name
=
"multimodal_video_disagg"
,
directory
=
vllm_dir
,
directory
=
vllm_dir
,
script_name
=
"
video_
disagg.sh"
,
script_name
=
"disagg
_multimodal_epd
.sh"
,
marks
=
[
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
pre_merge
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment