Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
d92d5c77
Unverified
Commit
d92d5c77
authored
Mar 17, 2026
by
KrishnanPrash
Committed by
GitHub
Mar 17, 2026
Browse files
ci: add E/P/D multimodal CI coverage for SGLang (#7444)
Signed-off-by:
Krishnan Prashanth
<
kprashanth@nvidia.com
>
parent
930721c8
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
105 additions
and
8 deletions
+105
-8
examples/backends/sglang/launch/multimodal_disagg.sh
examples/backends/sglang/launch/multimodal_disagg.sh
+72
-8
tests/serve/test_sglang.py
tests/serve/test_sglang.py
+33
-0
No files found.
examples/backends/sglang/launch/multimodal_disagg.sh
View file @
d92d5c77
...
...
@@ -2,8 +2,8 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Multimodal E/P/D: encoder (GPU 0), prefill (GPU 1), decode (GPU 2).
# GPUs: 3
# Multimodal E/P/D: separate encoder, prefill, and decode workers.
# Default: 3 GPUs (one per worker). Use --single-gpu to co-locate all on GPU 0.
set
-e
trap
'echo Cleaning up...; kill 0'
EXIT
...
...
@@ -16,6 +16,12 @@ MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
CHAT_TEMPLATE
=
"qwen2-vl"
PROVIDED_CHAT_TEMPLATE
=
""
# --single-gpu: Packs all workers (encode, prefill, decode) onto a single GPU.
# This is intended for functional testing with small models (e.g. 2B) where CI
# only has 1 GPU available. Prefill and decode receive --mem-fraction-static from
# DYN_PREFILL_GPU_MEM / DYN_DECODE_GPU_MEM (default 0.9; lower them via the
# environment if the workers do not fit together) plus memory-saving options.
SINGLE_GPU
=
false
# Parse command line arguments
while
[[
$#
-gt
0
]]
;
do
case
$1
in
...
...
@@ -31,12 +37,17 @@ while [[ $# -gt 0 ]]; do
PROVIDED_CHAT_TEMPLATE
=
$2
shift
2
;;
--single-gpu
)
SINGLE_GPU
=
true
shift
;;
-h
|
--help
)
echo
"Usage:
$0
[OPTIONS]"
echo
"Options:"
echo
" --model <model_name> Specify the model to use (default:
$MODEL_NAME
)"
echo
" --served-model-name <served_model_name> Specify the served model name to use (default: empty)"
echo
" --chat-template <template> Specify the SGLang chat template to use (default:
$CHAT_TEMPLATE
)"
echo
" --single-gpu Pack all workers on 1 GPU (for small models, e.g. 2B)"
echo
" -h, --help Show this help message"
exit
0
;;
...
...
@@ -59,8 +70,41 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
SERVED_MODEL_ARG
=
"--served-model-name
$SERVED_MODEL_NAME
"
fi
# GPU assignments (override via environment variables)
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
DYN_ENCODE_WORKER_GPU
=
${
DYN_ENCODE_WORKER_GPU
:-
0
}
DYN_PREFILL_WORKER_GPU
=
${
DYN_PREFILL_WORKER_GPU
:-
0
}
DYN_DECODE_WORKER_GPU
=
${
DYN_DECODE_WORKER_GPU
:-
0
}
else
DYN_ENCODE_WORKER_GPU
=
${
DYN_ENCODE_WORKER_GPU
:-
0
}
DYN_PREFILL_WORKER_GPU
=
${
DYN_PREFILL_WORKER_GPU
:-
1
}
DYN_DECODE_WORKER_GPU
=
${
DYN_DECODE_WORKER_GPU
:-
2
}
fi
# GPU memory fractions for workers (used with --mem-fraction-static)
DYN_ENCODE_GPU_MEM
=
${
DYN_ENCODE_GPU_MEM
:-
0
.9
}
DYN_PREFILL_GPU_MEM
=
${
DYN_PREFILL_GPU_MEM
:-
0
.9
}
DYN_DECODE_GPU_MEM
=
${
DYN_DECODE_GPU_MEM
:-
0
.9
}
ENCODE_EXTRA_ARGS
=
""
PREFILL_EXTRA_ARGS
=
""
DECODE_EXTRA_ARGS
=
""
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
# 3 workers share one GPU. --max-total-tokens caps the KV cache to a small
# functional-test size so the last worker can initialize without OOM.
# --context-length keeps the per-request token pool allocation small.
ENCODE_EXTRA_ARGS
=
""
PREFILL_EXTRA_ARGS
=
"--mem-fraction-static
${
DYN_PREFILL_GPU_MEM
}
--delete-ckpt-after-loading --max-running-requests 2 --context-length 2048 --max-total-tokens 1024"
DECODE_EXTRA_ARGS
=
"--mem-fraction-static
${
DYN_DECODE_GPU_MEM
}
--delete-ckpt-after-loading --max-running-requests 2 --context-length 2048 --max-total-tokens 1024"
fi
# Prevent port collisions: the test framework exports DYN_SYSTEM_PORT which all
# child processes would inherit. Unset it so only workers that need it set their own.
unset
DYN_SYSTEM_PORT
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
print_launch_banner
--multimodal
"Launching Disaggregated Multimodal E/P/D
(3 GPUs)
"
"
$MODEL_NAME
"
"
$HTTP_PORT
"
print_launch_banner
--multimodal
"Launching Disaggregated Multimodal E/P/D"
"
$MODEL_NAME
"
"
$HTTP_PORT
"
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
...
...
@@ -70,12 +114,23 @@ python3 -m dynamo.frontend &
python3
-m
dynamo.sglang
--multimodal-processor
--model-path
"
$MODEL_NAME
"
$SERVED_MODEL_ARG
--chat-template
"
$CHAT_TEMPLATE
"
&
# run SGLang multimodal encode worker
CUDA_VISIBLE_DEVICES
=
0 python3
-m
dynamo.sglang
--multimodal-encode-worker
--model-path
"
$MODEL_NAME
"
$SERVED_MODEL_ARG
--chat-template
"
$CHAT_TEMPLATE
"
&
echo
"Starting encode worker on GPU
$DYN_ENCODE_WORKER_GPU
(GPU mem:
$DYN_ENCODE_GPU_MEM
)..."
DYN_SYSTEM_PORT
=
${
DYN_SYSTEM_PORT1
:-
8081
}
\
CUDA_VISIBLE_DEVICES
=
$DYN_ENCODE_WORKER_GPU
python3
-m
dynamo.sglang
--multimodal-encode-worker
--model-path
"
$MODEL_NAME
"
$SERVED_MODEL_ARG
--chat-template
"
$CHAT_TEMPLATE
"
$ENCODE_EXTRA_ARGS
&
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
# Wait for encode worker to initialize before starting prefill worker.
# This prevents workers from competing for GPU memory simultaneously, which can cause OOM.
echo
"Waiting for encode worker to initialize..."
sleep
5
fi
# run SGLang multimodal prefill worker
# TODO: Remove disable-radix-cache once the issue is fixed.
# See https://github.com/sgl-project/sglang/pull/11203.
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.sglang
\
echo
"Starting prefill worker on GPU
$DYN_PREFILL_WORKER_GPU
(GPU mem:
$DYN_PREFILL_GPU_MEM
)..."
DYN_SYSTEM_PORT
=
${
DYN_SYSTEM_PORT2
:-
8082
}
\
CUDA_VISIBLE_DEVICES
=
$DYN_PREFILL_WORKER_GPU
python3
-m
dynamo.sglang
\
--multimodal-worker
\
--model-path
"
$MODEL_NAME
"
\
$SERVED_MODEL_ARG
\
...
...
@@ -87,10 +142,18 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--disaggregation-bootstrap-port
12345
\
--host
0.0.0.0
\
--disable-radix-cache
\
--disaggregation-transfer-backend
nixl &
--disaggregation-transfer-backend
nixl
\
$PREFILL_EXTRA_ARGS
&
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
# Wait for prefill worker to initialize before starting decode worker.
echo
"Waiting for prefill worker to initialize..."
sleep
5
fi
# run SGLang multimodal decode worker
CUDA_VISIBLE_DEVICES
=
2 python3
-m
dynamo.sglang
\
echo
"Starting decode worker on GPU
$DYN_DECODE_WORKER_GPU
(GPU mem:
$DYN_DECODE_GPU_MEM
)..."
CUDA_VISIBLE_DEVICES
=
$DYN_DECODE_WORKER_GPU
python3
-m
dynamo.sglang
\
--multimodal-worker
\
--model-path
"
$MODEL_NAME
"
\
$SERVED_MODEL_ARG
\
...
...
@@ -101,7 +164,8 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \
--disaggregation-mode
decode
\
--disaggregation-bootstrap-port
12345
\
--host
0.0.0.0
\
--disaggregation-transfer-backend
nixl &
--disaggregation-transfer-backend
nixl
\
$DECODE_EXTRA_ARGS
&
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
tests/serve/test_sglang.py
View file @
d92d5c77
...
...
@@ -205,6 +205,39 @@ sglang_configs = {
)
],
),
"multimodal_disagg_qwen"
:
SGLangConfig
(
# E/P/D architecture: Encode, Prefill, Decode workers all on GPU 0
name
=
"multimodal_disagg_qwen"
,
directory
=
sglang_dir
,
script_name
=
"multimodal_disagg.sh"
,
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
timeout
(
360
),
],
model
=
"Qwen/Qwen3-VL-2B-Instruct"
,
script_args
=
[
"--model"
,
"Qwen/Qwen3-VL-2B-Instruct"
,
"--single-gpu"
],
timeout
=
360
,
env
=
{},
frontend_port
=
DefaultPort
.
FRONTEND
.
value
,
request_payloads
=
[
chat_payload
(
[
{
"type"
:
"text"
,
"text"
:
"What is in this image?"
},
{
"type"
:
"image_url"
,
"image_url"
:
{
"url"
:
"http://images.cocodataset.org/test2017/000000155781.jpg"
},
},
],
repeat_count
=
1
,
expected_response
=
[
"image"
],
temperature
=
0.0
,
max_tokens
=
100
,
)
],
),
"multimodal_agg_qwen"
:
SGLangConfig
(
# Tests single-process aggregated multimodal inference using DecodeWorkerHandler
# with in-process vision encoding (no separate encode worker)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment