Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
be2f1dc1
Unverified
Commit
be2f1dc1
authored
Mar 13, 2026
by
KrishnanPrash
Committed by
GitHub
Mar 13, 2026
Browse files
ci: fit sglang multimodal EPD test on 1 GPU (#7046)
Signed-off-by:
Krishnan Prashanth
<
kprashanth@nvidia.com
>
parent
f744c7c3
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
65 additions
and
9 deletions
+65
-9
examples/backends/sglang/launch/multimodal_epd.sh
examples/backends/sglang/launch/multimodal_epd.sh
+53
-5
tests/serve/test_sglang.py
tests/serve/test_sglang.py
+12
-4
No files found.
examples/backends/sglang/launch/multimodal_epd.sh
View file @
be2f1dc1
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
#
#
# Multimodal E/PD: separate vision encoder (GPU 0) + combined PD worker (GPU 1).
# Multimodal E/PD: separate vision encoder (GPU 0) + combined PD worker (GPU 1).
# GPUs: 2
# GPUs: 2
(or 1 with --single-gpu)
set
-e
set
-e
trap
'echo Cleaning up...; kill 0'
EXIT
trap
'echo Cleaning up...; kill 0'
EXIT
...
@@ -16,6 +16,12 @@ MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
...
@@ -16,6 +16,12 @@ MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
CHAT_TEMPLATE
=
"qwen2-vl"
CHAT_TEMPLATE
=
"qwen2-vl"
PROVIDED_CHAT_TEMPLATE
=
""
PROVIDED_CHAT_TEMPLATE
=
""
# --single-gpu: Packs both workers (encode, PD) onto a single GPU.
# This is intended for functional testing with small models (e.g. 2B) where CI
# only has 1 GPU available. It uses lower mem-fraction-static values to share the GPU
# and enables memory-saving options.
SINGLE_GPU
=
false
# Parse command line arguments
# Parse command line arguments
while
[[
$#
-gt
0
]]
;
do
while
[[
$#
-gt
0
]]
;
do
case
$1
in
case
$1
in
...
@@ -31,12 +37,17 @@ while [[ $# -gt 0 ]]; do
...
@@ -31,12 +37,17 @@ while [[ $# -gt 0 ]]; do
PROVIDED_CHAT_TEMPLATE
=
$2
PROVIDED_CHAT_TEMPLATE
=
$2
shift
2
shift
2
;;
;;
--single-gpu
)
SINGLE_GPU
=
true
shift
;;
-h
|
--help
)
-h
|
--help
)
echo
"Usage:
$0
[OPTIONS]"
echo
"Usage:
$0
[OPTIONS]"
echo
"Options:"
echo
"Options:"
echo
" --model <model_name> Specify the model to use (default:
$MODEL_NAME
)"
echo
" --model <model_name> Specify the model to use (default:
$MODEL_NAME
)"
echo
" --served-model-name <served_model_name> Specify the served model name to use (default: empty)"
echo
" --served-model-name <served_model_name> Specify the served model name to use (default: empty)"
echo
" --chat-template <template> Specify the SGLang chat template to use (default:
$CHAT_TEMPLATE
)"
echo
" --chat-template <template> Specify the SGLang chat template to use (default:
$CHAT_TEMPLATE
)"
echo
" --single-gpu Pack both workers on 1 GPU (for small models, e.g. 2B)"
echo
" -h, --help Show this help message"
echo
" -h, --help Show this help message"
exit
0
exit
0
;;
;;
...
@@ -59,8 +70,33 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
...
@@ -59,8 +70,33 @@ if [[ -n "$SERVED_MODEL_NAME" ]]; then
SERVED_MODEL_ARG
=
"--served-model-name
$SERVED_MODEL_NAME
"
SERVED_MODEL_ARG
=
"--served-model-name
$SERVED_MODEL_NAME
"
fi
fi
# GPU assignments (override via environment variables)
DYN_ENCODE_WORKER_GPU
=
${
DYN_ENCODE_WORKER_GPU
:-
0
}
DYN_WORKER_GPU
=
${
DYN_WORKER_GPU
:-
1
}
# GPU memory fractions for workers (used with --mem-fraction-static)
DYN_ENCODE_GPU_MEM
=
${
DYN_ENCODE_GPU_MEM
:-
0
.9
}
DYN_WORKER_GPU_MEM
=
${
DYN_WORKER_GPU_MEM
:-
0
.9
}
ENCODE_EXTRA_ARGS
=
""
WORKER_EXTRA_ARGS
=
""
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
ENCODE_EXTRA_ARGS
=
"--mem-fraction-static
${
DYN_ENCODE_GPU_MEM
}
"
WORKER_EXTRA_ARGS
=
"--mem-fraction-static
${
DYN_WORKER_GPU_MEM
}
--delete-ckpt-after-loading --max-running-requests 2 --chunked-prefill-size 4096 --max-prefill-tokens 4096"
fi
# Prevent port collisions: the test framework exports DYN_SYSTEM_PORT which all
# child processes would inherit. Unset it so only workers that need it set their own.
unset
DYN_SYSTEM_PORT
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
HTTP_PORT
=
"
${
DYN_HTTP_PORT
:-
8000
}
"
print_launch_banner
--multimodal
"Launching Multimodal E/PD (2 GPUs)"
"
$MODEL_NAME
"
"
$HTTP_PORT
"
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
GPU_LABEL
=
"1 GPU"
else
GPU_LABEL
=
"2 GPUs"
fi
print_launch_banner
--multimodal
"Launching Multimodal E/PD (
$GPU_LABEL
)"
"
$MODEL_NAME
"
"
$HTTP_PORT
"
# run ingress
# run ingress
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
...
@@ -70,12 +106,23 @@ python3 -m dynamo.frontend &
...
@@ -70,12 +106,23 @@ python3 -m dynamo.frontend &
python3
-m
dynamo.sglang
--multimodal-processor
--model-path
"
$MODEL_NAME
"
$SERVED_MODEL_ARG
--chat-template
"
$CHAT_TEMPLATE
"
&
python3
-m
dynamo.sglang
--multimodal-processor
--model-path
"
$MODEL_NAME
"
$SERVED_MODEL_ARG
--chat-template
"
$CHAT_TEMPLATE
"
&
# run SGLang multimodal encode worker
# run SGLang multimodal encode worker
CUDA_VISIBLE_DEVICES
=
0 python3
-m
dynamo.sglang
--multimodal-encode-worker
--model-path
"
$MODEL_NAME
"
$SERVED_MODEL_ARG
--chat-template
"
$CHAT_TEMPLATE
"
&
echo
"Starting encode worker on GPU
$DYN_ENCODE_WORKER_GPU
(GPU mem:
$DYN_ENCODE_GPU_MEM
)..."
DYN_SYSTEM_PORT
=
${
DYN_SYSTEM_PORT1
:-
8081
}
\
CUDA_VISIBLE_DEVICES
=
$DYN_ENCODE_WORKER_GPU
python3
-m
dynamo.sglang
--multimodal-encode-worker
--model-path
"
$MODEL_NAME
"
$SERVED_MODEL_ARG
--chat-template
"
$CHAT_TEMPLATE
"
$ENCODE_EXTRA_ARGS
&
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
# Wait for encode worker to initialize before starting PD worker.
# This prevents both workers from competing for GPU memory simultaneously, which can cause OOM.
echo
"Waiting for encode worker to initialize..."
sleep
5
fi
# run SGLang multimodal inference worker
# run SGLang multimodal inference worker
# TODO: Remove disable-radix-cache once the issue is fixed.
# TODO: Remove disable-radix-cache once the issue is fixed.
# See https://github.com/sgl-project/sglang/pull/11203.
# See https://github.com/sgl-project/sglang/pull/11203.
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.sglang
\
echo
"Starting PD worker on GPU
$DYN_WORKER_GPU
(GPU mem:
$DYN_WORKER_GPU_MEM
)..."
DYN_SYSTEM_PORT
=
${
DYN_SYSTEM_PORT2
:-
8082
}
\
CUDA_VISIBLE_DEVICES
=
$DYN_WORKER_GPU
python3
-m
dynamo.sglang
\
--multimodal-worker
\
--multimodal-worker
\
--model-path
"
$MODEL_NAME
"
\
--model-path
"
$MODEL_NAME
"
\
$SERVED_MODEL_ARG
\
$SERVED_MODEL_ARG
\
...
@@ -84,7 +131,8 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
...
@@ -84,7 +131,8 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--trust-remote-code
\
--trust-remote-code
\
--skip-tokenizer-init
\
--skip-tokenizer-init
\
--disable-radix-cache
\
--disable-radix-cache
\
--disaggregation-transfer-backend
nixl &
--disaggregation-transfer-backend
nixl
\
$WORKER_EXTRA_ARGS
&
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
wait_any_exit
tests/serve/test_sglang.py
View file @
be2f1dc1
...
@@ -169,15 +169,22 @@ sglang_configs = {
...
@@ -169,15 +169,22 @@ sglang_configs = {
)
)
],
],
),
),
# NOTE: Pack all workers on 1 GPU for lower CI resource requirements
"multimodal_epd_qwen"
:
SGLangConfig
(
"multimodal_epd_qwen"
:
SGLangConfig
(
# E/PD architecture: Encode
worker (GPU 0) +
Prefill
/
Decode worker
(
GPU
1)
# E/P
/
D architecture: Encode
,
Prefill
,
Decode worker
s all on
GPU
0
name
=
"multimodal_epd_qwen"
,
name
=
"multimodal_epd_qwen"
,
directory
=
sglang_dir
,
directory
=
sglang_dir
,
script_name
=
"multimodal_epd.sh"
,
script_name
=
"multimodal_epd.sh"
,
marks
=
[
pytest
.
mark
.
gpu_
2
,
pytest
.
mark
.
nightly
],
marks
=
[
pytest
.
mark
.
gpu_
1
,
pytest
.
mark
.
pre_merge
],
model
=
"Qwen/Qwen
2.5
-VL-
7
B-Instruct"
,
model
=
"Qwen/Qwen
3
-VL-
2
B-Instruct"
,
delayed_start
=
0
,
script_args
=
[
"--model"
,
"Qwen/Qwen3-VL-2B-Instruct"
,
"--single-gpu"
]
,
timeout
=
360
,
timeout
=
360
,
env
=
{
"DYN_ENCODE_WORKER_GPU"
:
"0"
,
"DYN_WORKER_GPU"
:
"0"
,
"DYN_ENCODE_GPU_MEM"
:
"0.1"
,
"DYN_WORKER_GPU_MEM"
:
"0.4"
,
},
frontend_port
=
DefaultPort
.
FRONTEND
.
value
,
frontend_port
=
DefaultPort
.
FRONTEND
.
value
,
request_payloads
=
[
request_payloads
=
[
chat_payload
(
chat_payload
(
...
@@ -196,6 +203,7 @@ sglang_configs = {
...
@@ -196,6 +203,7 @@ sglang_configs = {
# approach to validation for this test to be stable.
# approach to validation for this test to be stable.
expected_response
=
[
"image"
],
expected_response
=
[
"image"
],
temperature
=
0.0
,
temperature
=
0.0
,
max_tokens
=
100
,
)
)
],
],
),
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment