Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
77cecf4e
Unverified
Commit
77cecf4e
authored
Oct 24, 2025
by
Kris Hung
Committed by
GitHub
Oct 24, 2025
Browse files
fix: Fix sglang multimodal test (#3862)
Signed-off-by:
krishung5
<
krish@nvidia.com
>
parent
cbe0b177
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
105 additions
and
15 deletions
+105
-15
components/backends/sglang/launch/multimodal_agg.sh
components/backends/sglang/launch/multimodal_agg.sh
+13
-6
components/backends/sglang/launch/multimodal_disagg.sh
components/backends/sglang/launch/multimodal_disagg.sh
+14
-6
components/src/dynamo/sglang/multimodal_utils/multimodal_encode_utils.py
...dynamo/sglang/multimodal_utils/multimodal_encode_utils.py
+76
-2
components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
...lang/request_handlers/multimodal/encode_worker_handler.py
+2
-1
No files found.
components/backends/sglang/launch/multimodal_agg.sh
View file @
77cecf4e
...
@@ -23,6 +23,10 @@ while [[ $# -gt 0 ]]; do
...
@@ -23,6 +23,10 @@ while [[ $# -gt 0 ]]; do
MODEL_NAME
=
$2
MODEL_NAME
=
$2
shift
2
shift
2
;;
;;
--served-model-name
)
SERVED_MODEL_NAME
=
$2
shift
2
;;
--chat-template
)
--chat-template
)
PROVIDED_CHAT_TEMPLATE
=
$2
PROVIDED_CHAT_TEMPLATE
=
$2
shift
2
shift
2
...
@@ -31,6 +35,7 @@ while [[ $# -gt 0 ]]; do
...
@@ -31,6 +35,7 @@ while [[ $# -gt 0 ]]; do
echo
"Usage:
$0
[OPTIONS]"
echo
"Usage:
$0
[OPTIONS]"
echo
"Options:"
echo
"Options:"
echo
" --model <model_name> Specify the model to use (default:
$MODEL_NAME
)"
echo
" --model <model_name> Specify the model to use (default:
$MODEL_NAME
)"
echo
" --served-model-name <served_model_name> Specify the served model name to use (default: empty)"
echo
" --chat-template <template> Specify the SGLang chat template to use (default:
$CHAT_TEMPLATE
)"
echo
" --chat-template <template> Specify the SGLang chat template to use (default:
$CHAT_TEMPLATE
)"
echo
" -h, --help Show this help message"
echo
" -h, --help Show this help message"
exit
0
exit
0
...
@@ -48,20 +53,21 @@ if [[ -n "$PROVIDED_CHAT_TEMPLATE" ]]; then
...
@@ -48,20 +53,21 @@ if [[ -n "$PROVIDED_CHAT_TEMPLATE" ]]; then
CHAT_TEMPLATE
=
"
$PROVIDED_CHAT_TEMPLATE
"
CHAT_TEMPLATE
=
"
$PROVIDED_CHAT_TEMPLATE
"
fi
fi
# Get the directory where this script is located
# Prepare served-model-name argument if provided
SCRIPT_DIR
=
"
$(
cd
"
$(
dirname
"
${
BASH_SOURCE
[0]
}
"
)
"
&&
pwd
)
"
SERVED_MODEL_ARG
=
""
SGLANG_BACKEND_DIR
=
"
$SCRIPT_DIR
/src"
if
[[
-n
"
$SERVED_MODEL_NAME
"
]]
;
then
SERVED_MODEL_ARG
=
"--served-model-name
$SERVED_MODEL_NAME
"
fi
# run ingress
# run ingress
python3
-m
dynamo.frontend
--http-port
=
8000 &
python3
-m
dynamo.frontend
--http-port
=
8000 &
DYNAMO_PID
=
$!
DYNAMO_PID
=
$!
# run SGLang multimodal processor
# run SGLang multimodal processor
python3
-m
dynamo.sglang
--multimodal-processor
--model-path
"
$MODEL_NAME
"
--chat-template
"
$CHAT_TEMPLATE
"
&
python3
-m
dynamo.sglang
--multimodal-processor
--model-path
"
$MODEL_NAME
"
$SERVED_MODEL_ARG
--chat-template
"
$CHAT_TEMPLATE
"
&
# run SGLang multimodal encode worker
# run SGLang multimodal encode worker
CUDA_VISIBLE_DEVICES
=
0 python3
-m
dynamo.sglang
--multimodal-encode-worker
--model-path
"
$MODEL_NAME
"
--chat-template
"
$CHAT_TEMPLATE
"
&
CUDA_VISIBLE_DEVICES
=
0 python3
-m
dynamo.sglang
--multimodal-encode-worker
--model-path
"
$MODEL_NAME
"
$SERVED_MODEL_ARG
--chat-template
"
$CHAT_TEMPLATE
"
&
# run SGLang multimodal inference worker
# run SGLang multimodal inference worker
# TODO: Remove disable-radix-cache once the issue is fixed.
# TODO: Remove disable-radix-cache once the issue is fixed.
...
@@ -69,6 +75,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --mod
...
@@ -69,6 +75,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --mod
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.sglang
\
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.sglang
\
--multimodal-worker
\
--multimodal-worker
\
--model-path
"
$MODEL_NAME
"
\
--model-path
"
$MODEL_NAME
"
\
$SERVED_MODEL_ARG
\
--page-size
16
\
--page-size
16
\
--tp
1
\
--tp
1
\
--trust-remote-code
\
--trust-remote-code
\
...
...
components/backends/sglang/launch/multimodal_disagg.sh
View file @
77cecf4e
...
@@ -23,6 +23,10 @@ while [[ $# -gt 0 ]]; do
...
@@ -23,6 +23,10 @@ while [[ $# -gt 0 ]]; do
MODEL_NAME
=
$2
MODEL_NAME
=
$2
shift
2
shift
2
;;
;;
--served-model-name
)
SERVED_MODEL_NAME
=
$2
shift
2
;;
--chat-template
)
--chat-template
)
PROVIDED_CHAT_TEMPLATE
=
$2
PROVIDED_CHAT_TEMPLATE
=
$2
shift
2
shift
2
...
@@ -31,6 +35,7 @@ while [[ $# -gt 0 ]]; do
...
@@ -31,6 +35,7 @@ while [[ $# -gt 0 ]]; do
echo
"Usage:
$0
[OPTIONS]"
echo
"Usage:
$0
[OPTIONS]"
echo
"Options:"
echo
"Options:"
echo
" --model <model_name> Specify the model to use (default:
$MODEL_NAME
)"
echo
" --model <model_name> Specify the model to use (default:
$MODEL_NAME
)"
echo
" --served-model-name <served_model_name> Specify the served model name to use (default: empty)"
echo
" --chat-template <template> Specify the SGLang chat template to use (default:
$CHAT_TEMPLATE
)"
echo
" --chat-template <template> Specify the SGLang chat template to use (default:
$CHAT_TEMPLATE
)"
echo
" -h, --help Show this help message"
echo
" -h, --help Show this help message"
exit
0
exit
0
...
@@ -48,20 +53,21 @@ if [[ -n "$PROVIDED_CHAT_TEMPLATE" ]]; then
...
@@ -48,20 +53,21 @@ if [[ -n "$PROVIDED_CHAT_TEMPLATE" ]]; then
CHAT_TEMPLATE
=
"
$PROVIDED_CHAT_TEMPLATE
"
CHAT_TEMPLATE
=
"
$PROVIDED_CHAT_TEMPLATE
"
fi
fi
# Get the directory where this script is located
# Prepare served-model-name argument if provided
SCRIPT_DIR
=
"
$(
cd
"
$(
dirname
"
${
BASH_SOURCE
[0]
}
"
)
"
&&
pwd
)
"
SERVED_MODEL_ARG
=
""
SGLANG_BACKEND_DIR
=
"
$SCRIPT_DIR
/src"
if
[[
-n
"
$SERVED_MODEL_NAME
"
]]
;
then
SERVED_MODEL_ARG
=
"--served-model-name
$SERVED_MODEL_NAME
"
fi
# run ingress
# run ingress
python3
-m
dynamo.frontend
--http-port
=
8000 &
python3
-m
dynamo.frontend
--http-port
=
8000 &
DYNAMO_PID
=
$!
DYNAMO_PID
=
$!
# run SGLang multimodal processor
# run SGLang multimodal processor
python3
-m
dynamo.sglang
--multimodal-processor
--model-path
"
$MODEL_NAME
"
--chat-template
"
$CHAT_TEMPLATE
"
&
python3
-m
dynamo.sglang
--multimodal-processor
--model-path
"
$MODEL_NAME
"
$SERVED_MODEL_ARG
--chat-template
"
$CHAT_TEMPLATE
"
&
# run SGLang multimodal encode worker
# run SGLang multimodal encode worker
CUDA_VISIBLE_DEVICES
=
0 python3
-m
dynamo.sglang
--multimodal-encode-worker
--model-path
"
$MODEL_NAME
"
--chat-template
"
$CHAT_TEMPLATE
"
&
CUDA_VISIBLE_DEVICES
=
0 python3
-m
dynamo.sglang
--multimodal-encode-worker
--model-path
"
$MODEL_NAME
"
$SERVED_MODEL_ARG
--chat-template
"
$CHAT_TEMPLATE
"
&
# run SGLang multimodal prefill worker
# run SGLang multimodal prefill worker
# TODO: Remove disable-radix-cache once the issue is fixed.
# TODO: Remove disable-radix-cache once the issue is fixed.
...
@@ -69,6 +75,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --mod
...
@@ -69,6 +75,7 @@ CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.sglang --multimodal-encode-worker --mod
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.sglang
\
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.sglang
\
--multimodal-worker
\
--multimodal-worker
\
--model-path
"
$MODEL_NAME
"
\
--model-path
"
$MODEL_NAME
"
\
$SERVED_MODEL_ARG
\
--page-size
16
\
--page-size
16
\
--tp
1
\
--tp
1
\
--trust-remote-code
\
--trust-remote-code
\
...
@@ -83,6 +90,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
...
@@ -83,6 +90,7 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
CUDA_VISIBLE_DEVICES
=
2 python3
-m
dynamo.sglang
\
CUDA_VISIBLE_DEVICES
=
2 python3
-m
dynamo.sglang
\
--multimodal-worker
\
--multimodal-worker
\
--model-path
"
$MODEL_NAME
"
\
--model-path
"
$MODEL_NAME
"
\
$SERVED_MODEL_ARG
\
--page-size
16
\
--page-size
16
\
--tp
1
\
--tp
1
\
--trust-remote-code
\
--trust-remote-code
\
...
...
components/src/dynamo/sglang/multimodal_utils/multimodal_encode_utils.py
View file @
77cecf4e
...
@@ -2,6 +2,7 @@
...
@@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
logging
import
logging
from
pathlib
import
Path
from
typing
import
Any
,
Dict
,
Optional
from
typing
import
Any
,
Dict
,
Optional
import
torch
import
torch
...
@@ -15,6 +16,75 @@ class SupportedModels:
...
@@ -15,6 +16,75 @@ class SupportedModels:
QWEN_2_5_VL_7B
=
"Qwen/Qwen2.5-VL-7B-Instruct"
QWEN_2_5_VL_7B
=
"Qwen/Qwen2.5-VL-7B-Instruct"
def normalize_model_name(model_name: str) -> str:
    """
    Extract and normalize a model name from various formats, including HuggingFace cache paths.

    Args:
        model_name: Model identifier which can be:
            - A simple model name: "Qwen/Qwen2.5-VL-7B-Instruct"
            - A HuggingFace cache path (absolute or relative):
              "/root/.cache/huggingface/hub/models--Qwen--Qwen2.5-VL-7B-Instruct/..."
            - A local path to a model directory

    Returns:
        The normalized name in "organization/model-name" form when it can be
        derived, the directory name for an existing local directory, or the
        input unchanged when no pattern matches.

    Examples:
        >>> normalize_model_name("Qwen/Qwen2.5-VL-7B-Instruct")
        'Qwen/Qwen2.5-VL-7B-Instruct'
        >>> normalize_model_name("/root/.cache/huggingface/hub/models--Qwen--Qwen2.5-VL-7B-Instruct/snapshots/abc")
        'Qwen/Qwen2.5-VL-7B-Instruct'
    """
    # Handle HuggingFace cache paths first so that relative cache paths
    # (which contain "/" but do not start with it) are normalized as well.
    _, marker, remainder = model_name.partition("models--")
    if marker:
        # Only the cache directory component encodes the repo id; cut at the
        # first "/" BEFORE splitting on "--" so that dashes in the snapshot
        # or any deeper sub-path cannot leak into the result.
        repo_dir = remainder.split("/", 1)[0]
        # The last "--" separates organization from model name.
        org, separator, model = repo_dir.rpartition("--")
        if separator and org and model:
            return f"{org}/{model}"

    # Already a simple model name (org/model format): return as-is.
    if "/" in model_name and not model_name.startswith("/"):
        return model_name

    # Handle local directory paths - use the last directory name.
    path = Path(model_name)
    if path.is_dir():
        return path.name

    # If no pattern matches, return the original name.
    return model_name
def is_model_supported(model_name: str, supported_model: str) -> bool:
    """
    Tell whether ``model_name`` refers to the same model as ``supported_model``.

    Both identifiers are passed through ``normalize_model_name`` and
    lower-cased, so the comparison is case-insensitive and is made on the
    normalized form of each name.

    Args:
        model_name: The model identifier to check (may be a path, cache name, etc.)
        supported_model: The supported model identifier to compare against

    Returns:
        True if the normalized, lower-cased names are equal, False otherwise
    """
    candidate, reference = (
        normalize_model_name(identifier).lower()
        for identifier in (model_name, supported_model)
    )
    return candidate == reference
def
get_qwen_image_features
(
def
get_qwen_image_features
(
vision_encoder
:
torch
.
nn
.
Module
,
image_embeds
:
Dict
[
str
,
Any
]
vision_encoder
:
torch
.
nn
.
Module
,
image_embeds
:
Dict
[
str
,
Any
]
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
...
@@ -71,11 +141,15 @@ def encode_image_embeddings(
...
@@ -71,11 +141,15 @@ def encode_image_embeddings(
"""
"""
with
torch
.
no_grad
():
with
torch
.
no_grad
():
# Route through the correct encoder based on model
# Route through the correct encoder based on model
if
model_name
==
SupportedModels
.
QWEN_2_5_VL_7B
:
if
is_model_supported
(
model_name
,
SupportedModels
.
QWEN_2_5_VL_7B
)
:
embeddings
=
get_qwen_image_features
(
vision_encoder
,
image_embeds
)
embeddings
=
get_qwen_image_features
(
vision_encoder
,
image_embeds
)
else
:
else
:
raise
NotImplementedError
(
f
"Model not supported:
{
model_name
}
"
)
# Provide more helpful error message with normalized model name
normalized_name
=
normalize_model_name
(
model_name
)
raise
NotImplementedError
(
f
"Model not supported:
{
normalized_name
}
(original:
{
model_name
}
)"
)
# Normalize output shape
# Normalize output shape
if
isinstance
(
embeddings
,
(
tuple
,
list
)):
if
isinstance
(
embeddings
,
(
tuple
,
list
)):
...
...
components/src/dynamo/sglang/request_handlers/multimodal/encode_worker_handler.py
View file @
77cecf4e
...
@@ -49,6 +49,7 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
...
@@ -49,6 +49,7 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
super
().
__init__
(
component
,
engine
=
None
,
config
=
config
)
super
().
__init__
(
component
,
engine
=
None
,
config
=
config
)
self
.
pd_worker_client
=
pd_worker_client
self
.
pd_worker_client
=
pd_worker_client
self
.
model
=
config
.
server_args
.
model_path
self
.
model
=
config
.
server_args
.
model_path
self
.
served_model_name
=
config
.
server_args
.
served_model_name
self
.
image_loader
=
ImageLoader
(
cache_size
=
CACHE_SIZE_MAXIMUM
)
self
.
image_loader
=
ImageLoader
(
cache_size
=
CACHE_SIZE_MAXIMUM
)
...
@@ -124,7 +125,7 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
...
@@ -124,7 +125,7 @@ class MultimodalEncodeWorkerHandler(BaseWorkerHandler):
image_embeds
=
self
.
image_processor
(
images
=
image
,
return_tensors
=
"pt"
)
image_embeds
=
self
.
image_processor
(
images
=
image
,
return_tensors
=
"pt"
)
precomputed_embeddings
=
encode_image_embeddings
(
precomputed_embeddings
=
encode_image_embeddings
(
model_name
=
self
.
model
,
model_name
=
self
.
served_model_name
,
image_embeds
=
image_embeds
,
image_embeds
=
image_embeds
,
vision_encoder
=
self
.
vision_model
,
vision_encoder
=
self
.
vision_model
,
projector
=
None
,
projector
=
None
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment