Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
842f0f15
Unverified
Commit
842f0f15
authored
Jan 28, 2026
by
Indrajit Bhosale
Committed by
GitHub
Jan 29, 2026
Browse files
fix: VLLM Multimodal minor fixes (#5748)
Signed-off-by:
Indrajit Bhosale
<
iamindrajitb@gmail.com
>
parent
c5e30afb
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
17 additions
and
4 deletions
+17
-4
examples/backends/vllm/launch/agg_multimodal_epd.sh
examples/backends/vllm/launch/agg_multimodal_epd.sh
+12
-2
examples/multimodal/utils/args.py
examples/multimodal/utils/args.py
+5
-2
No files found.
examples/backends/vllm/launch/agg_multimodal_epd.sh
View file @
842f0f15
...
@@ -50,15 +50,25 @@ done
...
@@ -50,15 +50,25 @@ done
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python
-m
dynamo.frontend &
python
-m
dynamo.frontend &
# Set max model length based on model name
MAX_MODEL_LEN
=
""
if
[[
"
$MODEL_NAME
"
==
"Qwen/Qwen2.5-VL-7B-Instruct"
]]
;
then
MAX_MODEL_LEN
=
"4096"
elif
[[
"
$MODEL_NAME
"
==
"llava-hf/llava-1.5-7b-hf"
]]
;
then
MAX_MODEL_LEN
=
"2048"
else
MAX_MODEL_LEN
=
"30426"
fi
# Set GPU memory utilization and model length based on deployment mode
# Set GPU memory utilization and model length based on deployment mode
# Single-GPU mode: Both workers share GPU 0, so use reduced memory settings
# Single-GPU mode: Both workers share GPU 0, so use reduced memory settings
# Multi-GPU mode: Each worker gets its own GPU, so use higher memory settings
# Multi-GPU mode: Each worker gets its own GPU, so use higher memory settings
EXTRA_ARGS
=
""
EXTRA_ARGS
=
""
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
if
[[
"
$SINGLE_GPU
"
==
"true"
]]
;
then
EXTRA_ARGS
=
"--gpu-memory-utilization 0.4 --enforce-eager --max-model-len
30426
"
EXTRA_ARGS
=
"--gpu-memory-utilization 0.4 --enforce-eager --max-model-len
$MAX_MODEL_LEN
"
else
else
# Multi-GPU mode: standard memory settings
# Multi-GPU mode: standard memory settings
EXTRA_ARGS
=
"--gpu-memory-utilization 0.85 --max-model-len
30426
"
EXTRA_ARGS
=
"--gpu-memory-utilization 0.85 --max-model-len
$MAX_MODEL_LEN
"
fi
fi
# Start processor (Python-based preprocessing, handles prompt templating)
# Start processor (Python-based preprocessing, handles prompt templating)
...
...
examples/multimodal/utils/args.py
View file @
842f0f15
...
@@ -153,7 +153,8 @@ def overwrite_args(config):
...
@@ -153,7 +153,8 @@ def overwrite_args(config):
dp_rank
=
config
.
engine_args
.
data_parallel_rank
or
0
dp_rank
=
config
.
engine_args
.
data_parallel_rank
or
0
defaults
=
{
defaults
=
{
"task"
:
"generate"
,
# vLLM 0.13+ renamed 'task' to 'runner'
"runner"
:
"generate"
,
"skip_tokenizer_init"
:
False
,
"skip_tokenizer_init"
:
False
,
"enable_log_requests"
:
False
,
"enable_log_requests"
:
False
,
"enable_prefix_caching"
:
True
,
"enable_prefix_caching"
:
True
,
...
@@ -178,4 +179,6 @@ def overwrite_args(config):
...
@@ -178,4 +179,6 @@ def overwrite_args(config):
setattr
(
config
.
engine_args
,
key
,
value
)
setattr
(
config
.
engine_args
,
key
,
value
)
logger
.
debug
(
f
" engine_args.
{
key
}
=
{
value
}
"
)
logger
.
debug
(
f
" engine_args.
{
key
}
=
{
value
}
"
)
else
:
else
:
raise
ValueError
(
f
"
{
key
}
not found in AsyncEngineArgs from vLLM."
)
logger
.
debug
(
f
" Skipping engine_args.
{
key
}
(not available in this vLLM version)"
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment