Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
22fb3398
Unverified
Commit
22fb3398
authored
Mar 17, 2026
by
KrishnanPrash
Committed by
GitHub
Mar 17, 2026
Browse files
ci(trtllm): fix aggregated_multimodal_router test (#7460)
Signed-off-by:
Krishnan Prashanth
<
kprashanth@nvidia.com
>
parent
45be2fdc
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
76 additions
and
11 deletions
+76
-11
examples/backends/trtllm/launch/agg_multimodal_router.sh
examples/backends/trtllm/launch/agg_multimodal_router.sh
+54
-0
examples/backends/trtllm/mm_router_worker/mm_processor.py
examples/backends/trtllm/mm_router_worker/mm_processor.py
+10
-4
tests/mm_router/test_mm_router_e2e.py
tests/mm_router/test_mm_router_e2e.py
+3
-3
tests/serve/test_trtllm.py
tests/serve/test_trtllm.py
+9
-4
No files found.
examples/backends/trtllm/launch/agg_multimodal_router.sh
0 → 100755
View file @
22fb3398
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Launch script for Aggregated Multimodal with MM Router Worker
#
# Architecture:
#   Frontend --> MM Router Worker --> TRT-LLM Worker
#                (KV-aware routing)   (aggregated multimodal)
#
# The MM Router Worker sits between frontend and TRT-LLM, computing
# mm_hash for images and routing to the best worker based on KV cache overlap.
#
# Three processes are started in the background (TRT-LLM worker, MM router
# worker, frontend); the script then blocks in wait_any_exit.

# Abort on the first failing foreground command.
set -e
# On any exit, signal every process in this script's process group so the
# backgrounded workers do not outlive the launcher.
trap 'echo Cleaning up...; kill 0' EXIT

# Resolve the directory of this script (following symlinks) so the shared
# helpers can be sourced regardless of the caller's working directory.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# NOTE(review): print_launch_banner and wait_any_exit are not defined in this
# script; presumably they come from launch_utils.sh -- confirm.
source "$SCRIPT_DIR/../../../common/launch_utils.sh"

# Configuration: each value may be overridden from the environment; the
# defaults below apply only when the variable is unset or empty.
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-VL-2B-Instruct"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-VL-2B-Instruct"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/agg.yaml"}
export MODALITY=${MODALITY:-"multimodal"}
export MODEL_TYPE=${MODEL_TYPE:-"qwen3_vl"}
# The same block size is passed to both the TRT-LLM worker (--kv-block-size)
# and the MM router worker (--block-size).
export BLOCK_SIZE=${BLOCK_SIZE:-32}

# In this script HTTP_PORT is only passed to the launch banner; it is not
# forwarded to the frontend on its command line.
HTTP_PORT="${DYN_HTTP_PORT:-8000}"

print_launch_banner --multimodal "Launching Aggregated Multimodal + MM Router" "$MODEL_PATH" "$HTTP_PORT"

# TRT-LLM worker: "__internal" suffix hides it from frontend discovery.
python3 -m dynamo.trtllm \
    --model-path "$MODEL_PATH" \
    --served-model-name "${SERVED_MODEL_NAME}__internal" \
    --extra-engine-args "$AGG_ENGINE_ARGS" \
    --modality "$MODALITY" \
    --publish-events-and-metrics \
    --kv-block-size "$BLOCK_SIZE" &

# MM Router Worker: registers with the real model name; does KV-aware routing internally.
# Runs in a subshell so the cd into DYNAMO_HOME (needed to resolve the
# examples.* module path) does not affect the rest of this script.
(
    cd "$DYNAMO_HOME" && python3 -m examples.backends.trtllm.mm_router_worker \
        --model "$MODEL_PATH" \
        --model-type "$MODEL_TYPE" \
        --namespace dynamo \
        --component mm_router \
        --endpoint generate \
        --downstream-component tensorrt_llm \
        --downstream-endpoint generate \
        --block-size "$BLOCK_SIZE"
) &

# Frontend: round-robin to mm_router (KV routing happens inside mm_router, not here).
python3 -m dynamo.frontend --router-mode round-robin &

wait_any_exit
examples/backends/trtllm/mm_router_worker/mm_processor.py
View file @
22fb3398
...
@@ -231,11 +231,11 @@ def _compute_tokens_per_image(
...
@@ -231,11 +231,11 @@ def _compute_tokens_per_image(
processor_output
:
dict
,
processor
:
Any
,
model_type
:
str
processor_output
:
dict
,
processor
:
Any
,
model_type
:
str
)
->
list
[
int
]:
)
->
list
[
int
]:
"""Compute the number of visual tokens for each image from processor output."""
"""Compute the number of visual tokens for each image from processor output."""
if
model_type
==
"qwen2_vl"
:
if
model_type
in
(
"qwen2_vl"
,
"qwen3_vl"
)
:
grid_thw
=
processor_output
.
get
(
"image_grid_thw"
)
grid_thw
=
processor_output
.
get
(
"image_grid_thw"
)
if
grid_thw
is
None
:
if
grid_thw
is
None
:
raise
ValueError
(
raise
ValueError
(
"image_grid_thw not found in processor output for
Qwen2-VL
"
f
"image_grid_thw not found in processor output for
{
model_type
}
"
)
)
merge_size
=
getattr
(
processor
.
image_processor
,
"merge_size"
,
2
)
merge_size
=
getattr
(
processor
.
image_processor
,
"merge_size"
,
2
)
...
@@ -254,8 +254,14 @@ def _get_replacement_id(model_path: str) -> int:
...
@@ -254,8 +254,14 @@ def _get_replacement_id(model_path: str) -> int:
try
:
try
:
config
=
AutoConfig
.
from_pretrained
(
model_path
,
trust_remote_code
=
True
)
config
=
AutoConfig
.
from_pretrained
(
model_path
,
trust_remote_code
=
True
)
replacement_id
=
config
.
vocab_size
+
1
# Some models (e.g. Qwen3-VL) store vocab_size in text_config, not top-level.
logger
.
info
(
f
"Got vocab_size=
{
config
.
vocab_size
}
from AutoConfig"
)
vocab_size
=
getattr
(
config
,
"vocab_size"
,
None
)
if
vocab_size
is
None
and
hasattr
(
config
,
"text_config"
):
vocab_size
=
getattr
(
config
.
text_config
,
"vocab_size"
,
None
)
if
vocab_size
is
None
:
raise
AttributeError
(
"vocab_size not found in config or config.text_config"
)
replacement_id
=
vocab_size
+
1
logger
.
info
(
f
"Got vocab_size=
{
vocab_size
}
from AutoConfig"
)
return
replacement_id
return
replacement_id
except
Exception
as
e
:
except
Exception
as
e
:
raise
RuntimeError
(
raise
RuntimeError
(
...
...
tests/mm_router/test_mm_router_e2e.py
View file @
22fb3398
...
@@ -30,11 +30,11 @@ from tests.utils.managed_process import ManagedProcess
...
@@ -30,11 +30,11 @@ from tests.utils.managed_process import ManagedProcess
from
tests.utils.payloads
import
check_models_api
from
tests.utils.payloads
import
check_models_api
from
tests.utils.port_utils
import
allocate_ports
from
tests.utils.port_utils
import
allocate_ports
TRTLLM_MM_MODEL
=
"Qwen/Qwen
2
-VL-2B-Instruct"
TRTLLM_MM_MODEL
=
"Qwen/Qwen
3
-VL-2B-Instruct"
TRTLLM_MM_MODEL_TYPE
=
"qwen
2
_vl"
TRTLLM_MM_MODEL_TYPE
=
"qwen
3
_vl"
BLOCK_SIZE
=
32
BLOCK_SIZE
=
32
NAMESPACE
=
"test-mm"
NAMESPACE
=
"test-mm"
# Broad guardrails for TRT-LLM + Qwen
2
-VL-2B under block size 32.
# Broad guardrails for TRT-LLM + Qwen
3
-VL-2B under block size 32.
THREE_IMAGE_TOTAL_BLOCKS_RANGE
=
(
80
,
520
)
THREE_IMAGE_TOTAL_BLOCKS_RANGE
=
(
80
,
520
)
SINGLE_IMAGE_TOTAL_BLOCKS_RANGE
=
(
20
,
260
)
SINGLE_IMAGE_TOTAL_BLOCKS_RANGE
=
(
20
,
260
)
...
...
tests/serve/test_trtllm.py
View file @
22fb3398
...
@@ -190,18 +190,23 @@ trtllm_configs = {
...
@@ -190,18 +190,23 @@ trtllm_configs = {
"aggregated_multimodal_router"
:
TRTLLMConfig
(
"aggregated_multimodal_router"
:
TRTLLMConfig
(
name
=
"aggregated_multimodal_router"
,
name
=
"aggregated_multimodal_router"
,
directory
=
trtllm_dir
,
directory
=
trtllm_dir
,
script_name
=
"agg_multimodal.sh"
,
script_name
=
"agg_multimodal
_router
.sh"
,
marks
=
[
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
trtllm
,
pytest
.
mark
.
trtllm
,
pytest
.
mark
.
multimodal
,
pytest
.
mark
.
multimodal
,
pytest
.
mark
.
nightly
,
pytest
.
mark
.
pre_merge
,
],
],
model
=
"Qwen/Qwen
2
-VL-
7
B-Instruct"
,
model
=
"Qwen/Qwen
3
-VL-
2
B-Instruct"
,
frontend_port
=
DefaultPort
.
FRONTEND
.
value
,
frontend_port
=
DefaultPort
.
FRONTEND
.
value
,
timeout
=
900
,
timeout
=
900
,
delayed_start
=
60
,
delayed_start
=
60
,
request_payloads
=
[
multimodal_payload_default
()],
request_payloads
=
[
multimodal_payload_default
(
text
=
"Describe what you see in this image."
,
expected_response
=
[
"mountain"
,
"rock"
,
"trees"
,
"road"
],
)
],
),
),
# TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
# TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
# Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
# Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment