Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
9c6f36e9
Unverified
Commit
9c6f36e9
authored
Mar 05, 2026
by
Qi Wang
Committed by
GitHub
Mar 05, 2026
Browse files
fix: revert accidental change of trtllm/multimodal_processor (#6921)
Signed-off-by:
Qi Wang
<
qiwa@nvidia.com
>
parent
57e6a79f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
52 additions
and
33 deletions
+52
-33
components/src/dynamo/trtllm/multimodal_processor.py
components/src/dynamo/trtllm/multimodal_processor.py
+51
-32
tests/router/test_router_e2e_with_trtllm.py
tests/router/test_router_e2e_with_trtllm.py
+1
-1
No files found.
components/src/dynamo/trtllm/multimodal_processor.py
View file @
9c6f36e9
...
...
@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
logging
import
time
from
io
import
BytesIO
...
...
@@ -22,6 +23,7 @@ from urllib.parse import urlparse
from
urllib.request
import
urlopen
import
torch
from
tensorrt_llm.inputs
import
default_multimodal_input_loader
from
tensorrt_llm.llmapi.tokenizer
import
tokenizer_factory
from
dynamo.common.multimodal.image_loader
import
ImageLoader
...
...
@@ -187,6 +189,11 @@ class MultimodalRequestProcessor:
"prompt": str,
"prompt_token_ids": List[int]
}
-----------------------------------------------------------------------------
TODO: Revert default_multimodal_input_loader calls having fixed TRT-LLM's
token IDs & MM data path in generate_async() for the embeddings case.
-----------------------------------------------------------------------------
"""
self
.
previous_decoded_text
=
""
...
...
@@ -214,18 +221,50 @@ class MultimodalRequestProcessor:
# mm_processor_kwargs must be a dict (not None) for TRT-LLM's processor
processed_inputs
=
{
"prompt_token_ids"
:
token_ids
,
"mm_processor_kwargs"
:
{}}
# EPD Flow Case 2: Embeddings received via NIXL from encode worker
# The encode worker computed vision embeddings and transferred them via RDMA/NIXL
# We need to pass these embeddings directly to TRT-LLM's generate_async
if
embeddings
is
not
None
:
logging
.
info
(
f
"Using NIXL embeddings from encoder: shape=
{
embeddings
.
shape
if
hasattr
(
embeddings
,
'shape'
)
else
'N/A'
}
"
# The aforementioned fallback to default_multimodal_input_loader:
messages
=
request
.
get
(
"extra_args"
,
{}).
get
(
"messages"
,
request
.
get
(
"messages"
,
[])
)
text_prompt
,
_
,
embedding_paths
=
self
.
extract_prompt_and_media
(
messages
)
loader_kwargs
=
{}
# Two cases, both for the default_multimodal_input_loader fallback:
# 1) EPD Flow Case 2: Embeddings received via NIXL from encode worker
# The encode worker computed vision embeddings and transferred them via RDMA/NIXL
# We need to pass these embeddings directly to TRT-LLM's generate_async
# 2) PD flow with no NIXL and no encoder
if
embeddings
is
not
None
or
embedding_paths
:
if
embeddings
is
not
None
:
logging
.
info
(
f
"Using NIXL embeddings from encoder: shape=
{
embeddings
.
shape
if
hasattr
(
embeddings
,
'shape'
)
else
'N/A'
}
"
)
loader_kwargs
[
"mm_embeddings"
]
=
[
embeddings
]
elif
embedding_paths
:
# PD flow with no NIXL and no encoder
loader_kwargs
[
"mm_embeddings"
]
=
[
self
.
load_tensor_from_path_or_url
(
path
)
for
path
in
embedding_paths
]
logging
.
info
(
f
"Using embedding paths:
{
embedding_paths
}
"
)
# NOTE: default_multimodal_input_loader downloads images and preprocesses them
# synchronously. Wrap in asyncio.to_thread to allow concurrent image loading
# across multiple requests, improving throughput at high concurrency.
fallback_processed_inputs
=
await
asyncio
.
to_thread
(
lambda
:
default_multimodal_input_loader
(
tokenizer
=
self
.
tokenizer
,
model_dir
=
self
.
model_dir
,
model_type
=
self
.
model_type
,
modality
=
self
.
modality
,
prompts
=
[
text_prompt
],
image_data_format
=
"pt"
,
device
=
"cuda"
,
**
loader_kwargs
,
)
)
# Structure embeddings in the format TRT-LLM's generate_async expects
processed_inputs
[
"multi_modal_embeddings"
]
=
embeddings
return
processed_inputs
# Return the first processed input if available
if
fallback_processed_inputs
:
return
fallback_processed_inputs
[
0
]
return
None
# PD Flow: Pre-tokenized by Rust frontend with direct media loading
# TODO: Add frontend decoding support
...
...
@@ -240,7 +279,6 @@ class MultimodalRequestProcessor:
if
image_items
and
isinstance
(
image_items
,
list
):
# Separate embedding paths from regular image URLs
# Items come from Rust in format: {"Url": "..."} or {"Decoded": ...}
embedding_paths
=
[]
image_urls
=
[]
for
item
in
image_items
:
...
...
@@ -261,9 +299,7 @@ class MultimodalRequestProcessor:
continue
# Check if this is an embedding file based on extension
if
url
.
endswith
((
".pt"
,
".pth"
,
".bin"
)):
embedding_paths
.
append
(
url
)
else
:
if
not
url
.
endswith
((
".pt"
,
".pth"
,
".bin"
)):
# Keep original item format for load_image_batch
image_urls
.
append
(
item
if
isinstance
(
item
,
dict
)
else
{
"Url"
:
item
}
...
...
@@ -285,23 +321,6 @@ class MultimodalRequestProcessor:
logging
.
error
(
f
"Failed to load images:
{
e
}
"
)
return
None
# Load embedding files (.pt, .pth, .bin) for PD flow
# These are pre-computed vision encoder outputs
if
embedding_paths
:
try
:
loaded_embeddings
=
[
self
.
load_tensor_from_path_or_url
(
path
)
for
path
in
embedding_paths
]
if
loaded_embeddings
:
processed_mm_data
[
"embedding"
]
=
loaded_embeddings
logging
.
info
(
f
"Loaded
{
len
(
loaded_embeddings
)
}
embedding file(s) from paths:
{
embedding_paths
}
"
)
except
Exception
as
e
:
logging
.
error
(
f
"Failed to load embeddings:
{
e
}
"
)
return
None
# TODO: Add support for video_url, audio_url
if
processed_mm_data
:
...
...
tests/router/test_router_e2e_with_trtllm.py
View file @
9c6f36e9
...
...
@@ -320,7 +320,7 @@ class TRTLLMProcess:
@
pytest
.
mark
.
pre_merge
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
parametrize
(
"request_plane"
,
[
"tcp"
],
indirect
=
True
)
@
pytest
.
mark
.
timeout
(
150
)
# ~3x average (~45s/test), rounded up
@
pytest
.
mark
.
timeout
(
300
)
def
test_trtllm_kv_router_basic
(
request
,
runtime_services_dynamic_ports
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment