fix: revert accidental change of trtllm/multimodal_processor (#6921)

Signed-off-by: Qi Wang <qiwa@nvidia.com>

fix: revert accidental change of trtllm/multimodal_processor (#6921)
Signed-off-by: Qi Wang <qiwa@nvidia.com>
9c6f36e9 · Qi Wang · GitHub · 57e6a79f · 9c6f36e9 · 9c6f36e9
Unverified Commit 9c6f36e9 authored Mar 05, 2026 by Qi Wang Committed by GitHub Mar 05, 2026
Showing with 52 additions and 33 deletions

components/src/dynamo/trtllm/multimodal_processor.py components/src/dynamo/trtllm/multimodal_processor.py +51 -32

tests/router/test_router_e2e_with_trtllm.py tests/router/test_router_e2e_with_trtllm.py +1 -1

No files found.
--- a/components/src/dynamo/trtllm/multimodal_processor.py
+++ b/components/src/dynamo/trtllm/multimodal_processor.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import asyncio
 import logging
 import time
 from io import BytesIO
@@ -22,6 +23,7 @@ from urllib.parse import urlparse
 from urllib.request import urlopen

 import torch
+from tensorrt_llm.inputs import default_multimodal_input_loader
 from tensorrt_llm.llmapi.tokenizer import tokenizer_factory

 from dynamo.common.multimodal.image_loader import ImageLoader
@@ -187,6 +189,11 @@ class MultimodalRequestProcessor:
            "prompt": str,
            "prompt_token_ids": List[int]
        }
+
+        -----------------------------------------------------------------------------
+        TODO: Revert default_multimodal_input_loader calls having fixed TRT-LLM's
+        token IDs & MM data path in generate_async() for the embeddings case.
+        -----------------------------------------------------------------------------
        """
        self.previous_decoded_text = ""

@@ -214,18 +221,50 @@ class MultimodalRequestProcessor:
        # mm_processor_kwargs must be a dict (not None) for TRT-LLM's processor
        processed_inputs = {"prompt_token_ids": token_ids, "mm_processor_kwargs": {}}

-        # EPD Flow Case 2: Embeddings received via NIXL from encode worker
-        # The encode worker computed vision embeddings and transferred them via RDMA/NIXL
-        # We need to pass these embeddings directly to TRT-LLM's generate_async
-        if embeddings is not None:
-            logging.info(
-                f"Using NIXL embeddings from encoder: shape={embeddings.shape if hasattr(embeddings, 'shape') else 'N/A'}"
+        # The aforementioned fallback to default_multimodal_input_loader:
+        messages = request.get("extra_args", {}).get(
+            "messages", request.get("messages", [])
+        )
+        text_prompt, _, embedding_paths = self.extract_prompt_and_media(messages)
+        loader_kwargs = {}
+
+        # Two cases, both for the default_multimodal_input_loader fallback:
+        # 1) EPD Flow Case 2: Embeddings received via NIXL from encode worker
+        #   The encode worker computed vision embeddings and transferred them via RDMA/NIXL
+        #   We need to pass these embeddings directly to TRT-LLM's generate_async
+        # 2) PD flow with no NIXL and no encoder
+        if embeddings is not None or embedding_paths:
+            if embeddings is not None:
+                logging.info(
+                    f"Using NIXL embeddings from encoder: shape={embeddings.shape if hasattr(embeddings, 'shape') else 'N/A'}"
+                )
+                loader_kwargs["mm_embeddings"] = [embeddings]
+            elif embedding_paths:
+                # PD flow with no NIXL and no encoder
+                loader_kwargs["mm_embeddings"] = [
+                    self.load_tensor_from_path_or_url(path) for path in embedding_paths
+                ]
+                logging.info(f"Using embedding paths: {embedding_paths}")
+
+            # NOTE: default_multimodal_input_loader downloads images and preprocesses them
+            # synchronously. Wrap in asyncio.to_thread to allow concurrent image loading
+            # across multiple requests, improving throughput at high concurrency.
+            fallback_processed_inputs = await asyncio.to_thread(
+                lambda: default_multimodal_input_loader(
+                    tokenizer=self.tokenizer,
+                    model_dir=self.model_dir,
+                    model_type=self.model_type,
+                    modality=self.modality,
+                    prompts=[text_prompt],
+                    image_data_format="pt",
+                    device="cuda",
+                    **loader_kwargs,
+                )
            )
-
-            # Structure embeddings in the format TRT-LLM's generate_async expects
-            processed_inputs["multi_modal_embeddings"] = embeddings
-
-            return processed_inputs
+            # Return the first processed input if available
+            if fallback_processed_inputs:
+                return fallback_processed_inputs[0]
+            return None

        # PD Flow: Pre-tokenized by Rust frontend with direct media loading
        # TODO: Add frontend decoding support
@@ -240,7 +279,6 @@ class MultimodalRequestProcessor:
            if image_items and isinstance(image_items, list):
                # Separate embedding paths from regular image URLs
                # Items come from Rust in format: {"Url": "..."} or {"Decoded": ...}
-                embedding_paths = []
                image_urls = []

                for item in image_items:
@@ -261,9 +299,7 @@ class MultimodalRequestProcessor:
                        continue

                    # Check if this is an embedding file based on extension
-                    if url.endswith((".pt", ".pth", ".bin")):
-                        embedding_paths.append(url)
-                    else:
+                    if not url.endswith((".pt", ".pth", ".bin")):
                        # Keep original item format for load_image_batch
                        image_urls.append(
                            item if isinstance(item, dict) else {"Url": item}
@@ -285,23 +321,6 @@ class MultimodalRequestProcessor:
                        logging.error(f"Failed to load images: {e}")
                        return None

-                # Load embedding files (.pt, .pth, .bin) for PD flow
-                # These are pre-computed vision encoder outputs
-                if embedding_paths:
-                    try:
-                        loaded_embeddings = [
-                            self.load_tensor_from_path_or_url(path)
-                            for path in embedding_paths
-                        ]
-                        if loaded_embeddings:
-                            processed_mm_data["embedding"] = loaded_embeddings
-                            logging.info(
-                                f"Loaded {len(loaded_embeddings)} embedding file(s) from paths: {embedding_paths}"
-                            )
-                    except Exception as e:
-                        logging.error(f"Failed to load embeddings: {e}")
-                        return None
-
            # TODO: Add support for video_url, audio_url

            if processed_mm_data:

--- a/tests/router/test_router_e2e_with_trtllm.py
+++ b/tests/router/test_router_e2e_with_trtllm.py
@@ -320,7 +320,7 @@ class TRTLLMProcess:
 @pytest.mark.pre_merge
 @pytest.mark.gpu_1
 @pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
-@pytest.mark.timeout(150)  # ~3x average (~45s/test), rounded up
+@pytest.mark.timeout(300)
 def test_trtllm_kv_router_basic(
    request,
    runtime_services_dynamic_ports,