fix: Wrap default_multimodal_input_loader in asyncio.to_thread (#5945)

eff08aed · Indrajit Bhosale · GitHub · f3bea5d5 · eff08aed · eff08aed
Unverified commit eff08aed, authored Feb 04, 2026 by Indrajit Bhosale; committed by GitHub on Feb 04, 2026
Showing with 27 additions and 21 deletions

components/src/dynamo/trtllm/encode_helper.py +12 -9

components/src/dynamo/trtllm/multimodal_processor.py +15 -12

No files found.
--- a/components/src/dynamo/trtllm/encode_helper.py
+++ b/components/src/dynamo/trtllm/encode_helper.py
@@ -298,15 +298,18 @@ class EncodeHelper:
        # for tokenizer loading. `model_type` is needed to retrieve the correct
        # multimodal placeholders and apply model-specific preprocessing.

-        # Pass tokenizer to reuse the pre-initialized tokenizer instead of
-        # creating a new one per request
-        inputs = default_multimodal_input_loader(
-            tokenizer=tokenizer,
-            model_dir=model_dir,
-            model_type=model_type,
-            modality="image",
-            prompts=[text_prompt],
-            media=image_urls[0],
+        # NOTE: default_multimodal_input_loader downloads images and preprocesses them
+        # synchronously. Wrap in asyncio.to_thread to allow concurrent image loading
+        # across multiple requests, improving throughput at high concurrency.
+        inputs = await asyncio.to_thread(
+            lambda: default_multimodal_input_loader(
+                tokenizer=tokenizer,
+                model_dir=model_dir,
+                model_type=model_type,
+                modality="image",
+                prompts=[text_prompt],
+                media=image_urls[0],
+            )
        )

        # NOTE: MultimodalEncoder.generate() is synchronous. Run it off-thread to avoid

--- a/components/src/dynamo/trtllm/multimodal_processor.py
+++ b/components/src/dynamo/trtllm/multimodal_processor.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import asyncio
 import logging
 import time
 from io import BytesIO
@@ -211,18 +212,20 @@ class MultimodalRequestProcessor:
            ]
            logging.info(f"Using embedding paths: {embedding_paths}")

-        # Process with default_multimodal_input_loader
-        # Pass self.tokenizer to reuse the pre-initialized tokenizer instead of
-        # creating a new one per request
-        processed_inputs = default_multimodal_input_loader(
-            tokenizer=self.tokenizer,
-            model_dir=self.model_dir,
-            model_type=self.model_type,
-            modality=self.modality,
-            prompts=[text_prompt],
-            image_data_format="pt",
-            device="cuda",
-            **loader_kwargs,
+        # NOTE: default_multimodal_input_loader downloads images and preprocesses them
+        # synchronously. Wrap in asyncio.to_thread to allow concurrent image loading
+        # across multiple requests, improving throughput at high concurrency.
+        processed_inputs = await asyncio.to_thread(
+            lambda: default_multimodal_input_loader(
+                tokenizer=self.tokenizer,
+                model_dir=self.model_dir,
+                model_type=self.model_type,
+                modality=self.modality,
+                prompts=[text_prompt],
+                image_data_format="pt",
+                device="cuda",
+                **loader_kwargs,
+            )
        )

        # Return the first processed input if available