fix: Wrap default_multimodal_input_loader in asyncio.to_thread (#5945)

eff08aed · Indrajit Bhosale · GitHub · f3bea5d5 · eff08aed · eff08aed
Unverified commit eff08aed, authored Feb 04, 2026 by Indrajit Bhosale and committed by GitHub on Feb 04, 2026.
Showing 2 changed files with 27 additions and 21 deletions

components/src/dynamo/trtllm/encode_helper.py +12 -9

components/src/dynamo/trtllm/multimodal_processor.py +15 -12

No files found.
--- a/components/src/dynamo/trtllm/encode_helper.py
+++ b/components/src/dynamo/trtllm/encode_helper.py
@@ -298,9 +298,11 @@ class EncodeHelper:
        # for tokenizer loading. `model_type` is needed to retrieve the correct
        # multimodal placeholders and apply model-specific preprocessing.
-        # Pass tokenizer to reuse the pre-initialized tokenizer instead of
+        # NOTE: default_multimodal_input_loader downloads images and preprocesses them
-        # creating a new one per request
+        # synchronously. Wrap in asyncio.to_thread to allow concurrent image loading
-        inputs = default_multimodal_input_loader(
+        # across multiple requests, improving throughput at high concurrency.
+        inputs = await asyncio.to_thread(
+            lambda: default_multimodal_input_loader(
                tokenizer=tokenizer,
                model_dir=model_dir,
                model_type=model_type,
@@ -308,6 +310,7 @@ class EncodeHelper:
                prompts=[text_prompt],
                media=image_urls[0],
            )
+        )
        # NOTE: MultimodalEncoder.generate() is synchronous. Run it off-thread to avoid
        # blocking the encode worker's event loop under concurrency.

--- a/components/src/dynamo/trtllm/multimodal_processor.py
+++ b/components/src/dynamo/trtllm/multimodal_processor.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import asyncio
 import logging
 import time
 from io import BytesIO
@@ -211,10 +212,11 @@ class MultimodalRequestProcessor:
            ]
            logging.info(f"Using embedding paths: {embedding_paths}")
-        # Process with default_multimodal_input_loader
+        # NOTE: default_multimodal_input_loader downloads images and preprocesses them
-        # Pass self.tokenizer to reuse the pre-initialized tokenizer instead of
+        # synchronously. Wrap in asyncio.to_thread to allow concurrent image loading
-        # creating a new one per request
+        # across multiple requests, improving throughput at high concurrency.
-        processed_inputs = default_multimodal_input_loader(
+        processed_inputs = await asyncio.to_thread(
+            lambda: default_multimodal_input_loader(
                tokenizer=self.tokenizer,
                model_dir=self.model_dir,
                model_type=self.model_type,
@@ -224,6 +226,7 @@ class MultimodalRequestProcessor:
                device="cuda",
                **loader_kwargs,
            )
+        )
        # Return the first processed input if available
        if processed_inputs: