Unverified commit eff08aed, authored by Indrajit Bhosale, committed by GitHub
Browse files

fix: Wrap default_multimodal_input_loader in asyncio.to_thread (#5945)

parent f3bea5d5
......@@ -298,15 +298,18 @@ class EncodeHelper:
# for tokenizer loading. `model_type` is needed to retrieve the correct
# multimodal placeholders and apply model-specific preprocessing.
# Pass tokenizer to reuse the pre-initialized tokenizer instead of
# creating a new one per request
inputs = default_multimodal_input_loader(
tokenizer=tokenizer,
model_dir=model_dir,
model_type=model_type,
modality="image",
prompts=[text_prompt],
media=image_urls[0],
# NOTE: default_multimodal_input_loader downloads images and preprocesses them
# synchronously. Wrap in asyncio.to_thread to allow concurrent image loading
# across multiple requests, improving throughput at high concurrency.
inputs = await asyncio.to_thread(
lambda: default_multimodal_input_loader(
tokenizer=tokenizer,
model_dir=model_dir,
model_type=model_type,
modality="image",
prompts=[text_prompt],
media=image_urls[0],
)
)
# NOTE: MultimodalEncoder.generate() is synchronous. Run it off-thread to avoid
......
......@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import logging
import time
from io import BytesIO
......@@ -211,18 +212,20 @@ class MultimodalRequestProcessor:
]
logging.info(f"Using embedding paths: {embedding_paths}")
# Process with default_multimodal_input_loader
# Pass self.tokenizer to reuse the pre-initialized tokenizer instead of
# creating a new one per request
processed_inputs = default_multimodal_input_loader(
tokenizer=self.tokenizer,
model_dir=self.model_dir,
model_type=self.model_type,
modality=self.modality,
prompts=[text_prompt],
image_data_format="pt",
device="cuda",
**loader_kwargs,
# NOTE: default_multimodal_input_loader downloads images and preprocesses them
# synchronously. Wrap in asyncio.to_thread to allow concurrent image loading
# across multiple requests, improving throughput at high concurrency.
processed_inputs = await asyncio.to_thread(
lambda: default_multimodal_input_loader(
tokenizer=self.tokenizer,
model_dir=self.model_dir,
model_type=self.model_type,
modality=self.modality,
prompts=[text_prompt],
image_data_format="pt",
device="cuda",
**loader_kwargs,
)
)
# Return the first processed input if available
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment