Unverified commit eff08aed, authored by Indrajit Bhosale and committed by GitHub
Browse files

fix: Wrap default_multimodal_input_loader in asyncio.to_thread (#5945)

parent f3bea5d5
@@ -298,15 +298,18 @@ class EncodeHelper:
 # for tokenizer loading. `model_type` is needed to retrieve the correct
 # multimodal placeholders and apply model-specific preprocessing.
-# Pass tokenizer to reuse the pre-initialized tokenizer instead of
-# creating a new one per request
-inputs = default_multimodal_input_loader(
-    tokenizer=tokenizer,
-    model_dir=model_dir,
-    model_type=model_type,
-    modality="image",
-    prompts=[text_prompt],
-    media=image_urls[0],
+# NOTE: default_multimodal_input_loader downloads images and preprocesses them
+# synchronously. Wrap in asyncio.to_thread to allow concurrent image loading
+# across multiple requests, improving throughput at high concurrency.
+inputs = await asyncio.to_thread(
+    lambda: default_multimodal_input_loader(
+        tokenizer=tokenizer,
+        model_dir=model_dir,
+        model_type=model_type,
+        modality="image",
+        prompts=[text_prompt],
+        media=image_urls[0],
+    )
 )
 # NOTE: MultimodalEncoder.generate() is synchronous. Run it off-thread to avoid
......
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import asyncio
 import logging
 import time
 from io import BytesIO
@@ -211,18 +212,20 @@ class MultimodalRequestProcessor:
 ]
 logging.info(f"Using embedding paths: {embedding_paths}")
-# Process with default_multimodal_input_loader
-# Pass self.tokenizer to reuse the pre-initialized tokenizer instead of
-# creating a new one per request
-processed_inputs = default_multimodal_input_loader(
-    tokenizer=self.tokenizer,
-    model_dir=self.model_dir,
-    model_type=self.model_type,
-    modality=self.modality,
-    prompts=[text_prompt],
-    image_data_format="pt",
-    device="cuda",
-    **loader_kwargs,
+# NOTE: default_multimodal_input_loader downloads images and preprocesses them
+# synchronously. Wrap in asyncio.to_thread to allow concurrent image loading
+# across multiple requests, improving throughput at high concurrency.
+processed_inputs = await asyncio.to_thread(
+    lambda: default_multimodal_input_loader(
+        tokenizer=self.tokenizer,
+        model_dir=self.model_dir,
+        model_type=self.model_type,
+        modality=self.modality,
+        prompts=[text_prompt],
+        image_data_format="pt",
+        device="cuda",
+        **loader_kwargs,
+    )
 )
 # Return the first processed input if available
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment