Unverified commit eff08aed, authored by Indrajit Bhosale, committed by GitHub
Browse files

fix: Wrap default_multimodal_input_loader in asyncio.to_thread (#5945)

parent f3bea5d5
...@@ -298,9 +298,11 @@ class EncodeHelper: ...@@ -298,9 +298,11 @@ class EncodeHelper:
# for tokenizer loading. `model_type` is needed to retrieve the correct # for tokenizer loading. `model_type` is needed to retrieve the correct
# multimodal placeholders and apply model-specific preprocessing. # multimodal placeholders and apply model-specific preprocessing.
# Pass tokenizer to reuse the pre-initialized tokenizer instead of # NOTE: default_multimodal_input_loader downloads images and preprocesses them
# creating a new one per request # synchronously. Wrap in asyncio.to_thread to allow concurrent image loading
inputs = default_multimodal_input_loader( # across multiple requests, improving throughput at high concurrency.
inputs = await asyncio.to_thread(
lambda: default_multimodal_input_loader(
tokenizer=tokenizer, tokenizer=tokenizer,
model_dir=model_dir, model_dir=model_dir,
model_type=model_type, model_type=model_type,
...@@ -308,6 +310,7 @@ class EncodeHelper: ...@@ -308,6 +310,7 @@ class EncodeHelper:
prompts=[text_prompt], prompts=[text_prompt],
media=image_urls[0], media=image_urls[0],
) )
)
# NOTE: MultimodalEncoder.generate() is synchronous. Run it off-thread to avoid # NOTE: MultimodalEncoder.generate() is synchronous. Run it off-thread to avoid
# blocking the encode worker's event loop under concurrency. # blocking the encode worker's event loop under concurrency.
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import asyncio
import logging import logging
import time import time
from io import BytesIO from io import BytesIO
...@@ -211,10 +212,11 @@ class MultimodalRequestProcessor: ...@@ -211,10 +212,11 @@ class MultimodalRequestProcessor:
] ]
logging.info(f"Using embedding paths: {embedding_paths}") logging.info(f"Using embedding paths: {embedding_paths}")
# Process with default_multimodal_input_loader # NOTE: default_multimodal_input_loader downloads images and preprocesses them
# Pass self.tokenizer to reuse the pre-initialized tokenizer instead of # synchronously. Wrap in asyncio.to_thread to allow concurrent image loading
# creating a new one per request # across multiple requests, improving throughput at high concurrency.
processed_inputs = default_multimodal_input_loader( processed_inputs = await asyncio.to_thread(
lambda: default_multimodal_input_loader(
tokenizer=self.tokenizer, tokenizer=self.tokenizer,
model_dir=self.model_dir, model_dir=self.model_dir,
model_type=self.model_type, model_type=self.model_type,
...@@ -224,6 +226,7 @@ class MultimodalRequestProcessor: ...@@ -224,6 +226,7 @@ class MultimodalRequestProcessor:
device="cuda", device="cuda",
**loader_kwargs, **loader_kwargs,
) )
)
# Return the first processed input if available # Return the first processed input if available
if processed_inputs: if processed_inputs:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment