Unverified commit f01a5c71, authored by Daniel Socek, committed by GitHub
Browse files

fix: vision model loader fixes (#6952)


Signed-off-by: Daniel Socek <daniel.socek@intel.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
parent fca0a801
......@@ -65,7 +65,9 @@ class EncodeWorkerHandler:
self.image_processor = AutoImageProcessor.from_pretrained(
self.model, trust_remote_code=True
)
self.vision_model = load_vision_model(self.model)
self.vision_model = load_vision_model(
self.model, enforce_eager=self.engine_args.enforce_eager
)
hidden_size = getattr(self.vision_model, "out_hidden_size", None)
if hidden_size is None:
hidden_size = getattr(
......
......@@ -150,7 +150,7 @@ def is_qwen_vl_model(model_name: str) -> bool:
)
def load_vision_model(model_id: str) -> torch.nn.Module:
def load_vision_model(model_id: str, enforce_eager: bool = False) -> torch.nn.Module:
"""
Load a vision model from a HuggingFace model ID.
"""
......@@ -167,10 +167,10 @@ def load_vision_model(model_id: str) -> torch.nn.Module:
# Load only the vision model via vLLM
vllm_model = LLM(
model=model_id,
enforce_eager=False,
enforce_eager=enforce_eager,
kv_cache_memory_bytes=1024
* 1024
* 8, # 8MB KV cache for vLLM to complete the init lifecycle, encoder-only doesn't require KV cache.
* 64, # 64MB KV cache for vLLM to complete the init lifecycle, encoder-only doesn't require KV cache.
max_model_len=1,
mm_encoder_only=True,
enable_prefix_caching=False,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment