[Hotfix][Core][VLM] Disable chunked prefill by default and prefix caching for multimodal models (#8425)

[Hotfix][Core][VLM] Disable chunked prefill by default and prefix caching for multimodal models (#8425)

[Hotfix][Core][VLM] Disable chunked prefill by default and prefix caching for multimodal models (#8425)
[Hotfix][Core][VLM] Disable chunked prefill by default and prefix caching for multimodal models (#8425)
c1636945 · Roger Wang · GitHub · 01987725 · c1636945 · c1636945
Unverified commit c1636945, authored Sep 12, 2024 by Roger Wang; committed by GitHub on Sep 12, 2024.
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 3 deletions

vllm/engine/arg_utils.py vllm/engine/arg_utils.py +11 -1

vllm/model_executor/models/__init__.py vllm/model_executor/models/__init__.py +2 -2

No files found.
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -843,6 +843,13 @@ class EngineArgs:
        device_config = DeviceConfig(device=self.device)
        model_config = self.create_model_config()
+        if model_config.is_multimodal_model:
+            if self.enable_prefix_caching:
+                logger.warning(
+                    "--enable-prefix-caching is currently not "
+                    "supported for multimodal models and has been disabled.")
+            self.enable_prefix_caching = False
        cache_config = CacheConfig(
            block_size=self.block_size if self.device != "neuron" else
            self.max_model_len,  # neuron needs block_size = max_model_len
@@ -874,7 +881,10 @@ class EngineArgs:
            # If not explicitly set, enable chunked prefill by default for
            # long context (> 32K) models. This is to avoid OOM errors in the
            # initial memory profiling phase.
-            if use_long_context:
+            # Chunked prefill is currently disabled for multimodal models by
+            # default.
+            if use_long_context and not model_config.is_multimodal_model:
                is_gpu = device_config.device_type == "cuda"
                use_sliding_window = (model_config.get_sliding_window()
                                      is not None)

--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -90,12 +90,12 @@ _MULTIMODAL_MODELS = {
    "PaliGemmaForConditionalGeneration": ("paligemma",
                                          "PaliGemmaForConditionalGeneration"),
    "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
-    "UltravoxModel": ("ultravox", "UltravoxModel"),
-    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
    "PixtralForConditionalGeneration": ("pixtral",
                                        "PixtralForConditionalGeneration"),
+    "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
    "Qwen2VLForConditionalGeneration": ("qwen2_vl",
                                        "Qwen2VLForConditionalGeneration"),
+    "UltravoxModel": ("ultravox", "UltravoxModel"),
 }
 _CONDITIONAL_GENERATION_MODELS = {
    "BartModel": ("bart", "BartForConditionalGeneration"),