[Model] New model support for Phi-4-multimodal-instruct (#14119)

0a995d54 · Congcong Chen · GitHub · ade3f7d9 · 0a995d54 · 0a995d54
Unverified Commit 0a995d54 authored Mar 04, 2025 by Congcong Chen Committed by GitHub Mar 04, 2025
10 changed files
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -410,7 +410,7 @@ See [this page](#generative-models) for more information on how to use generativ
  * ✅︎
 - * `Phi3ForCausalLM`
  * Phi-4, Phi-3
-  * `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
+  * `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
  * ✅︎
  * ✅︎
 - * `Phi3SmallForCausalLM`
@@ -856,6 +856,13 @@ See [this page](#generative-models) for more information on how to use generativ
  *
  * ✅︎
  * ✅︎
+- * `Phi4MMForCausalLM`
+  * Phi-4-multimodal
+  * T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup>
+  * `microsoft/Phi-4-multimodal-instruct`, etc.
+  * ✅︎
+  *
+  *
 - * `PixtralForConditionalGeneration`
  * Pixtral
  * T + I<sup>+</sup>

--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -37,3 +37,4 @@ depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
 python-json-logger # Used by logging as per examples/other/logging_configuration.md
+scipy # Required for phi-4-multimodal-instruct
\ No newline at end of file
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -272,6 +272,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                         extras={"v2": "google/paligemma2-3b-ft-docci-448"}),  # noqa: E501
    "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
                                        trust_remote_code=True),
+    "Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
+                                        trust_remote_code=True),
    "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409",  # noqa: E501
                                                       tokenizer_mode="mistral"),
    "QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2284,9 +2284,9 @@ class LoRAConfig:
        return hash_str

    def __post_init__(self):
-        # Setting the maximum rank to 256 should be able to satisfy the vast
+        # Setting the maximum rank to 512 should be able to satisfy the vast
        # majority of applications.
-        possible_max_ranks = (8, 16, 32, 64, 128, 256)
+        possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
        possible_lora_extra_vocab_size = (0, 256, 512)
        if self.max_lora_rank not in possible_max_ranks:
            raise ValueError(

--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -395,6 +395,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
            if model_type == "phi3_v":
                # Workaround since this token is not defined in the tokenizer
                return f"<|image_{current_count}|>"
+            if model_type == "phi4mm":
+                return "<|endoftext10|>"  # 200010 (see vocab.json in hf model)
            if model_type in ("minicpmo", "minicpmv"):
                return "(<image>./</image>)"
            if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
@@ -424,6 +426,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
        elif modality == "audio":
            if model_type == "ultravox":
                return "<|audio|>"
+            if model_type == "phi4mm":
+                return "<|endoftext11|>"  # 200011 (see vocab.json in hf model)
            if model_type == "qwen2_audio":
                return (f"Audio {current_count}: "
                        f"<|audio_bos|><|AUDIO|><|audio_eos|>")

--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
--- a/vllm/model_executor/models/phi4mm_audio.py
+++ b/vllm/model_executor/models/phi4mm_audio.py
--- a/vllm/model_executor/models/phi4mm_utils.py
+++ b/vllm/model_executor/models/phi4mm_utils.py
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -182,6 +182,7 @@ _MULTIMODAL_MODELS = {
    "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"),  # noqa: E501
    "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"),  # noqa: E501
    "UltravoxModel": ("ultravox", "UltravoxModel"),
+    "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
    # [Encoder-decoder]
    "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"),  # noqa: E501
    "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"),  # noqa: E501

--- a/vllm/model_executor/models/vision_siglip_navit.py
+++ b/vllm/model_executor/models/vision_siglip_navit.py