Unverified Commit 0a995d54 authored by Congcong Chen's avatar Congcong Chen Committed by GitHub
Browse files

[Model] New model support for Phi-4-multimodal-instruct (#14119)

parent ade3f7d9
......@@ -410,7 +410,7 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎
- * `Phi3ForCausalLM`
* Phi-4, Phi-3
* `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
* `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
* ✅︎
* ✅︎
- * `Phi3SmallForCausalLM`
......@@ -856,6 +856,13 @@ See [this page](#generative-models) for more information on how to use generativ
*
* ✅︎
* ✅︎
- * `Phi4MMForCausalLM`
* Phi-4-multimodal
* T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup>
* `microsoft/Phi-4-multimodal-instruct`, etc.
* ✅︎
*
*
- * `PixtralForConditionalGeneration`
* Pixtral
* T + I<sup>+</sup>
......
......@@ -37,3 +37,4 @@ depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/other/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct
\ No newline at end of file
......@@ -272,6 +272,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501
"Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
trust_remote_code=True),
"Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
trust_remote_code=True),
"PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501
tokenizer_mode="mistral"),
"QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",
......
......@@ -2284,9 +2284,9 @@ class LoRAConfig:
return hash_str
def __post_init__(self):
# Setting the maximum rank to 256 should be able to satisfy the vast
# Setting the maximum rank to 512 should be able to satisfy the vast
# majority of applications.
possible_max_ranks = (8, 16, 32, 64, 128, 256)
possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
possible_lora_extra_vocab_size = (0, 256, 512)
if self.max_lora_rank not in possible_max_ranks:
raise ValueError(
......
......@@ -395,6 +395,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
if model_type == "phi3_v":
# Workaround since this token is not defined in the tokenizer
return f"<|image_{current_count}|>"
if model_type == "phi4mm":
return "<|endoftext10|>" # 200010 (see vocab.json in hf model)
if model_type in ("minicpmo", "minicpmv"):
return "(<image>./</image>)"
if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
......@@ -424,6 +426,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
elif modality == "audio":
if model_type == "ultravox":
return "<|audio|>"
if model_type == "phi4mm":
return "<|endoftext11|>" # 200011 (see vocab.json in hf model)
if model_type == "qwen2_audio":
return (f"Audio {current_count}: "
f"<|audio_bos|><|AUDIO|><|audio_eos|>")
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -182,6 +182,7 @@ _MULTIMODAL_MODELS = {
"Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501
"Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501
"UltravoxModel": ("ultravox", "UltravoxModel"),
"Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
# [Encoder-decoder]
"Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501
"MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment