"vscode:/vscode.git/clone" did not exist on "6eaccb7353cfe84d77981da726f6d82a8aefd2be"
Unverified commit a5bba7d2, authored by Jee Jee Li and committed by GitHub
Browse files

[Model] Add Idefics3 support (#9767)


Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: B-201 <Joy25810@foxmail.com>
Co-authored-by: B-201 <Joy25810@foxmail.com>
parent 2003cc35
...@@ -446,6 +446,12 @@ Text Generation ...@@ -446,6 +446,12 @@ Text Generation
- :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc.
- -
- ✅︎ - ✅︎
* - :code:`Idefics3ForConditionalGeneration`
- Idefics3
- T + I
- :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc.
-
-
* - :code:`InternVLChatModel` * - :code:`InternVLChatModel`
- InternVL2 - InternVL2
- T + I\ :sup:`E+` - T + I\ :sup:`E+`
......
...@@ -377,6 +377,22 @@ def run_glm4v(question: str, modality: str): ...@@ -377,6 +377,22 @@ def run_glm4v(question: str, modality: str):
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# Idefics3-8B-Llama3
def run_idefics3(question: str, modality: str):
    """Build the engine and prompt for the Idefics3-8B-Llama3 image example.

    Only the ``"image"`` modality is accepted (enforced with an assertion).

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple. ``stop_token_ids`` is
        ``None`` because no explicit stop tokens are configured here.
    """
    assert modality == "image"

    engine = LLM(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
    )

    # Single user turn: the image placeholder sits directly before the
    # question text, followed by the assistant cue.
    prompt = (
        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
    )
    return engine, prompt, None
model_example_map = { model_example_map = {
"llava": run_llava, "llava": run_llava,
"llava-next": run_llava_next, "llava-next": run_llava_next,
...@@ -397,6 +413,7 @@ model_example_map = { ...@@ -397,6 +413,7 @@ model_example_map = {
"mllama": run_mllama, "mllama": run_mllama,
"molmo": run_molmo, "molmo": run_molmo,
"glm4v": run_glm4v, "glm4v": run_glm4v,
"idefics3": run_idefics3,
} }
......
...@@ -290,6 +290,30 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: ...@@ -290,6 +290,30 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
) )
def load_idefics3(question, image_urls: List[str]) -> ModelRequestData:
    """Prepare a multi-image request for Idefics3-8B-Llama3.

    Args:
        question: Text appended after the image placeholders in the prompt.
        image_urls: URLs of the images; each one is fetched with
            ``fetch_image`` and gets its own ``Image-<n>: <image>`` line
            (numbered from 1) in the prompt.

    Returns:
        A ``ModelRequestData`` carrying the engine, the prompt, the fetched
        images, and no stop token ids or chat template.
    """
    num_images = len(image_urls)

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine = LLM(
        model="HuggingFaceM4/Idefics3-8B-Llama3",
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": num_images},
    )

    # One numbered placeholder line per image, joined by newlines.
    image_lines = [
        f"Image-{idx}: <image>\n" for idx in range(1, num_images + 1)
    ]
    placeholders = "\n".join(image_lines)
    prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        llm=engine,
        prompt=prompt,
        stop_token_ids=None,
        image_data=images,
        chat_template=None,
    )
model_example_map = { model_example_map = {
"phi3_v": load_phi3v, "phi3_v": load_phi3v,
"h2ovl_chat": load_h2onvl, "h2ovl_chat": load_h2onvl,
...@@ -298,6 +322,7 @@ model_example_map = { ...@@ -298,6 +322,7 @@ model_example_map = {
"qwen2_vl": load_qwen2_vl, "qwen2_vl": load_qwen2_vl,
"qwen_vl_chat": load_qwenvl_chat, "qwen_vl_chat": load_qwenvl_chat,
"mllama": load_mllama, "mllama": load_mllama,
"idefics3": load_idefics3,
} }
......
...@@ -327,6 +327,22 @@ VLM_TEST_SETTINGS = { ...@@ -327,6 +327,22 @@ VLM_TEST_SETTINGS = {
vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output, vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
prompt_path_encoder=model_utils.qwen_prompt_path_encoder, prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
), ),
"idefics3": VLMTestInfo(
models=["HuggingFaceM4/Idefics3-8B-Llama3"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>",
max_model_len=8192,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
marks=[
pytest.mark.skipif(
transformers.__version__ < "4.46.0",
reason="Model introduced in HF >= 4.46.0"
),
large_gpu_mark(min_gb=48),
],
),
### Tensor parallel / multi-gpu broadcast tests ### Tensor parallel / multi-gpu broadcast tests
"broadcast-chameleon": VLMTestInfo( "broadcast-chameleon": VLMTestInfo(
models=["facebook/chameleon-7b"], models=["facebook/chameleon-7b"],
......
...@@ -187,6 +187,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): ...@@ -187,6 +187,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return "<|vision_start|><|image_pad|><|vision_end|>" return "<|vision_start|><|image_pad|><|vision_end|>"
if model_type == "molmo": if model_type == "molmo":
return "" return ""
if model_type == "idefics3":
return "<image>"
raise TypeError(f"Unknown {modality} model type: {model_type}") raise TypeError(f"Unknown {modality} model type: {model_type}")
elif modality == "audio": elif modality == "audio":
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
# limitations under the License. # limitations under the License.
"""PyTorch Idefics2 model.""" """PyTorch Idefics2 model."""
from typing import Optional from typing import Iterable, Optional, Tuple
import torch import torch
from torch import nn from torch import nn
...@@ -29,6 +29,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, ...@@ -29,6 +29,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
QKVParallelLinear, QKVParallelLinear,
RowParallelLinear) RowParallelLinear)
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
class Idefics2VisionEmbeddings(nn.Module): class Idefics2VisionEmbeddings(nn.Module):
...@@ -329,3 +330,25 @@ class Idefics2VisionTransformer(nn.Module): ...@@ -329,3 +330,25 @@ class Idefics2VisionTransformer(nn.Module):
encoder_outputs = self.encoder(hidden_states) encoder_outputs = self.encoder(hidden_states)
last_hidden_state = self.post_layernorm(encoder_outputs) last_hidden_state = self.post_layernorm(encoder_outputs)
return last_hidden_state return last_hidden_state
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
    """Copy checkpoint tensors into this module's parameters.

    Weights whose name contains ``q_proj``, ``k_proj`` or ``v_proj`` are
    routed to the matching ``qkv_proj`` parameter, passing the shard id
    (``"q"``, ``"k"`` or ``"v"``) to that parameter's ``weight_loader``.
    Every other weight is loaded under its own name, using the parameter's
    ``weight_loader`` when it has one and ``default_weight_loader``
    otherwise.

    Args:
        weights: Iterable of ``(name, tensor)`` pairs.
    """
    fused_name = "qkv_proj"
    # (checkpoint shard name, shard id) — checked in this order.
    qkv_shards = (
        ("q_proj", "q"),
        ("k_proj", "k"),
        ("v_proj", "v"),
    )
    named_params = dict(self.named_parameters())

    for name, loaded_weight in weights:
        # First shard whose name occurs in the weight name, if any.
        hit = next(
            ((shard_name, shard_id)
             for shard_name, shard_id in qkv_shards
             if shard_name in name),
            None,
        )

        if hit is None:
            # Not a q/k/v shard: load under the checkpoint name as-is.
            target = named_params[name]
            loader = getattr(target, "weight_loader", default_weight_loader)
            loader(target, loaded_weight)
            continue

        shard_name, shard_id = hit
        target = named_params[name.replace(shard_name, fused_name)]
        loader = target.weight_loader
        loader(target, loaded_weight, shard_id)
This diff is collapsed.
...@@ -120,6 +120,7 @@ _MULTIMODAL_MODELS = { ...@@ -120,6 +120,7 @@ _MULTIMODAL_MODELS = {
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
"InternVLChatModel": ("internvl", "InternVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"),
"Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
"LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
"LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment