Unverified Commit 13abd0ea authored by Harry Mellor's avatar Harry Mellor Committed by GitHub
Browse files

[Model] Officially support Emu3 with Transformers backend (#21319)


Signed-off-by: default avatarHarry Mellor <19981378+hmellor@users.noreply.github.com>
parent 61b8cea3
......@@ -623,6 +623,12 @@ Specified using `--task generate`.
| `TarsierForConditionalGeneration` | Tarsier | T + I<sup>E+</sup> | `omni-search/Tarsier-7b`, `omni-search/Tarsier-34b` | | ✅︎ | ✅︎ |
| `Tarsier2ForConditionalGeneration`<sup>^</sup> | Tarsier2 | T + I<sup>E+</sup> + V<sup>E+</sup> | `omni-research/Tarsier2-Recap-7b`, `omni-research/Tarsier2-7b-0115` | | ✅︎ | ✅︎ |
Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) |
|--------------|--------|--------|-------------------|-----------------------------|-----------------------------------------|---------------------|
| `Emu3ForConditionalGeneration` | Emu3 | T + I | `BAAI/Emu3-Chat-hf` | ✅︎ | ✅︎ | ✅︎ |
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`
......
......@@ -23,18 +23,14 @@ def create_repo_dummy_weights(repo: str) -> Iterable[tuple[str, torch.Tensor]]:
return ((name, torch.empty(0)) for name in weight_names)
def create_model_dummy_weights(
repo: str,
model_arch: str,
) -> Iterable[tuple[str, torch.Tensor]]:
def create_dummy_model(repo: str, model_arch: str) -> PreTrainedModel:
"""
Create weights from a dummy meta deserialized hf model with name conversion
"""
model_cls: PreTrainedModel = getattr(transformers, model_arch)
config = AutoConfig.from_pretrained(repo)
with torch.device("meta"):
model: PreTrainedModel = model_cls._from_config(config)
return model.named_parameters()
return model_cls._from_config(config)
def model_architectures_for_test() -> list[str]:
......@@ -70,14 +66,21 @@ def test_hf_model_weights_mapper(model_arch: str):
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
original_weights = create_repo_dummy_weights(model_id)
hf_converted_weights = create_model_dummy_weights(model_id, model_arch)
hf_dummy_model = create_dummy_model(model_id, model_arch)
hf_converted_weights = hf_dummy_model.named_parameters()
hf_converted_buffers = hf_dummy_model.named_buffers()
mapper: WeightsMapper = model_cls.hf_to_vllm_mapper
mapped_original_weights = mapper.apply(original_weights)
mapped_hf_converted_weights = mapper.apply(hf_converted_weights)
mapped_hf_converted_buffers = mapper.apply(hf_converted_buffers)
ref_weight_names = set(map(lambda x: x[0], mapped_original_weights))
weight_names = set(map(lambda x: x[0], mapped_hf_converted_weights))
buffer_names = set(map(lambda x: x[0], mapped_hf_converted_buffers))
# Some checkpoints may have buffers, we ignore them for this test
ref_weight_names -= buffer_names
weights_missing = ref_weight_names - weight_names
weights_unmapped = weight_names - ref_weight_names
......
......@@ -357,6 +357,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
max_transformers_version="4.48", # noqa: E501
transformers_version_reason="HF model is not compatible.", # noqa: E501
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501
"Emu3ForConditionalGeneration": _HfExamplesInfo("BAAI/Emu3-Chat-hf"),
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
"Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it"),
"GraniteSpeechForConditionalGeneration": _HfExamplesInfo("ibm-granite/granite-speech-3.3-2b"), # noqa: E501
......@@ -501,7 +502,7 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
speculative_model="XiaomiMiMo/MiMo-7B-RL")
}
_TRANSFORMERS_MODELS = {
_TRANSFORMERS_BACKEND_MODELS = {
"TransformersForCausalLM": _HfExamplesInfo("hmellor/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501
"TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),
}
......@@ -512,7 +513,7 @@ _EXAMPLE_MODELS = {
**_SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS,
**_MULTIMODAL_EXAMPLE_MODELS,
**_SPECULATIVE_DECODING_EXAMPLE_MODELS,
**_TRANSFORMERS_MODELS,
**_TRANSFORMERS_BACKEND_MODELS,
}
......
......@@ -26,7 +26,7 @@ from vllm.model_executor.models.adapters import (as_embedding_model,
as_seq_cls_model)
from vllm.model_executor.models.interfaces import SupportsQuant
from vllm.model_executor.models.registry import (_PREVIOUSLY_SUPPORTED_MODELS,
_TRANSFORMERS_MODELS)
_TRANSFORMERS_BACKEND_MODELS)
from vllm.utils import is_pin_memory_available
logger = init_logger(__name__)
......@@ -178,7 +178,7 @@ def resolve_transformers_arch(model_config: ModelConfig,
"happen.")
for i, arch in enumerate(architectures):
if arch in _TRANSFORMERS_MODELS:
if arch in _TRANSFORMERS_BACKEND_MODELS:
continue
if model_config.model_impl == ModelImpl.AUTO:
......@@ -241,7 +241,7 @@ def get_model_architecture(
vllm_supported_archs = ModelRegistry.get_supported_archs()
is_supported = lambda arch: (arch in vllm_supported_archs and arch not in
_TRANSFORMERS_MODELS)
_TRANSFORMERS_BACKEND_MODELS)
vllm_not_supported = not any(is_supported(arch) for arch in architectures)
if vllm_not_supported:
......
......@@ -254,7 +254,11 @@ _SPECULATIVE_DECODING_MODELS = {
# "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
}
_TRANSFORMERS_MODELS = {
_TRANSFORMERS_SUPPORTED_MODELS = {
"Emu3ForConditionalGeneration": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
}
_TRANSFORMERS_BACKEND_MODELS = {
"TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501
"TransformersForCausalLM": ("transformers", "TransformersForCausalLM"),
}
......@@ -266,7 +270,8 @@ _VLLM_MODELS = {
**_CROSS_ENCODER_MODELS,
**_MULTIMODAL_MODELS,
**_SPECULATIVE_DECODING_MODELS,
**_TRANSFORMERS_MODELS,
**_TRANSFORMERS_SUPPORTED_MODELS,
**_TRANSFORMERS_BACKEND_MODELS,
}
# This variable is used as the args for subprocess.run(). We
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment