[VLM] merged multimodal processor and V1 support for idefics3 (#12660)

Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>

[VLM] merged multimodal processor and V1 support for idefics3 (#12660)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
815079de · Isotr0py · GitHub · 18a88fcc · 815079de · 815079de
Unverified Commit 815079de authored Feb 04, 2025 by Isotr0py Committed by GitHub Feb 04, 2025
7 changed files
--- a/docs/source/models/supported_models.md
+++ b/docs/source/models/supported_models.md
@@ -733,7 +733,7 @@ See [this page](#generative-models) for more information on how to use generativ
  * `HuggingFaceM4/Idefics3-8B-Llama3` etc.
  * ✅︎
  *
-  *
+  * ✅︎
 - * `InternVLChatModel`
  * InternVL 2.5, Mono-InternVL, InternVL 2.0
  * T + I<sup>E+</sup>

--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -254,14 +254,14 @@ VLM_TEST_SETTINGS = {
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
    ),
    "idefics3": VLMTestInfo(
-        models=["HuggingFaceM4/Idefics3-8B-Llama3"],
+        models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForVision2Seq,
-        marks=[large_gpu_mark(min_gb=48)],
+        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
    ),
    "intern_vl": VLMTestInfo(
        models=[

--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -192,6 +192,14 @@ def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
    return output_ids, output_str, out_logprobs
+def idefics3_trunc_hf_output(hf_output: RunnerOutput,
+                             model: str) -> RunnerOutput:
+    output_ids, output_str, out_logprobs = hf_output
+    if output_str.endswith("<end_of_utterance>"):
+        output_str = output_str.split("<end_of_utterance>")[0]
+    return output_ids, output_str, out_logprobs
 def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
                             model: str) -> RunnerOutput:
    output_ids, output_str, out_logprobs = hf_output

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -149,6 +149,7 @@ def _test_processing_correctness(
    "adept/fuyu-8b",
    "h2oai/h2ovl-mississippi-800m",
    "OpenGVLab/InternVL2-1B",
+    "HuggingFaceM4/Idefics3-8B-Llama3",
    "llava-hf/llava-1.5-7b-hf",
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "llava-hf/LLaVA-NeXT-Video-7B-hf",

--- a/tests/models/multimodal/processing/test_idefics3.py
+++ b/tests/models/multimodal/processing/test_idefics3.py
 # SPDX-License-Identifier: Apache-2.0
 """Tests for Idefics3's multimodal preprocessing kwargs."""
-from typing import Optional
 import pytest
-import torch
+from transformers import Idefics3Config
-from transformers import AutoImageProcessor, AutoTokenizer
-from vllm.inputs import InputContext, token_inputs
+from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal import MultiModalRegistry
+from vllm.multimodal.utils import cached_get_tokenizer
 from ....conftest import _ImageAssets
 from ...utils import build_model_context
@@ -15,163 +12,53 @@ from ...utils import build_model_context
 models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
-# Wrap lazy imports to avoid initializing CUDA during test collection
-@pytest.fixture()
-def input_processor_for_idefics3():
-    from vllm.model_executor.models.idefics3 import (
-        input_processor_for_idefics3)
-    return input_processor_for_idefics3
-@pytest.fixture()
-def dummy_data_for_idefics3():
-    from vllm.model_executor.models.idefics3 import dummy_data_for_idefics3
-    return dummy_data_for_idefics3
-@pytest.fixture()
-def get_max_idefics3_image_tokens():
-    from vllm.model_executor.models.idefics3 import (
-        get_max_idefics3_image_tokens)
-    return get_max_idefics3_image_tokens
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("longest_edge", [None, 168, 336, 400, 2 * 336])
-def test_input_mapper_override(model: str, image_assets: _ImageAssets,
-                               longest_edge: Optional[int]):
-    """Ensure that the [default] input mapper handles size properly."""
-    mm_processor_kwargs = {
-        "size": {
-            "longest_edge": longest_edge
-        }
-    } if longest_edge is not None else {}
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=mm_processor_kwargs,
-    )
-    hf_processor = AutoImageProcessor.from_pretrained(model,
-                                                      trust_remote_code=True,
-                                                      **mm_processor_kwargs)
-    mm_registry = MultiModalRegistry()
-    mm_registry.init_mm_limits_per_prompt(ctx.model_config)
-    image = image_assets[0].pil_image
-    hf_result = hf_processor.preprocess(
-        image,
-        return_tensors="pt",
-    )
-    vllm_result = mm_registry.map_input(
-        ctx.model_config,
-        {"image": image},
-    )
-    assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("longest_edge, expected_max_tokens", [
-    (None, 2873),
-    (168, 169),
-    (336, 169),
-    (400, 338),
-    (672, 338),
-])
-def test_max_tokens_override(get_max_idefics3_image_tokens, model: str,
-                             longest_edge: Optional[int],
-                             expected_max_tokens: int):
-    """Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs."""
-    size = {"longest_edge": longest_edge} if longest_edge is not None else None
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=None,
-    )
-    actual_max_tokens = get_max_idefics3_image_tokens(
-        ctx=InputContext(ctx.model_config),
-        size=size,
-    )
-    assert expected_max_tokens == actual_max_tokens
-@pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [
-    (168, 169, 1),
-    (168, 169, 2),
-    (400, 338, 1),
-    (400, 338, 2),
-])
-def test_dummy_data_override(dummy_data_for_idefics3, model: str,
-                             longest_edge: int, toks_per_img: int,
-                             num_imgs: int):
-    """Ensure dummy_data_for_idefics3 handles num_crops properly."""
-    # Same as the previous test - don't initialize mm_processor_kwargs
-    # in this test and assume that the kwargs will be correctly expanded by
-    # the partial when calling the dummy data func.
-    size = {"longest_edge": longest_edge} if longest_edge is not None else None
-    ctx = build_model_context(
-        model_name=model,
-        tokenizer_name=model,
-        trust_remote_code=True,
-        mm_processor_kwargs=None,
-    )
-    dummy_data = dummy_data_for_idefics3(
-        ctx=ctx,
-        seq_len=8192,  # Should be bigger than num_imgs * toks_per_img
-        mm_counts={"image": num_imgs},
-        size=size)
-    sequence_data = dummy_data.seq_data
-    # Ensure we have the right number of placeholders per size
-    image_token_id = ctx.get_hf_config().image_token_id
-    img_tok_count = sequence_data.get_token_ids().count(image_token_id)
-    assert img_tok_count == toks_per_img * num_imgs
 @pytest.mark.parametrize("model", models)
-@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [
+# yapf: disable
-    (336, 169 * (1**2 + 1), 1),
+@pytest.mark.parametrize(
-    (336, 169 * (1**2 + 1), 2),
+    ("mm_processor_kwargs", "expected_toks_per_img"),
-    (400, 169 * (2**2 + 1), 1),
+    [
-    (400, 169 * (2**2 + 1), 2),
+        ({"size": {"longest_edge": 364}}, 169),
-])
+        ({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
-def test_input_processor_override(input_processor_for_idefics3,
+    ])
-                                  image_assets: _ImageAssets, model: str,
+# yapf: enable
-                                  longest_edge: int,
+@pytest.mark.parametrize("num_imgs", [1, 2])
-                                  expected_toks_per_img: int, num_imgs: int):
+def test_processor_override(image_assets: _ImageAssets, model: str,
+                            mm_processor_kwargs: dict[str, object],
+                            expected_toks_per_img: int, num_imgs: int):
    """Ensure input_processor_for_idefics3 handles num_crops properly."""
    # Same as the previous test - don't initialize mm_processor_kwargs
    # in this test and assume that the kwargs will be correctly expanded by
    # the partial when calling the custom input processor.
-    size = {"longest_edge": longest_edge} if longest_edge is not None else None
    ctx = build_model_context(
        model_name=model,
        tokenizer_name=model,
        trust_remote_code=True,
        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
    )
+    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=tokenizer,
+    )
+    hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
    # Build the image str / prompt based on the number of images we pass
-    tokenizer = AutoTokenizer.from_pretrained(model)
    placeholders = "<image>" if num_imgs == 1 else "\n".join(
        f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
    prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:"  # noqa: E501
-    images = [image_assets[0].pil_image.resize((336 * 4, 336 * 4))] * num_imgs
-    inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
-                          prompt=prompt,
-                          multi_modal_data={"image": images})
-    processed_inputs = input_processor_for_idefics3(ctx, inputs, size=size)
+    # Build mm_data
+    image_size = ctx.get_hf_config(Idefics3Config).vision_config.image_size
+    dummy_image_size = (image_size * 4, image_size * 4)
+    dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
+    mm_data = {"image": [dummy_image] * num_imgs}
+    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
+    # Ensure the placeholder format is correct
+    hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
+    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
+        "input_ids"][0]
    # Ensure we have the right number of placeholders per num_crops size
    image_token_id = ctx.get_hf_config().image_token_id

--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -31,6 +31,17 @@ C = TypeVar("C", bound=PretrainedConfig, default=PretrainedConfig)
 P = TypeVar("P", bound=ProcessorMixin, default=ProcessorMixin)
+class HashableDict(dict):
+    """
+    A dictionary that can be hashed by lru_cache.
+    """
+    # NOTE: the built-in dict is not hashable, so we override
+    # __hash__ directly for simplicity
+    def __hash__(self) -> int:  # type: ignore[override]
+        return hash(frozenset(self.items()))
 @dataclass(frozen=True)
 class InputContext:
    """
@@ -104,6 +115,13 @@ class InputContext:
        if isinstance(typ, type):
            merged_kwargs["processor_cls"] = typ
+        # NOTE: A plain dict is not hashable and raises an "unhashable type"
+        # error when passed to `cached_get_processor`, so we wrap each dict
+        # value in a HashableDict.
+        for key, value in merged_kwargs.items():
+            if isinstance(value, dict):
+                merged_kwargs[key] = HashableDict(value)
        hf_processor = cached_get_processor(
            self.model_config.model,
            trust_remote_code=self.model_config.trust_remote_code,

--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py