[Model] Refactor Qwen2-VL to use merged multimodal processor (#11258)

Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Model] Refactor Qwen2-VL to use merged multimodal processor (#11258)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
e24113a8 · Isotr0py · GitHub · 7379b3d4 · e24113a8 · e24113a8
Unverified Commit e24113a8 authored Dec 20, 2024 by Isotr0py Committed by GitHub Dec 19, 2024
5 changed files
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -447,7 +447,6 @@ def run_qwen_vl(question: str, modality: str):
 # Qwen2-VL
 def run_qwen2_vl(question: str, modality: str):
-    assert modality == "image"
    model_name = "Qwen/Qwen2-VL-7B-Instruct"
@@ -463,8 +462,13 @@ def run_qwen2_vl(question: str, modality: str):
        disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
    )
+    if modality == "image":
+        placeholder = "<|image_pad|>"
+    elif modality == "video":
+        placeholder = "<|video_pad|>"
    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-              "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
+              f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
              f"{question}<|im_end|>\n"
              "<|im_start|>assistant\n")
    stop_token_ids = None

--- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
+++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py
 from typing import Any, Dict, Tuple
 import pytest
-import torch
-from PIL.Image import Image
 from transformers import AutoTokenizer
-from vllm.inputs import InputContext, token_inputs
+from vllm.inputs import InputContext, InputProcessingContext
-from vllm.multimodal import MultiModalRegistry
 from .....conftest import _ImageAssets
 from ....utils import build_model_context
@@ -20,22 +17,9 @@ MAX_PIXELS = "max_pixels"
 # NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
 # input mappers.
 @pytest.fixture()
-def image_input_mapper_for_qwen2_vl():
+def processor_for_qwen2_vl():
-    from vllm.model_executor.models.qwen2_vl import (
+    from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor
-        image_input_mapper_for_qwen2_vl)
+    return Qwen2VLMultiModalProcessor
-    return image_input_mapper_for_qwen2_vl
-@pytest.fixture()
-def input_processor_for_qwen2_vl():
-    from vllm.model_executor.models.qwen2_vl import (
-        input_processor_for_qwen2_vl)
-    return input_processor_for_qwen2_vl
-@pytest.fixture()
-def qwen2_vl_context() -> InputContext:
-    return build_model_context(model_name=MODEL)
 @pytest.fixture()
@@ -45,12 +29,6 @@ def get_max_qwen2_vl_image_tokens():
    return get_max_qwen2_vl_image_tokens
-@pytest.fixture()
-def dummy_data_for_qwen2_vl():
-    from vllm.model_executor.models.qwen2_vl import dummy_data_for_qwen2_vl
-    return dummy_data_for_qwen2_vl
 @pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [
    ({}, 1225),
    ({
@@ -58,110 +36,70 @@ def dummy_data_for_qwen2_vl():
        MAX_PIXELS: 512**2
    }, 324),
 ])
-def test_qwen2_vl_max_image_tokens(get_max_qwen2_vl_image_tokens,
+@pytest.mark.parametrize("model", [MODEL])
-                                   qwen2_vl_context: InputContext,
+def test_qwen2_vl_max_image_tokens(
+    get_max_qwen2_vl_image_tokens,
+    model: str,
    mm_processor_kwargs: Dict[str, Any],
-                                   expected_max_tokens: int):
+    expected_max_tokens: int,
+):
    """Ensure that the max token calc handles min/max pixels properly."""
-    actual_max_tokens = get_max_qwen2_vl_image_tokens(qwen2_vl_context,
+    ctx = build_model_context(
-                                                      **mm_processor_kwargs)
+        model_name=model,
-    assert actual_max_tokens == expected_max_tokens
+        tokenizer_name=model,
+        mm_processor_kwargs=None,
-@pytest.mark.parametrize("mm_processor_kwargs,token_count,img_size", [
-    [{}, 1225, (980, 980)],
-    [{
-        MIN_PIXELS: 64**2,
-        MAX_PIXELS: 512**2
-    }, 324, (504, 504)],
-])
-def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl,
-                             qwen2_vl_context: InputContext,
-                             mm_processor_kwargs: Dict[str, Any],
-                             token_count: int, img_size: Tuple[int, int]):
-    """Ensure that the dummy data handles min/max pixels properly."""
-    seq_len = 3000
-    hf_config = qwen2_vl_context.get_hf_config()
-    image_token_id = hf_config.image_token_id
-    # NOTE: video value is required, but isn't actually used
-    # when making the dummy data except for error handling currently
-    dummy_data = dummy_data_for_qwen2_vl(
-        ctx=qwen2_vl_context,
-        seq_len=seq_len,
-        mm_counts={
-            "image": 1,
-            "video": 0
-        },
-        **mm_processor_kwargs,
    )
-    seq_data = dummy_data.seq_data
-    mm_data = dummy_data.multi_modal_data
-    # Ensure we have the right number of placeholders for min/max pixel values
+    actual_max_tokens = get_max_qwen2_vl_image_tokens(
-    assert seq_data.get_token_ids().count(image_token_id) == token_count
+        InputContext(ctx.model_config), **mm_processor_kwargs)
+    assert actual_max_tokens == expected_max_tokens
-    # Ensure the images were resized correctly
-    image = mm_data["image"]
-    assert isinstance(image, Image)
-    assert image.size == img_size
-@pytest.mark.parametrize("mm_processor_kwargs,num_placeholders", [
+@pytest.mark.parametrize(
-    ({}, 1426),
+    "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [
-    ({
+        ({}, 1426, (5704, 1176)),
-        MIN_PIXELS: 64**2,
-        MAX_PIXELS: 512**2
-    }, 330),
-])
-def test_input_processor(input_processor_for_qwen2_vl,
-                         qwen2_vl_context: InputContext,
-                         image_assets: _ImageAssets, num_placeholders: int,
-                         mm_processor_kwargs: Dict[str, Any]):
-    """Ensure that the image processor handles min/max pixels properly."""
-    tokenizer = AutoTokenizer.from_pretrained(MODEL)
-    prompt = "<|vision_start|><|image_pad|><|vision_end|>"
-    image = image_assets[0].pil_image
-    hf_config = qwen2_vl_context.get_hf_config()
-    image_token_id = hf_config.image_token_id
-    inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
-                          prompt=prompt,
-                          multi_modal_data={"image": [image]})
-    processed_inputs = input_processor_for_qwen2_vl(qwen2_vl_context, inputs,
-                                                    **mm_processor_kwargs)
-    assert processed_inputs["prompt_token_ids"].count(
-        image_token_id) == num_placeholders
-    assert len(processed_inputs["multi_modal_data"]["image"]) == 1
-@pytest.mark.parametrize("mm_processor_kwargs,pixels_shape", [
-    ({}, [5704, 1176]),
        ({
            MIN_PIXELS: 64**2,
            MAX_PIXELS: 512**2
-    }, [1320, 1176]),
+        }, 330, (1320, 1176)),
-])
+    ])
-def test_image_mapper_override(qwen2_vl_context: InputContext,
+@pytest.mark.parametrize("model", [MODEL])
+@pytest.mark.parametrize("num_imgs", [1, 2])
+def test_processor_override(
+    processor_for_qwen2_vl,
    image_assets: _ImageAssets,
+    model: str,
    mm_processor_kwargs: Dict[str, Any],
-                               pixels_shape: Tuple[int, int]):
+    expected_toks_per_img: int,
-    """Ensure that the image mapper handles min/max pixels properly."""
+    expected_pixels_shape: Tuple[int, int],
-    mm_registry = MultiModalRegistry()
+    num_imgs: int,
-    mm_registry.init_mm_limits_per_prompt(qwen2_vl_context.model_config)
+):
+    """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
-    image = image_assets[0].pil_image
+    # Same as the previous test - don't initialize mm_processor_kwargs
+    # in the model context; they are passed directly to the processor's
-    mapped_output = mm_registry.map_input(
+    # `apply` call below, which is the override being tested.
-        qwen2_vl_context.model_config,
+    ctx = build_model_context(
-        {"image": image},
+        model_name=model,
-        mm_processor_kwargs=mm_processor_kwargs,
+        tokenizer_name=model,
+        mm_processor_kwargs=None,
    )
+    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
-    # Dimension 0 of pixel values should match the product of image_grid_thw
+    ctx = InputProcessingContext(ctx.model_config, tokenizer)
-    actual_pixels_shape = mapped_output["pixel_values"].shape
+    # Build the image str / prompt based on the number of images we pass
-    assert list(actual_pixels_shape) == pixels_shape
+    prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
-    assert actual_pixels_shape[0] == torch.prod(
+    images = [image_assets[0].pil_image] * num_imgs
-        mapped_output["image_grid_thw"])
+    mm_data = {"image": images}
+    processor = processor_for_qwen2_vl(ctx)
+    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
+    # Ensure we have the right number of placeholders per min/max pixels
+    hf_processor = processor._get_hf_processor(**mm_processor_kwargs)
+    image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
+    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
+    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
+    assert img_tok_count == expected_toks_per_img * num_imgs
+    assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
+    assert pixel_shape[1] == expected_pixels_shape[1]
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -164,7 +164,9 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
        self,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
-        audio_len = get_max_qwen2_audio_audio_tokens(self.ctx)
+        feature_extractor = self._get_feature_extractor()
+        sampling_rate = feature_extractor.sampling_rate
+        audio_len = feature_extractor.chunk_length * sampling_rate
        audio_count = mm_counts["audio"]
        audio = np.zeros(audio_len)

--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -220,15 +220,18 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
        multi_data = MultiModalDataItems()
        for k, v in data.items():
+            # TODO: Make a separate modality for embedding inputs
+            # to avoid confusion
            # yapf: disable
            if k == "video":
                # Special case since even a single item can be a list
                multi_data[k] = (  # type: ignore[index]
-                    v if is_list_of(v, (list, torch.Tensor)) else [v]
+                    v if (isinstance(v, torch.Tensor)
+                          or is_list_of(v, list)) else [v]
                )
            elif k in ("image", "audio"):
                multi_data[k] = (  # type: ignore[index]
-                    v if isinstance(v, (list, torch.Tensor)) else [v]
+                    v if isinstance(v, (torch.Tensor, list)) else [v]
                )
            else:
                multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
@@ -252,6 +255,9 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
    def audios(self) -> Sequence[AudioItem]:
        return self.get("audio", [])
+    def get_item_counts(self) -> Mapping[str, int]:
+        return {m: len(items) for m, items in self.items()}
    def get_image_size(self, item_idx: int) -> ImageSize:
        image = self.images[item_idx]
@@ -612,6 +618,12 @@ class BaseMultiModalProcessor(ABC):
    def _get_tokenizer(self) -> AnyTokenizer:
        return self.ctx.tokenizer
+    def _get_mm_items(
+        self,
+        mm_data: MultiModalDataDict,
+    ) -> MultiModalDataItems:
+        return MultiModalDataItems.from_dict(mm_data)
    @abstractmethod
    def _get_prompt_replacements(
        self,
@@ -778,7 +790,7 @@ class BaseMultiModalProcessor(ABC):
        3. Extract information about the placeholder tokens from the
           processed token IDs.
        """
-        mm_items = MultiModalDataItems.from_dict(mm_data)
+        mm_items = self._get_mm_items(mm_data)
        hf_inputs = self._apply_hf_processor(prompt_text, mm_items,
                                             mm_processor_kwargs)
@@ -791,7 +803,7 @@ class BaseMultiModalProcessor(ABC):
        # If HF processor already inserts placeholder tokens,
        # there is no need for us to insert them
-        mm_item_counts = {m: len(items) for m, items in mm_items.items()}
+        mm_item_counts = mm_items.get_item_counts()
        all_placeholders = self._find_placeholders(all_prompt_repls,
                                                   prompt_ids, mm_item_counts)