"vscode:/vscode.git/clone" did not exist on "ef53395e2ccbe049c332207e31838e03566b2ae8"
Unverified commit e24113a8, authored by Isotr0py and committed by GitHub
Browse files

[Model] Refactor Qwen2-VL to use merged multimodal processor (#11258)


Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 7379b3d4
...@@ -447,7 +447,6 @@ def run_qwen_vl(question: str, modality: str): ...@@ -447,7 +447,6 @@ def run_qwen_vl(question: str, modality: str):
# Qwen2-VL # Qwen2-VL
def run_qwen2_vl(question: str, modality: str): def run_qwen2_vl(question: str, modality: str):
assert modality == "image"
model_name = "Qwen/Qwen2-VL-7B-Instruct" model_name = "Qwen/Qwen2-VL-7B-Instruct"
...@@ -463,8 +462,13 @@ def run_qwen2_vl(question: str, modality: str): ...@@ -463,8 +462,13 @@ def run_qwen2_vl(question: str, modality: str):
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
if modality == "image":
placeholder = "<|image_pad|>"
elif modality == "video":
placeholder = "<|video_pad|>"
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>" f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"{question}<|im_end|>\n" f"{question}<|im_end|>\n"
"<|im_start|>assistant\n") "<|im_start|>assistant\n")
stop_token_ids = None stop_token_ids = None
......
from typing import Any, Dict, Tuple from typing import Any, Dict, Tuple
import pytest import pytest
import torch
from PIL.Image import Image
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm.inputs import InputContext, token_inputs from vllm.inputs import InputContext, InputProcessingContext
from vllm.multimodal import MultiModalRegistry
from .....conftest import _ImageAssets from .....conftest import _ImageAssets
from ....utils import build_model_context from ....utils import build_model_context
...@@ -20,22 +17,9 @@ MAX_PIXELS = "max_pixels" ...@@ -20,22 +17,9 @@ MAX_PIXELS = "max_pixels"
# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple # NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
# input mappers. # input mappers.
@pytest.fixture() @pytest.fixture()
def image_input_mapper_for_qwen2_vl(): def processor_for_qwen2_vl():
from vllm.model_executor.models.qwen2_vl import ( from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor
image_input_mapper_for_qwen2_vl) return Qwen2VLMultiModalProcessor
return image_input_mapper_for_qwen2_vl
@pytest.fixture()
def input_processor_for_qwen2_vl():
from vllm.model_executor.models.qwen2_vl import (
input_processor_for_qwen2_vl)
return input_processor_for_qwen2_vl
@pytest.fixture()
def qwen2_vl_context() -> InputContext:
return build_model_context(model_name=MODEL)
@pytest.fixture() @pytest.fixture()
...@@ -45,12 +29,6 @@ def get_max_qwen2_vl_image_tokens(): ...@@ -45,12 +29,6 @@ def get_max_qwen2_vl_image_tokens():
return get_max_qwen2_vl_image_tokens return get_max_qwen2_vl_image_tokens
@pytest.fixture()
def dummy_data_for_qwen2_vl():
from vllm.model_executor.models.qwen2_vl import dummy_data_for_qwen2_vl
return dummy_data_for_qwen2_vl
@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [ @pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [
({}, 1225), ({}, 1225),
({ ({
...@@ -58,110 +36,70 @@ def dummy_data_for_qwen2_vl(): ...@@ -58,110 +36,70 @@ def dummy_data_for_qwen2_vl():
MAX_PIXELS: 512**2 MAX_PIXELS: 512**2
}, 324), }, 324),
]) ])
def test_qwen2_vl_max_image_tokens(get_max_qwen2_vl_image_tokens, @pytest.mark.parametrize("model", [MODEL])
qwen2_vl_context: InputContext, def test_qwen2_vl_max_image_tokens(
get_max_qwen2_vl_image_tokens,
model: str,
mm_processor_kwargs: Dict[str, Any], mm_processor_kwargs: Dict[str, Any],
expected_max_tokens: int): expected_max_tokens: int,
):
"""Ensure that the max token calc handles min/max pixels properly.""" """Ensure that the max token calc handles min/max pixels properly."""
actual_max_tokens = get_max_qwen2_vl_image_tokens(qwen2_vl_context, ctx = build_model_context(
**mm_processor_kwargs) model_name=model,
assert actual_max_tokens == expected_max_tokens tokenizer_name=model,
mm_processor_kwargs=None,
@pytest.mark.parametrize("mm_processor_kwargs,token_count,img_size", [
[{}, 1225, (980, 980)],
[{
MIN_PIXELS: 64**2,
MAX_PIXELS: 512**2
}, 324, (504, 504)],
])
def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl,
qwen2_vl_context: InputContext,
mm_processor_kwargs: Dict[str, Any],
token_count: int, img_size: Tuple[int, int]):
"""Ensure that the dummy data handles min/max pixels properly."""
seq_len = 3000
hf_config = qwen2_vl_context.get_hf_config()
image_token_id = hf_config.image_token_id
# NOTE: video value is required, but isn't actually used
# when making the dummy data except for error handling currently
dummy_data = dummy_data_for_qwen2_vl(
ctx=qwen2_vl_context,
seq_len=seq_len,
mm_counts={
"image": 1,
"video": 0
},
**mm_processor_kwargs,
) )
seq_data = dummy_data.seq_data
mm_data = dummy_data.multi_modal_data
# Ensure we have the right number of placeholders for min/max pixel values actual_max_tokens = get_max_qwen2_vl_image_tokens(
assert seq_data.get_token_ids().count(image_token_id) == token_count InputContext(ctx.model_config), **mm_processor_kwargs)
assert actual_max_tokens == expected_max_tokens
# Ensure the images were resized correctly
image = mm_data["image"]
assert isinstance(image, Image)
assert image.size == img_size
@pytest.mark.parametrize("mm_processor_kwargs,num_placeholders", [ @pytest.mark.parametrize(
({}, 1426), "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [
({ ({}, 1426, (5704, 1176)),
MIN_PIXELS: 64**2,
MAX_PIXELS: 512**2
}, 330),
])
def test_input_processor(input_processor_for_qwen2_vl,
qwen2_vl_context: InputContext,
image_assets: _ImageAssets, num_placeholders: int,
mm_processor_kwargs: Dict[str, Any]):
"""Ensure that the image processor handles min/max pixels properly."""
tokenizer = AutoTokenizer.from_pretrained(MODEL)
prompt = "<|vision_start|><|image_pad|><|vision_end|>"
image = image_assets[0].pil_image
hf_config = qwen2_vl_context.get_hf_config()
image_token_id = hf_config.image_token_id
inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
prompt=prompt,
multi_modal_data={"image": [image]})
processed_inputs = input_processor_for_qwen2_vl(qwen2_vl_context, inputs,
**mm_processor_kwargs)
assert processed_inputs["prompt_token_ids"].count(
image_token_id) == num_placeholders
assert len(processed_inputs["multi_modal_data"]["image"]) == 1
@pytest.mark.parametrize("mm_processor_kwargs,pixels_shape", [
({}, [5704, 1176]),
({ ({
MIN_PIXELS: 64**2, MIN_PIXELS: 64**2,
MAX_PIXELS: 512**2 MAX_PIXELS: 512**2
}, [1320, 1176]), }, 330, (1320, 1176)),
]) ])
def test_image_mapper_override(qwen2_vl_context: InputContext, @pytest.mark.parametrize("model", [MODEL])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_override(
processor_for_qwen2_vl,
image_assets: _ImageAssets, image_assets: _ImageAssets,
model: str,
mm_processor_kwargs: Dict[str, Any], mm_processor_kwargs: Dict[str, Any],
pixels_shape: Tuple[int, int]): expected_toks_per_img: int,
"""Ensure that the image mapper handles min/max pixels properly.""" expected_pixels_shape: Tuple[int, int],
mm_registry = MultiModalRegistry() num_imgs: int,
mm_registry.init_mm_limits_per_prompt(qwen2_vl_context.model_config) ):
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
image = image_assets[0].pil_image # Same as the previous test - don't initialize mm_processor_kwargs
# in this test and assume that the kwargs will be correctly expanded by
mapped_output = mm_registry.map_input( # the partial when calling the custom input processor.
qwen2_vl_context.model_config, ctx = build_model_context(
{"image": image}, model_name=model,
mm_processor_kwargs=mm_processor_kwargs, tokenizer_name=model,
mm_processor_kwargs=None,
) )
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
# Dimension 0 of pixel values should match the product of image_grid_thw ctx = InputProcessingContext(ctx.model_config, tokenizer)
actual_pixels_shape = mapped_output["pixel_values"].shape # Build the image str / prompt based on the number of images we pass
assert list(actual_pixels_shape) == pixels_shape prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
assert actual_pixels_shape[0] == torch.prod( images = [image_assets[0].pil_image] * num_imgs
mapped_output["image_grid_thw"])
mm_data = {"image": images}
processor = processor_for_qwen2_vl(ctx)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
hf_processor = processor._get_hf_processor(**mm_processor_kwargs)
image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
assert img_tok_count == expected_toks_per_img * num_imgs
assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
assert pixel_shape[1] == expected_pixels_shape[1]
...@@ -164,7 +164,9 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): ...@@ -164,7 +164,9 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor):
self, self,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
) -> ProcessorInputs: ) -> ProcessorInputs:
audio_len = get_max_qwen2_audio_audio_tokens(self.ctx) feature_extractor = self._get_feature_extractor()
sampling_rate = feature_extractor.sampling_rate
audio_len = feature_extractor.chunk_length * sampling_rate
audio_count = mm_counts["audio"] audio_count = mm_counts["audio"]
audio = np.zeros(audio_len) audio = np.zeros(audio_len)
......
This diff is collapsed.
...@@ -220,15 +220,18 @@ class MultiModalDataItems(UserDict[str, list[Any]]): ...@@ -220,15 +220,18 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
multi_data = MultiModalDataItems() multi_data = MultiModalDataItems()
for k, v in data.items(): for k, v in data.items():
# TODO: Make a separate modality for embedding inputs
# to avoid confusion
# yapf: disable # yapf: disable
if k == "video": if k == "video":
# Special case since even a single item can be a list # Special case since even a single item can be a list
multi_data[k] = ( # type: ignore[index] multi_data[k] = ( # type: ignore[index]
v if is_list_of(v, (list, torch.Tensor)) else [v] v if (isinstance(v, torch.Tensor)
or is_list_of(v, list)) else [v]
) )
elif k in ("image", "audio"): elif k in ("image", "audio"):
multi_data[k] = ( # type: ignore[index] multi_data[k] = ( # type: ignore[index]
v if isinstance(v, (list, torch.Tensor)) else [v] v if isinstance(v, (torch.Tensor, list)) else [v]
) )
else: else:
multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index]
...@@ -252,6 +255,9 @@ class MultiModalDataItems(UserDict[str, list[Any]]): ...@@ -252,6 +255,9 @@ class MultiModalDataItems(UserDict[str, list[Any]]):
def audios(self) -> Sequence[AudioItem]: def audios(self) -> Sequence[AudioItem]:
return self.get("audio", []) return self.get("audio", [])
def get_item_counts(self) -> Mapping[str, int]:
return {m: len(items) for m, items in self.items()}
def get_image_size(self, item_idx: int) -> ImageSize: def get_image_size(self, item_idx: int) -> ImageSize:
image = self.images[item_idx] image = self.images[item_idx]
...@@ -612,6 +618,12 @@ class BaseMultiModalProcessor(ABC): ...@@ -612,6 +618,12 @@ class BaseMultiModalProcessor(ABC):
def _get_tokenizer(self) -> AnyTokenizer: def _get_tokenizer(self) -> AnyTokenizer:
return self.ctx.tokenizer return self.ctx.tokenizer
def _get_mm_items(
self,
mm_data: MultiModalDataDict,
) -> MultiModalDataItems:
return MultiModalDataItems.from_dict(mm_data)
@abstractmethod @abstractmethod
def _get_prompt_replacements( def _get_prompt_replacements(
self, self,
...@@ -778,7 +790,7 @@ class BaseMultiModalProcessor(ABC): ...@@ -778,7 +790,7 @@ class BaseMultiModalProcessor(ABC):
3. Extract information about the placeholder tokens from the 3. Extract information about the placeholder tokens from the
processed token IDs. processed token IDs.
""" """
mm_items = MultiModalDataItems.from_dict(mm_data) mm_items = self._get_mm_items(mm_data)
hf_inputs = self._apply_hf_processor(prompt_text, mm_items, hf_inputs = self._apply_hf_processor(prompt_text, mm_items,
mm_processor_kwargs) mm_processor_kwargs)
...@@ -791,7 +803,7 @@ class BaseMultiModalProcessor(ABC): ...@@ -791,7 +803,7 @@ class BaseMultiModalProcessor(ABC):
# If HF processor already inserts placeholder tokens, # If HF processor already inserts placeholder tokens,
# there is no need for us to insert them # there is no need for us to insert them
mm_item_counts = {m: len(items) for m, items in mm_items.items()} mm_item_counts = mm_items.get_item_counts()
all_placeholders = self._find_placeholders(all_prompt_repls, all_placeholders = self._find_placeholders(all_prompt_repls,
prompt_ids, mm_item_counts) prompt_ids, mm_item_counts)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment