Unverified commit 73e0225e, authored by Travis Johnson, committed by GitHub
Browse files

[Bugfix] Check that number of images matches number of <|image|> tokens with mllama (#13911)


Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
parent 6c85da3a
...@@ -479,8 +479,9 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, ...@@ -479,8 +479,9 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
# Regression tests for https://github.com/vllm-project/vllm/issues/10648 # Regression tests for https://github.com/vllm-project/vllm/issues/10648
# Number of image tags is greater than the number of images provided # Number of groups of image tokens is greater than the number of images
prompt = "<|begin_of_text|><|image|><|image|> Compare the two images" # noqa: E501 # provided (the whitespace between the tags is necessary)
prompt = "<|begin_of_text|><|image|> <|image|> Compare the two images" # noqa: E501
image = stop_sign image = stop_sign
with pytest.raises(ValueError): with pytest.raises(ValueError):
vllm_model.generate_greedy_logprobs([prompt], vllm_model.generate_greedy_logprobs([prompt],
......
...@@ -54,7 +54,8 @@ from vllm.model_executor.model_loader.weight_utils import ( ...@@ -54,7 +54,8 @@ from vllm.model_executor.model_loader.weight_utils import (
default_weight_loader, maybe_remap_kv_scale_name) default_weight_loader, maybe_remap_kv_scale_name)
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalKwargs from vllm.multimodal.inputs import (MultiModalEncDecInputs,
MultiModalFieldConfig, MultiModalKwargs)
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
MultiModalDataDict, MultiModalDataItems) MultiModalDataDict, MultiModalDataItems)
from vllm.multimodal.processing import (BaseProcessingInfo, from vllm.multimodal.processing import (BaseProcessingInfo,
...@@ -169,6 +170,27 @@ class MllamaDummyInputsBuilder(BaseDummyInputsBuilder[MllamaProcessingInfo]): ...@@ -169,6 +170,27 @@ class MllamaDummyInputsBuilder(BaseDummyInputsBuilder[MllamaProcessingInfo]):
class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo] class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
): ):
def apply(
    self,
    prompt: Union[str, list[int]],
    mm_data: MultiModalDataDict,
    hf_processor_mm_kwargs: Mapping[str, object],
) -> MultiModalEncDecInputs:
    """Run the parent processor, then validate image-token/image parity.

    The parent ``apply`` produces the processed inputs. Afterwards the
    occurrences of the HF config's ``image_token_index`` in
    ``prompt_token_ids`` are counted and compared against the number of
    images supplied under the ``"image"`` key of ``mm_data``.

    Args:
        prompt: Prompt text or token ids, forwarded unchanged to the
            parent implementation.
        mm_data: Multi-modal data; only the ``"image"`` entry is
            inspected here (a missing entry counts as zero images).
        hf_processor_mm_kwargs: Extra processor kwargs, forwarded
            unchanged to the parent implementation.

    Returns:
        The inputs returned by the parent ``apply``, unmodified.

    Raises:
        ValueError: If the image-token count differs from the number of
            images provided.
    """
    mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

    # Count how often the image placeholder id occurs in the processed
    # prompt token ids.
    decoder_token_ids = mm_inputs['prompt_token_ids']
    image_token_id = self.info.get_hf_config().image_token_index
    num_image_tokens = decoder_token_ids.count(image_token_id)

    # A bare Image counts as exactly one image; anything else is sized
    # with len().
    image_data = mm_data.get("image", [])
    if isinstance(image_data, Image):
        num_images = 1
    else:
        num_images = len(image_data)

    if num_image_tokens == num_images:
        return mm_inputs

    raise ValueError(
        f"The number of image tokens ({num_image_tokens}) must be"
        f" the same as the number of images ({num_images})")
def _call_hf_processor( def _call_hf_processor(
self, self,
prompt: str, prompt: str,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment