Unverified commit 27e8d1ea, authored by Cyrus Leung and committed by GitHub
Browse files

[Refactor] Define MultiModalKwargsItems separate from MultiModalKwargs (#23053)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 5c79b0d6
...@@ -21,7 +21,7 @@ from vllm.model_executor.model_loader.utils import set_default_torch_dtype ...@@ -21,7 +21,7 @@ from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.model_executor.models.transformers import replace_linear_class from vllm.model_executor.models.transformers import replace_linear_class
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors) MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems) ImageSize, MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
...@@ -252,7 +252,7 @@ class DeepseekVL2MultiModalProcessor( ...@@ -252,7 +252,7 @@ class DeepseekVL2MultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
...@@ -291,7 +291,8 @@ class DeepseekVL2MultiModalProcessor( ...@@ -291,7 +291,8 @@ class DeepseekVL2MultiModalProcessor(
tokenization_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object],
*, *,
return_mm_hashes: bool, return_mm_hashes: bool,
) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
bool]:
# The processor logic is different for len(images) <= 2 vs > 2 # The processor logic is different for len(images) <= 2 vs > 2
# Since the processing cache assumes that the processor output is # Since the processing cache assumes that the processor output is
# invariant of how many images are passed per prompt, we only # invariant of how many images are passed per prompt, we only
......
...@@ -21,7 +21,7 @@ from vllm.model_executor.models.bart import (BartDecoder, BartEncoder, ...@@ -21,7 +21,7 @@ from vllm.model_executor.models.bart import (BartDecoder, BartEncoder,
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseProcessingInfo, from vllm.multimodal.processing import (BaseProcessingInfo,
EncDecMultiModalProcessor, EncDecMultiModalProcessor,
...@@ -860,7 +860,7 @@ class Florence2MultiModalProcessor( ...@@ -860,7 +860,7 @@ class Florence2MultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
pad_token_id = hf_config.pad_token_id pad_token_id = hf_config.pad_token_id
......
...@@ -32,7 +32,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM ...@@ -32,7 +32,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
MultiModalDataItems) MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
...@@ -226,7 +226,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]): ...@@ -226,7 +226,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
bos_token_id = hf_config.bos_token_id bos_token_id = hf_config.bos_token_id
......
...@@ -17,7 +17,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys ...@@ -17,7 +17,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, from vllm.multimodal.parse import (ImageProcessorItems, ImageSize,
MultiModalDataItems) MultiModalDataItems)
# yapf: disable # yapf: disable
...@@ -311,7 +311,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]): ...@@ -311,7 +311,7 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any], hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_token = hf_processor.boi_token image_token = hf_processor.boi_token
......
...@@ -24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys ...@@ -24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems, from vllm.multimodal.parse import (ImageProcessorItems, MultiModalDataItems,
MultiModalDataParser) MultiModalDataParser)
# yapf: disable # yapf: disable
...@@ -209,7 +209,7 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo] ...@@ -209,7 +209,7 @@ class Gemma3nMultiModalProcessor(BaseMultiModalProcessor[Gemma3nProcessingInfo]
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any], hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
......
...@@ -59,7 +59,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader ...@@ -59,7 +59,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, VideoItem) MultiModalKwargsItems, VideoItem)
from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
MultiModalDataParser) MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
...@@ -1158,7 +1158,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): ...@@ -1158,7 +1158,7 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any], hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_processor = self.info.get_image_processor( image_processor = self.info.get_image_processor(
...@@ -1175,14 +1175,16 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]): ...@@ -1175,14 +1175,16 @@ class Glm4vMultiModalProcessor(BaseMultiModalProcessor[Glm4vProcessingInfo]):
merge_length = image_processor.merge_size**2 merge_length = image_processor.merge_size**2
def get_image_replacement_glm4v(item_idx: int): def get_image_replacement_glm4v(item_idx: int):
grid_thw = out_mm_kwargs["image_grid_thw"][item_idx] out_item = out_mm_kwargs["image"][item_idx]
grid_thw = out_item["image_grid_thw"].data
assert isinstance(grid_thw, torch.Tensor) assert isinstance(grid_thw, torch.Tensor)
num_tokens = int(grid_thw.prod()) // merge_length num_tokens = int(grid_thw.prod()) // merge_length
return [hf_processor.image_token_id] * num_tokens return [hf_processor.image_token_id] * num_tokens
def get_video_replacement_glm4v(item_idx: int): def get_video_replacement_glm4v(item_idx: int):
grid_thw = out_mm_kwargs["video_grid_thw"][item_idx] out_item = out_mm_kwargs["video"][item_idx]
grid_thw = out_item["video_grid_thw"].data
assert isinstance(grid_thw, torch.Tensor) assert isinstance(grid_thw, torch.Tensor)
video, metadata = mm_items["video"][item_idx] video, metadata = mm_items["video"][item_idx]
......
...@@ -30,7 +30,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig ...@@ -30,7 +30,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement, BaseProcessingInfo, PromptReplacement,
...@@ -503,7 +503,7 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]): ...@@ -503,7 +503,7 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
......
...@@ -40,7 +40,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys ...@@ -40,7 +40,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems,
MultiModalDataParser) MultiModalDataParser)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
...@@ -118,7 +118,7 @@ class GraniteSpeechMultiModalProcessor( ...@@ -118,7 +118,7 @@ class GraniteSpeechMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> list[PromptUpdate]: ) -> list[PromptUpdate]:
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
tokenizer = self.info.get_tokenizer() tokenizer = self.info.get_tokenizer()
......
...@@ -17,7 +17,7 @@ from transformers import PretrainedConfig ...@@ -17,7 +17,7 @@ from transformers import PretrainedConfig
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalKwargs from vllm.multimodal.inputs import MultiModalKwargsItems
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
MultiModalDataItems) MultiModalDataItems)
from vllm.multimodal.processing import (MultiModalHashes, PromptReplacement, from vllm.multimodal.processing import (MultiModalHashes, PromptReplacement,
...@@ -425,18 +425,19 @@ class H2OVLMultiModalProcessor( ...@@ -425,18 +425,19 @@ class H2OVLMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
if "image_num_patches" in out_mm_kwargs: out_mm_data = out_mm_kwargs.get_data()
image_num_patches = out_mm_kwargs["image_num_patches"] if "image_num_patches" in out_mm_data:
image_num_patches = out_mm_data["image_num_patches"]
assert isinstance(image_num_patches, torch.Tensor) assert isinstance(image_num_patches, torch.Tensor)
image_num_patches = image_num_patches.tolist() image_num_patches = image_num_patches.tolist()
elif "image_embeds" in out_mm_kwargs: elif "image_embeds" in out_mm_data:
# TODO: Use image size information in dictionary embedding inputs # TODO: Use image size information in dictionary embedding inputs
# to compute num_patches (similar to Qwen2-VL) # to compute num_patches (similar to Qwen2-VL)
image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) image_num_patches = [None] * len(out_mm_data["image_embeds"])
else: else:
image_num_patches = [] image_num_patches = []
...@@ -479,7 +480,8 @@ class H2OVLMultiModalProcessor( ...@@ -479,7 +480,8 @@ class H2OVLMultiModalProcessor(
tokenization_kwargs: Mapping[str, object], tokenization_kwargs: Mapping[str, object],
*, *,
return_mm_hashes: bool, return_mm_hashes: bool,
) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]: ) -> tuple[list[int], MultiModalKwargsItems, Optional[MultiModalHashes],
bool]:
# The processor logic is different for len(images) <= 1 vs > 1 # The processor logic is different for len(images) <= 1 vs > 1
# Since the processing cache assumes that the processor output is # Since the processing cache assumes that the processor output is
# invariant of how many images are passed per prompt, we only # invariant of how many images are passed per prompt, we only
......
...@@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig ...@@ -34,7 +34,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import ImageSize, MultiModalDataItems from vllm.multimodal.parse import ImageSize, MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, ProcessingCache, BaseProcessingInfo, ProcessingCache,
...@@ -295,7 +295,7 @@ class HCXVisionMultiModalProcessor( ...@@ -295,7 +295,7 @@ class HCXVisionMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
placeholder = { placeholder = {
...@@ -306,21 +306,22 @@ class HCXVisionMultiModalProcessor( ...@@ -306,21 +306,22 @@ class HCXVisionMultiModalProcessor(
def get_replacement_hyperclovax( def get_replacement_hyperclovax(
item_idx: int, item_idx: int,
modality: str, modality: str,
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
): ):
num_tokens = None out_item = out_mm_kwargs[modality][item_idx]
if modality == "image": if modality == "image":
lens = out_item["vision_query_lengths_images"].data
num_tokens = self.info.get_num_image_tokens( num_tokens = self.info.get_num_image_tokens(
vision_query_length=out_mm_kwargs[ vision_query_length=lens)
"vision_query_lengths_images"][item_idx], ) elif modality == "video":
if modality == "video": lens = out_item["vision_query_lengths_videos"].data
num_tokens = self.info.get_num_video_tokens( num_tokens = self.info.get_num_video_tokens(
vision_query_length=out_mm_kwargs[ vision_query_length=lens)
"vision_query_lengths_videos"][item_idx], ) else:
assert isinstance(num_tokens, int) raise NotImplementedError(modality)
return [
placeholder[modality], return [placeholder[modality]] * num_tokens
] * num_tokens
return [ return [
PromptReplacement( PromptReplacement(
......
...@@ -34,7 +34,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys ...@@ -34,7 +34,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import ImageProcessorItems, ImageSize from vllm.multimodal.parse import ImageProcessorItems, ImageSize
# yapf conflicts with isort for this block # yapf conflicts with isort for this block
# yapf: disable # yapf: disable
...@@ -374,7 +374,7 @@ class Idefics3MultiModalProcessor( ...@@ -374,7 +374,7 @@ class Idefics3MultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_token, _, _ = self.info._get_image_token(hf_processor) image_token, _, _ = self.info._get_image_token(hf_processor)
......
...@@ -24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys ...@@ -24,7 +24,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors) MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems) ImageSize, MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
...@@ -399,7 +399,7 @@ class InternS1MultiModalProcessor( ...@@ -399,7 +399,7 @@ class InternS1MultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
img_context_token = hf_processor.image_token img_context_token = hf_processor.image_token
...@@ -407,15 +407,16 @@ class InternS1MultiModalProcessor( ...@@ -407,15 +407,16 @@ class InternS1MultiModalProcessor(
end_image_token = hf_processor.end_image_token end_image_token = hf_processor.end_image_token
video_token = hf_processor.video_token video_token = hf_processor.video_token
if "video_num_patches" in out_mm_kwargs: out_mm_data = out_mm_kwargs.get_data()
video_num_patches = out_mm_kwargs["video_num_patches"] if "video_num_patches" in out_mm_data:
video_num_patches = out_mm_data["video_num_patches"]
assert isinstance(video_num_patches, torch.Tensor) assert isinstance(video_num_patches, torch.Tensor)
video_num_patches = video_num_patches.tolist() video_num_patches = video_num_patches.tolist()
else: else:
video_num_patches = [] video_num_patches = []
if "image_num_patches" in out_mm_kwargs: if "image_num_patches" in out_mm_data:
image_num_patches = out_mm_kwargs["image_num_patches"] image_num_patches = out_mm_data["image_num_patches"]
assert isinstance(image_num_patches, torch.Tensor) assert isinstance(image_num_patches, torch.Tensor)
image_num_patches = image_num_patches.tolist() image_num_patches = image_num_patches.tolist()
else: else:
......
...@@ -28,7 +28,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata ...@@ -28,7 +28,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import convert_image_mode from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors) MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems) ImageSize, MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
...@@ -797,18 +797,19 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]): ...@@ -797,18 +797,19 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
if "image_num_patches" in out_mm_kwargs: out_mm_data = out_mm_kwargs.get_data()
image_num_patches = out_mm_kwargs["image_num_patches"] if "image_num_patches" in out_mm_data:
image_num_patches = out_mm_data["image_num_patches"]
assert isinstance(image_num_patches, torch.Tensor) assert isinstance(image_num_patches, torch.Tensor)
image_num_patches = image_num_patches.tolist() image_num_patches = image_num_patches.tolist()
elif "image_embeds" in out_mm_kwargs: elif "image_embeds" in out_mm_data:
# TODO: Use image size information in dictionary embedding inputs # TODO: Use image size information in dictionary embedding inputs
# to compute num_patches (similar to Qwen2-VL) # to compute num_patches (similar to Qwen2-VL)
image_num_patches = [None] * len(out_mm_kwargs["image_embeds"]) image_num_patches = [None] * len(out_mm_data["image_embeds"])
else: else:
image_num_patches = [] image_num_patches = []
...@@ -966,15 +967,19 @@ class InternVLMultiModalProcessor( ...@@ -966,15 +967,19 @@ class InternVLMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
prompt_repl: list[PromptUpdate] = super()._get_prompt_updates( prompt_repl = super()._get_prompt_updates(
mm_items, hf_processor_mm_kwargs, out_mm_kwargs) mm_items=mm_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
out_mm_kwargs=out_mm_kwargs,
)
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
if "video_num_patches" in out_mm_kwargs: out_mm_data = out_mm_kwargs.get_data()
video_num_patches = out_mm_kwargs["video_num_patches"] if "video_num_patches" in out_mm_data:
video_num_patches = out_mm_data["video_num_patches"]
assert isinstance(video_num_patches, torch.Tensor) assert isinstance(video_num_patches, torch.Tensor)
video_num_patches = video_num_patches.tolist() video_num_patches = video_num_patches.tolist()
else: else:
...@@ -992,12 +997,15 @@ class InternVLMultiModalProcessor( ...@@ -992,12 +997,15 @@ class InternVLMultiModalProcessor(
video_context_token=hf_processor.video_token) video_context_token=hf_processor.video_token)
if self.info.supports_video: if self.info.supports_video:
prompt_repl.append( prompt_repl = [
*prompt_repl,
PromptReplacement( PromptReplacement(
modality="video", modality="video",
target="<video>", target="<video>",
replacement=get_video_replacement_internvl, replacement=get_video_replacement_internvl,
)) )
]
return prompt_repl return prompt_repl
......
...@@ -33,7 +33,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys ...@@ -33,7 +33,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors
from vllm.multimodal.inputs import (ImageItem, ModalityData, from vllm.multimodal.inputs import (ImageItem, ModalityData,
MultiModalDataDict, MultiModalFieldConfig, MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, VideoItem) MultiModalKwargsItems, VideoItem)
from vllm.multimodal.parse import (DictEmbeddingItems, ImageSize, from vllm.multimodal.parse import (DictEmbeddingItems, ImageSize,
ModalityDataItems, MultiModalDataItems, ModalityDataItems, MultiModalDataItems,
MultiModalDataParser) MultiModalDataParser)
...@@ -1192,7 +1192,7 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]): ...@@ -1192,7 +1192,7 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any], hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
image_processor = self.info.get_image_processor( image_processor = self.info.get_image_processor(
...@@ -1208,7 +1208,8 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]): ...@@ -1208,7 +1208,8 @@ class KeyeMultiModalProcessor(BaseMultiModalProcessor[KeyeProcessingInfo]):
merge_length = image_processor.merge_size**2 merge_length = image_processor.merge_size**2
def get_replacement_keye(item_idx: int, modality: str): def get_replacement_keye(item_idx: int, modality: str):
grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx] out_item = out_mm_kwargs[modality][item_idx]
grid_thw = out_item[f"{modality}_grid_thw"].data
assert isinstance(grid_thw, torch.Tensor) assert isinstance(grid_thw, torch.Tensor)
num_tokens = int(grid_thw.prod()) // merge_length num_tokens = int(grid_thw.prod()) // merge_length
......
...@@ -69,7 +69,7 @@ from vllm.model_executor.models.utils import merge_multimodal_embeddings ...@@ -69,7 +69,7 @@ from vllm.model_executor.models.utils import merge_multimodal_embeddings
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs, NestedTensors) MultiModalKwargsItems, NestedTensors)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
MultiModalDataItems) MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
...@@ -239,7 +239,7 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]): ...@@ -239,7 +239,7 @@ class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, Any], hf_processor_mm_kwargs: Mapping[str, Any],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
image_token_id = self.info.image_token_id image_token_id = self.info.image_token_id
......
...@@ -23,7 +23,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig ...@@ -23,7 +23,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalInputs, MultiModalKwargs) MultiModalInputs, MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
ImageSize, MultiModalDataItems) ImageSize, MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
...@@ -250,7 +250,7 @@ class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]): ...@@ -250,7 +250,7 @@ class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor[_I]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index image_token_id = hf_config.image_token_index
...@@ -343,7 +343,7 @@ class PixtralHFMultiModalProcessor( ...@@ -343,7 +343,7 @@ class PixtralHFMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
......
...@@ -16,7 +16,7 @@ from vllm.model_executor.models.clip import CLIPVisionModel ...@@ -16,7 +16,7 @@ from vllm.model_executor.models.clip import CLIPVisionModel
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
VideoEmbeddingItems, VideoProcessorItems) VideoEmbeddingItems, VideoProcessorItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
...@@ -185,7 +185,7 @@ class LlavaNextVideoMultiModalProcessor( ...@@ -185,7 +185,7 @@ class LlavaNextVideoMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
hf_config = self.info.get_hf_config() hf_config = self.info.get_hf_config()
video_token_id = hf_config.video_token_index video_token_id = hf_config.video_token_index
......
...@@ -18,7 +18,7 @@ from vllm.model_executor.layers.activation import get_act_fn ...@@ -18,7 +18,7 @@ from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
MultiModalKwargs) MultiModalKwargsItems)
from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
VideoEmbeddingItems, VideoProcessorItems) VideoEmbeddingItems, VideoProcessorItems)
from vllm.multimodal.processing import PromptReplacement, PromptUpdate from vllm.multimodal.processing import PromptReplacement, PromptUpdate
...@@ -372,7 +372,7 @@ class LlavaOnevisionMultiModalProcessor( ...@@ -372,7 +372,7 @@ class LlavaOnevisionMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
image_repls = super()._get_prompt_updates( image_repls = super()._get_prompt_updates(
mm_items=mm_items, mm_items=mm_items,
......
...@@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig ...@@ -40,7 +40,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig
from vllm.model_executor.layers.quantization.gptq_marlin import ( from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQMarlinConfig) GPTQMarlinConfig)
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
NestedTensors) NestedTensors)
from vllm.multimodal.parse import (AudioItem, AudioProcessorItems, from vllm.multimodal.parse import (AudioItem, AudioProcessorItems,
...@@ -316,7 +316,7 @@ class MiniCPMOMultiModalProcessor( ...@@ -316,7 +316,7 @@ class MiniCPMOMultiModalProcessor(
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
base_updates = super()._get_prompt_updates( base_updates = super()._get_prompt_updates(
mm_items=mm_items, mm_items=mm_items,
......
...@@ -48,7 +48,7 @@ from vllm.model_executor.models.minicpm import MiniCPMForCausalLM ...@@ -48,7 +48,7 @@ from vllm.model_executor.models.minicpm import MiniCPMForCausalLM
from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargsItems
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
NestedTensors) NestedTensors)
from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem, from vllm.multimodal.parse import (DictEmbeddingItems, ImageItem,
...@@ -694,7 +694,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]): ...@@ -694,7 +694,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
self, self,
mm_items: MultiModalDataItems, mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs, out_mm_kwargs: MultiModalKwargsItems,
) -> Sequence[PromptUpdate]: ) -> Sequence[PromptUpdate]:
placeholders = [("image", self.info.image_pattern), placeholders = [("image", self.info.image_pattern),
("video", self.info.video_pattern)] ("video", self.info.video_pattern)]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment