Unverified commit 90f9c2eb, authored by Russell Bryant and committed by GitHub
Browse files

[V1] Change return type on get_multimodal_embeddings() (#19446)


Signed-off-by: Russell Bryant <rbryant@redhat.com>
parent 387bdf0a
...@@ -794,11 +794,10 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -794,11 +794,10 @@ class Llama4ForConditionalGeneration(nn.Module, SupportsMultiModal,
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.language_model return self.language_model
def get_multimodal_embeddings(self, def get_multimodal_embeddings(self, **kwargs) -> MultiModalEmbeddings:
**kwargs) -> Optional[MultiModalEmbeddings]:
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
return None return []
return self._process_image_input(image_input) return self._process_image_input(image_input)
......
...@@ -1473,11 +1473,11 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA, ...@@ -1473,11 +1473,11 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.model return self.model
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
return None return []
return self._process_image_input(image_input) return self._process_image_input(image_input)
......
...@@ -499,11 +499,11 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -499,11 +499,11 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
return tuple(vision_embeddings) return tuple(vision_embeddings)
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
return None return []
image_features = self._process_image_input(image_input) image_features = self._process_image_input(image_input)
......
...@@ -338,11 +338,11 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -338,11 +338,11 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.language_model return self.language_model
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
return None return []
vision_embeddings = self._process_image_input(image_input) vision_embeddings = self._process_image_input(image_input)
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa # https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa
vision_embeddings = vision_embeddings * (self.config.hidden_size**-0.5) vision_embeddings = vision_embeddings * (self.config.hidden_size**-0.5)
......
...@@ -655,11 +655,11 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, ...@@ -655,11 +655,11 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.language_model return self.language_model
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
return None return []
vision_embeddings = self._process_image_input(image_input) vision_embeddings = self._process_image_input(image_input)
return vision_embeddings return vision_embeddings
...@@ -669,7 +669,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, ...@@ -669,7 +669,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP,
multimodal_embeddings: Optional[MultiModalEmbeddings] = None, multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
) -> torch.Tensor: ) -> torch.Tensor:
inputs_embeds = self.embed_tokens(input_ids) inputs_embeds = self.embed_tokens(input_ids)
if multimodal_embeddings is not None: if multimodal_embeddings:
inputs_embeds = merge_multimodal_embeddings( inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, multimodal_embeddings, input_ids, inputs_embeds, multimodal_embeddings,
self.image_token_id) self.image_token_id)
......
...@@ -1112,11 +1112,12 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): ...@@ -1112,11 +1112,12 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
image_attention_mask) image_attention_mask)
return image_embeds return image_embeds
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
modalities = self._parse_and_validate_multimodal_inputs(**kwargs) modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
if not modalities: if not modalities:
return []
return None return None
# The result multimodal_embeddings is tuple of tensors, with each # The result multimodal_embeddings is tuple of tensors, with each
......
...@@ -409,11 +409,11 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -409,11 +409,11 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.language_model return self.language_model
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
return None return []
return self._process_image_input(image_input) return self._process_image_input(image_input)
......
...@@ -772,13 +772,13 @@ class Qwen2_5OmniThinkerForConditionalGeneration( ...@@ -772,13 +772,13 @@ class Qwen2_5OmniThinkerForConditionalGeneration(
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.language_model return self.language_model
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
mm_input_by_modality = self._parse_and_validate_multimodal_inputs( mm_input_by_modality = self._parse_and_validate_multimodal_inputs(
**kwargs) **kwargs)
if not mm_input_by_modality: if not mm_input_by_modality:
return None return []
# The result multimodal_embeddings is tuple of tensors, with each # The result multimodal_embeddings is tuple of tensors, with each
# tensor corresponding to a multimodal data item (image or video). # tensor corresponding to a multimodal data item (image or video).
......
...@@ -1016,13 +1016,13 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1016,13 +1016,13 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.language_model return self.language_model
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
mm_input_by_modality = self._parse_and_validate_multimodal_inputs( mm_input_by_modality = self._parse_and_validate_multimodal_inputs(
**kwargs) **kwargs)
if not mm_input_by_modality: if not mm_input_by_modality:
return None return []
# The result multimodal_embeddings is tuple of tensors, with each # The result multimodal_embeddings is tuple of tensors, with each
# tensor corresponding to a multimodal data item (image or video). # tensor corresponding to a multimodal data item (image or video).
......
...@@ -350,11 +350,11 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -350,11 +350,11 @@ class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal,
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.language_model return self.language_model
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
audio_input = self._parse_and_validate_audio_input(**kwargs) audio_input = self._parse_and_validate_audio_input(**kwargs)
if audio_input is None: if audio_input is None:
return None return []
masked_audio_features = self._process_audio_input(audio_input) masked_audio_features = self._process_audio_input(audio_input)
return masked_audio_features return masked_audio_features
......
...@@ -1257,11 +1257,12 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -1257,11 +1257,12 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.language_model return self.language_model
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
modalities = self._parse_and_validate_multimodal_inputs(**kwargs) modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
if not modalities: if not modalities:
return []
return None return None
# The result multimodal_embeddings is tuple of tensors, with each # The result multimodal_embeddings is tuple of tensors, with each
......
...@@ -738,11 +738,11 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA, ...@@ -738,11 +738,11 @@ class QwenVLForConditionalGeneration(QWenBaseModel, SupportsPP, SupportsLoRA,
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.transformer return self.transformer
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
return None return []
vision_embeddings = self._process_image_input(image_input) vision_embeddings = self._process_image_input(image_input)
return vision_embeddings return vision_embeddings
......
...@@ -869,11 +869,11 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -869,11 +869,11 @@ class SkyworkR1VChatModel(nn.Module, SupportsMultiModal, SupportsPP):
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.language_model return self.language_model
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
return None return []
return self._process_image_input(image_input) return self._process_image_input(image_input)
......
...@@ -585,11 +585,11 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -585,11 +585,11 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal,
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.language_model return self.language_model
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
image_input = self._parse_and_validate_image_input(**kwargs) image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None: if image_input is None:
return None return []
return self._process_image_input(image_input) return self._process_image_input(image_input)
def get_input_embeddings( def get_input_embeddings(
......
...@@ -546,11 +546,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): ...@@ -546,11 +546,11 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.language_model return self.language_model
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
audio_input = self._parse_and_validate_audio_input(**kwargs) audio_input = self._parse_and_validate_audio_input(**kwargs)
if audio_input is None: if audio_input is None:
return None return []
audio_embeddings = self._process_audio_input(audio_input) audio_embeddings = self._process_audio_input(audio_input)
return audio_embeddings return audio_embeddings
......
...@@ -687,8 +687,8 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription, ...@@ -687,8 +687,8 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
def get_language_model(self) -> torch.nn.Module: def get_language_model(self) -> torch.nn.Module:
return self.model.decoder return self.model.decoder
def get_multimodal_embeddings( def get_multimodal_embeddings(self,
self, **kwargs: object) -> Optional[MultiModalEmbeddings]: **kwargs: object) -> MultiModalEmbeddings:
# TODO: This method does not obey the interface for SupportsMultiModal. # TODO: This method does not obey the interface for SupportsMultiModal.
# Refactor this once encoder/decoder support is implemented in V1. # Refactor this once encoder/decoder support is implemented in V1.
audio_input = self._parse_and_validate_audio_input(**kwargs) audio_input = self._parse_and_validate_audio_input(**kwargs)
......
...@@ -4,11 +4,12 @@ from typing import Optional ...@@ -4,11 +4,12 @@ from typing import Optional
import torch import torch
from vllm.model_executor.models.interfaces import MultiModalEmbeddings
from vllm.v1.kv_cache_interface import KVCacheGroupSpec from vllm.v1.kv_cache_interface import KVCacheGroupSpec
def sanity_check_mm_encoder_outputs( def sanity_check_mm_encoder_outputs(
mm_embeddings: object, mm_embeddings: MultiModalEmbeddings,
expected_num_items: int, expected_num_items: int,
) -> None: ) -> None:
""" """
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment