Unverified commit 4fb349f6, authored by Lifu Huang, committed by GitHub
Browse files

Fix copy-paste error in phi4mm image processing (#18315)


Signed-off-by: Lifu Huang <lifu.hlf@gmail.com>
parent 908733ac
...@@ -415,15 +415,6 @@ class Phi4MMImagePixelInputs(TypedDict): ...@@ -415,15 +415,6 @@ class Phi4MMImagePixelInputs(TypedDict):
"""Shape: `(batch_size * num_images, H_mask, W_mask)`""" """Shape: `(batch_size * num_images, H_mask, W_mask)`"""
class Phi4MMImageEmbeddingInputs(TypedDict):
type: Literal["image_embeds"]
data: Union[torch.Tensor, list[torch.Tensor]]
"""Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
`hidden_size` must match the hidden size of language model backbone.
"""
class Phi4MMAudioFeatureInputs(TypedDict): class Phi4MMAudioFeatureInputs(TypedDict):
type: Literal["audio_features"] type: Literal["audio_features"]
data: Union[torch.Tensor, list[torch.Tensor]] data: Union[torch.Tensor, list[torch.Tensor]]
...@@ -436,7 +427,6 @@ class Phi4MMAudioEmbeddingInputs(TypedDict): ...@@ -436,7 +427,6 @@ class Phi4MMAudioEmbeddingInputs(TypedDict):
"""Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)""" """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)"""
Phi4MMImageInput = Union[Phi4MMImagePixelInputs, Phi4MMImageEmbeddingInputs]
Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs] Phi4MMAudioInputs = Union[Phi4MMAudioFeatureInputs, Phi4MMAudioEmbeddingInputs]
...@@ -1112,9 +1102,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): ...@@ -1112,9 +1102,7 @@ class Phi4MMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
def _process_image_input( def _process_image_input(
self, image_input: Phi4MMImagePixelInputs) -> list[torch.Tensor]: self, image_input: Phi4MMImagePixelInputs) -> list[torch.Tensor]:
if image_input["type"] == "image_embeds":
image_embeds = image_input["image_embeds"].type(self.visual.dtype)
else:
dtype = next(self.vision_encoder.parameters()).dtype dtype = next(self.vision_encoder.parameters()).dtype
pixel_values = image_input['data'].to(dtype) pixel_values = image_input['data'].to(dtype)
image_sizes = image_input['image_sizes'] image_sizes = image_input['image_sizes']
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment