"vllm/vscode:/vscode.git/clone" did not exist on "a482e4e7696141cba45180d7463ddde691d512ae"
Unverified commit 5f8c9a42, authored by Benji Beck, committed by GitHub
Browse files

Migrate Florence2ImagePixelInputs to TensorSchema (#21663)


Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent 1cbf951b
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
import math import math
from collections import OrderedDict from collections import OrderedDict
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from typing import Literal, Optional, TypedDict, Union from typing import Annotated, Literal, Optional, Union
import torch import torch
import torch.nn as nn import torch.nn as nn
...@@ -29,16 +29,28 @@ from vllm.multimodal.processing import (BaseProcessingInfo, ...@@ -29,16 +29,28 @@ from vllm.multimodal.processing import (BaseProcessingInfo,
PromptUpdate) PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, from .interfaces import (MultiModalEmbeddings, SupportsMultiModal,
SupportsV0Only) SupportsV0Only)
from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings
class Florence2ImagePixelInputs(TypedDict): class Florence2ImagePixelInputs(TensorSchema):
"""
Dimensions:
- b: Batch size
- c: Number of channels (3)
- h: Height of the image
- w: Width of the image
"""
type: Literal["pixel_values"] type: Literal["pixel_values"]
data: torch.Tensor
"""Shape: (batch_size, num_channel, height, width)""" data: Annotated[
torch.Tensor,
TensorShape("b", 3, "h", "w"),
]
# ViT implementation are all copied from # ViT implementation are all copied from
...@@ -931,28 +943,6 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -931,28 +943,6 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
raise NotImplementedError( raise NotImplementedError(
'Florence2 only supports COSINE as temporal embedding.') 'Florence2 only supports COSINE as temporal embedding.')
def _validate_pixel_values(
    self, data: Union[torch.Tensor, list[torch.Tensor]]
) -> Union[torch.Tensor, list[torch.Tensor]]:
    """Check that every item in ``data`` has the configured image shape.

    The expected per-item shape is ``(3, height, width)``, where ``height``
    and ``width`` are read from ``self.processor_config["size"]``.

    Args:
        data: A tensor or list of tensors; it is iterated and each yielded
            item's ``shape`` is compared against the expected shape.

    Returns:
        ``data`` itself, unchanged, when every item matches.

    Raises:
        ValueError: If any item's shape differs from the expected shape.
    """
    size = self.processor_config["size"]
    expected_dims = (3, size["height"], size["width"])

    for d in data:
        actual_dims = tuple(d.shape)
        if actual_dims != expected_dims:
            # Build the tuple from the iterator itself. Unpacking it with
            # ``tuple(*map(...))`` passes three positional arguments to
            # ``tuple`` and raises TypeError, masking the ValueError below.
            expected_expr = tuple(map(str, expected_dims))
            raise ValueError(
                "The expected shape of pixel values per batch "
                f"is {expected_expr}. You supplied {actual_dims}.")

    return data
def _parse_and_validate_image_input(self, **kwargs: object): def _parse_and_validate_image_input(self, **kwargs: object):
pixel_values: Optional[Union[list[list[torch.Tensor]], pixel_values: Optional[Union[list[list[torch.Tensor]],
list[torch.Tensor], list[torch.Tensor],
...@@ -971,10 +961,16 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -971,10 +961,16 @@ class Florence2ForConditionalGeneration(nn.Module, SupportsMultiModal,
"Both pixel values and image embeds are provided.") "Both pixel values and image embeds are provided.")
if pixel_values is not None: if pixel_values is not None:
size = self.processor_config["size"]
expected_h, expected_w = size["height"], size["width"]
return Florence2ImagePixelInputs( return Florence2ImagePixelInputs(
type="pixel_values", type="pixel_values",
data=self._validate_pixel_values( data=flatten_bn(pixel_values, concat=True),
flatten_bn(pixel_values, concat=True)), resolve_bindings={
"h": expected_h,
"w": expected_w
},
) )
if image_embeds is not None: if image_embeds is not None:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment