"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "470484a4f503d4768008c2f5a8dc828dc90633b4"
Unverified commit d128d0d5, authored by Benji Beck, committed by GitHub
Browse files

Migrate KeyeImageInputs and KeyeVideoInputs to TensorSchema (#21686)


Signed-off-by: Benji Beck <benjibeck@meta.com>
parent a6c05028
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import math import math
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from functools import partial from functools import partial
from typing import Any, Literal, Optional, TypedDict, Union from typing import Annotated, Any, Literal, Optional, Union
import numpy as np import numpy as np
import torch import torch
...@@ -46,6 +46,7 @@ from vllm.sequence import IntermediateTensors ...@@ -46,6 +46,7 @@ from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.config import uses_mrope
from vllm.transformers_utils.processor import ( from vllm.transformers_utils.processor import (
cached_image_processor_from_config) cached_image_processor_from_config)
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import (MultiModalEmbeddings, SupportsLoRA, from .interfaces import (MultiModalEmbeddings, SupportsLoRA,
SupportsMultiModal, SupportsPP) SupportsMultiModal, SupportsPP)
...@@ -102,77 +103,62 @@ def smart_resize( ...@@ -102,77 +103,62 @@ def smart_resize(
return h_bar, w_bar return h_bar, w_bar
class KeyeImagePixelInputs(TypedDict): class KeyeImagePixelInputs(TensorSchema):
type: Literal["pixel_values"]
pixel_values: torch.Tensor
"""Shape:
`(num_patches, num_channels * patch_size * patch_size)`
""" """
Dimensions:
image_grid_thw: torch.Tensor - np: Number of patches
"""Shape: `(num_images, 3)` - cps: Number of channels * patch_size * patch_size
This should be in `(grid_t, grid_h, grid_w)` format. - ni: Number of images
- g: Grid dimensions (3 for t, h, w)
""" """
type: Literal["pixel_values"]
pixel_values: Annotated[torch.Tensor, TensorShape("np", "cps")]
image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
class KeyeImageEmbeddingInputs(TypedDict): class KeyeImageEmbeddingInputs(TensorSchema):
type: Literal["image_embeds"]
image_embeds: torch.Tensor
"""Supported types:
- list[`torch.Tensor`]: A list of tensors holding all images' features.
Each tensor holds an image's features.
- `torch.Tensor`: A tensor holding all images' features
(concatenation of all images' feature tensors).
Tensor shape: `(num_image_features, hidden_size)`
- `num_image_features` varies based on
the number and resolution of the images.
- `hidden_size` must match the hidden size of language model backbone.
""" """
Dimensions:
image_grid_thw: torch.Tensor - nf: Number of image features
"""Shape: `(num_images, 3)` - hs: Hidden size (must match the hidden size of language model
This should be in `(grid_t, grid_h, grid_w)` format. backbone)
- ni: Number of images
- g: Grid dimensions (3 for t, h, w)
""" """
type: Literal["image_embeds"]
image_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")]
image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]
KeyeImageInputs = Union[KeyeImagePixelInputs, KeyeImageEmbeddingInputs] KeyeImageInputs = Union[KeyeImagePixelInputs, KeyeImageEmbeddingInputs]
class KeyeVideoPixelInputs(TypedDict): class KeyeVideoPixelInputs(TensorSchema):
type: Literal["pixel_values_videos"]
pixel_values_videos: torch.Tensor
"""Shape:
`(num_patches,
num_channels * temporal_patch_size * patch_size * patch_size)`
""" """
Dimensions:
video_grid_thw: torch.Tensor - np: Number of patches
"""Shape: `(num_videos, 3)` - ctps: Number of channels * temporal_patch_size * patch_size *
patch_size
This should be in `(grid_t, grid_h, grid_w)` format. - nv: Number of videos
- g: Grid dimensions (3 for t, h, w)
""" """
type: Literal["pixel_values_videos"]
pixel_values_videos: Annotated[torch.Tensor, TensorShape("np", "ctps")]
video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)]
class KeyeVideoEmbeddingInputs(TypedDict): class KeyeVideoEmbeddingInputs(TensorSchema):
type: Literal["video_embeds"]
video_embeds: torch.Tensor
"""Supported types:
- list[`torch.Tensor`]: A list of tensors holding all videos' features.
Each tensor holds an video's features.
- `torch.Tensor`: A tensor holding all videos' features
(concatenation of all videos' feature tensors).
Tensor shape: `(num_image_features, hidden_size)`
- `num_image_features` varies based on
the number and resolution of the videos.
- `hidden_size` must match the hidden size of language model backbone.
""" """
Dimensions:
video_grid_thw: torch.Tensor - nf: Number of video features
"""Shape: `(num_videos, 3)` - hs: Hidden size (must match the hidden size of language model
This should be in `(grid_t, grid_h, grid_w)` format. backbone)
- nv: Number of videos
- g: Grid dimensions (3 for t, h, w)
""" """
type: Literal["video_embeds"]
video_embeds: Annotated[torch.Tensor, TensorShape("nf", "hs")]
video_grid_thw: Annotated[torch.Tensor, TensorShape("nv", 3)]
KeyeVideoInputs = Union[KeyeVideoPixelInputs, KeyeVideoEmbeddingInputs] KeyeVideoInputs = Union[KeyeVideoPixelInputs, KeyeVideoEmbeddingInputs]
...@@ -1420,10 +1406,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, ...@@ -1420,10 +1406,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
image_grid_thw = self._validate_and_reshape_mm_tensor( image_grid_thw = self._validate_and_reshape_mm_tensor(
image_grid_thw, "image grid_thw") image_grid_thw, "image grid_thw")
if not isinstance(pixel_values, (torch.Tensor, list)):
raise ValueError("Incorrect type of image pixel values. "
f"Got type: {type(pixel_values)}")
return KeyeImagePixelInputs( return KeyeImagePixelInputs(
type="pixel_values", type="pixel_values",
pixel_values=pixel_values, pixel_values=pixel_values,
...@@ -1436,9 +1418,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, ...@@ -1436,9 +1418,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
image_grid_thw = self._validate_and_reshape_mm_tensor( image_grid_thw = self._validate_and_reshape_mm_tensor(
image_grid_thw, "image grid_thw") image_grid_thw, "image grid_thw")
if not isinstance(image_embeds, torch.Tensor):
raise ValueError("Incorrect type of image embeddings. "
f"Got type: {type(image_embeds)}")
return KeyeImageEmbeddingInputs( return KeyeImageEmbeddingInputs(
type="image_embeds", type="image_embeds",
image_embeds=image_embeds, image_embeds=image_embeds,
...@@ -1474,9 +1453,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, ...@@ -1474,9 +1453,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
video_grid_thw = self._validate_and_reshape_mm_tensor( video_grid_thw = self._validate_and_reshape_mm_tensor(
video_grid_thw, "video grid_thw") video_grid_thw, "video grid_thw")
if not isinstance(video_embeds, torch.Tensor):
raise ValueError("Incorrect type of video embeddings. "
f"Got type: {type(video_embeds)}")
return KeyeVideoEmbeddingInputs( return KeyeVideoEmbeddingInputs(
type="video_embeds", type="video_embeds",
video_embeds=video_embeds, video_embeds=video_embeds,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment