Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d128d0d5
Unverified
Commit
d128d0d5
authored
Jul 28, 2025
by
Benji Beck
Committed by
GitHub
Jul 28, 2025
Browse files
Migrate KeyeImageInputs and KeyeVideoInputs to TensorSchema (#21686)
Signed-off-by:
Benji Beck
<
benjibeck@meta.com
>
parent
a6c05028
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
41 additions
and
65 deletions
+41
-65
vllm/model_executor/models/keye.py
vllm/model_executor/models/keye.py
+41
-65
No files found.
vllm/model_executor/models/keye.py
View file @
d128d0d5
...
@@ -3,7 +3,7 @@
...
@@ -3,7 +3,7 @@
import
math
import
math
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
functools
import
partial
from
functools
import
partial
from
typing
import
Any
,
Literal
,
Optional
,
TypedDict
,
Union
from
typing
import
Annotated
,
Any
,
Literal
,
Optional
,
Union
import
numpy
as
np
import
numpy
as
np
import
torch
import
torch
...
@@ -46,6 +46,7 @@ from vllm.sequence import IntermediateTensors
...
@@ -46,6 +46,7 @@ from vllm.sequence import IntermediateTensors
from
vllm.transformers_utils.config
import
uses_mrope
from
vllm.transformers_utils.config
import
uses_mrope
from
vllm.transformers_utils.processor
import
(
from
vllm.transformers_utils.processor
import
(
cached_image_processor_from_config
)
cached_image_processor_from_config
)
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
(
MultiModalEmbeddings
,
SupportsLoRA
,
from
.interfaces
import
(
MultiModalEmbeddings
,
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
)
SupportsMultiModal
,
SupportsPP
)
...
@@ -102,77 +103,62 @@ def smart_resize(
...
@@ -102,77 +103,62 @@ def smart_resize(
return
h_bar
,
w_bar
return
h_bar
,
w_bar
class
KeyeImagePixelInputs
(
TypedDict
):
class
KeyeImagePixelInputs
(
TensorSchema
):
type
:
Literal
[
"pixel_values"
]
pixel_values
:
torch
.
Tensor
"""Shape:
`(num_patches, num_channels * patch_size * patch_size)`
"""
"""
Dimensions:
image_grid_thw
:
torch
.
Tensor
- np: Number of patches
"""Shape: `(num_images, 3)`
- cps: Number of channels * patch_size * patch_size
This should be in `(grid_t, grid_h, grid_w)` format.
- ni: Number of images
- g: Grid dimensions (3 for t, h, w)
"""
"""
type
:
Literal
[
"pixel_values"
]
pixel_values
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"np"
,
"cps"
)]
image_grid_thw
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"ni"
,
3
)]
class
KeyeImageEmbeddingInputs
(
TypedDict
):
class
KeyeImageEmbeddingInputs
(
TensorSchema
):
type
:
Literal
[
"image_embeds"
]
image_embeds
:
torch
.
Tensor
"""Supported types:
- list[`torch.Tensor`]: A list of tensors holding all images' features.
Each tensor holds an image's features.
- `torch.Tensor`: A tensor holding all images' features
(concatenation of all images' feature tensors).
Tensor shape: `(num_image_features, hidden_size)`
- `num_image_features` varies based on
the number and resolution of the images.
- `hidden_size` must match the hidden size of language model backbone.
"""
"""
Dimensions:
image_grid_thw
:
torch
.
Tensor
- nf: Number of image features
"""Shape: `(num_images, 3)`
- hs: Hidden size (must match the hidden size of language model
This should be in `(grid_t, grid_h, grid_w)` format.
backbone)
- ni: Number of images
- g: Grid dimensions (3 for t, h, w)
"""
"""
type
:
Literal
[
"image_embeds"
]
image_embeds
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"nf"
,
"hs"
)]
image_grid_thw
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"ni"
,
3
)]
KeyeImageInputs
=
Union
[
KeyeImagePixelInputs
,
KeyeImageEmbeddingInputs
]
KeyeImageInputs
=
Union
[
KeyeImagePixelInputs
,
KeyeImageEmbeddingInputs
]
class
KeyeVideoPixelInputs
(
TypedDict
):
class
KeyeVideoPixelInputs
(
TensorSchema
):
type
:
Literal
[
"pixel_values_videos"
]
pixel_values_videos
:
torch
.
Tensor
"""Shape:
`(num_patches,
num_channels * temporal_patch_size * patch_size * patch_size)`
"""
"""
Dimensions:
video_grid_thw
:
torch
.
Tensor
- np: Number of patches
"""Shape: `(num_videos, 3)`
- ctps: Number of channels * temporal_patch_size * patch_size *
patch_size
This should be in `(grid_t, grid_h, grid_w)` format.
- nv: Number of videos
- g: Grid dimensions (3 for t, h, w)
"""
"""
type
:
Literal
[
"pixel_values_videos"
]
pixel_values_videos
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"np"
,
"ctps"
)]
video_grid_thw
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"nv"
,
3
)]
class
KeyeVideoEmbeddingInputs
(
TypedDict
):
class
KeyeVideoEmbeddingInputs
(
TensorSchema
):
type
:
Literal
[
"video_embeds"
]
video_embeds
:
torch
.
Tensor
"""Supported types:
- list[`torch.Tensor`]: A list of tensors holding all videos' features.
Each tensor holds a video's features.
- `torch.Tensor`: A tensor holding all videos' features
(concatenation of all videos' feature tensors).
Tensor shape: `(num_video_features, hidden_size)`
- `num_video_features` varies based on
the number and resolution of the videos.
- `hidden_size` must match the hidden size of language model backbone.
"""
"""
Dimensions:
video_grid_thw
:
torch
.
Tensor
- nf: Number of video features
"""Shape: `(num_videos, 3)`
- hs: Hidden size (must match the hidden size of language model
This should be in `(grid_t, grid_h, grid_w)` format.
backbone)
- nv: Number of videos
- g: Grid dimensions (3 for t, h, w)
"""
"""
type
:
Literal
[
"video_embeds"
]
video_embeds
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"nf"
,
"hs"
)]
video_grid_thw
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"nv"
,
3
)]
KeyeVideoInputs
=
Union
[
KeyeVideoPixelInputs
,
KeyeVideoEmbeddingInputs
]
KeyeVideoInputs
=
Union
[
KeyeVideoPixelInputs
,
KeyeVideoEmbeddingInputs
]
...
@@ -1420,10 +1406,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
...
@@ -1420,10 +1406,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
image_grid_thw
=
self
.
_validate_and_reshape_mm_tensor
(
image_grid_thw
=
self
.
_validate_and_reshape_mm_tensor
(
image_grid_thw
,
"image grid_thw"
)
image_grid_thw
,
"image grid_thw"
)
if
not
isinstance
(
pixel_values
,
(
torch
.
Tensor
,
list
)):
raise
ValueError
(
"Incorrect type of image pixel values. "
f
"Got type:
{
type
(
pixel_values
)
}
"
)
return
KeyeImagePixelInputs
(
return
KeyeImagePixelInputs
(
type
=
"pixel_values"
,
type
=
"pixel_values"
,
pixel_values
=
pixel_values
,
pixel_values
=
pixel_values
,
...
@@ -1436,9 +1418,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
...
@@ -1436,9 +1418,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
image_grid_thw
=
self
.
_validate_and_reshape_mm_tensor
(
image_grid_thw
=
self
.
_validate_and_reshape_mm_tensor
(
image_grid_thw
,
"image grid_thw"
)
image_grid_thw
,
"image grid_thw"
)
if
not
isinstance
(
image_embeds
,
torch
.
Tensor
):
raise
ValueError
(
"Incorrect type of image embeddings. "
f
"Got type:
{
type
(
image_embeds
)
}
"
)
return
KeyeImageEmbeddingInputs
(
return
KeyeImageEmbeddingInputs
(
type
=
"image_embeds"
,
type
=
"image_embeds"
,
image_embeds
=
image_embeds
,
image_embeds
=
image_embeds
,
...
@@ -1474,9 +1453,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
...
@@ -1474,9 +1453,6 @@ class KeyeForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA,
video_grid_thw
=
self
.
_validate_and_reshape_mm_tensor
(
video_grid_thw
=
self
.
_validate_and_reshape_mm_tensor
(
video_grid_thw
,
"video grid_thw"
)
video_grid_thw
,
"video grid_thw"
)
if
not
isinstance
(
video_embeds
,
torch
.
Tensor
):
raise
ValueError
(
"Incorrect type of video embeddings. "
f
"Got type:
{
type
(
video_embeds
)
}
"
)
return
KeyeVideoEmbeddingInputs
(
return
KeyeVideoEmbeddingInputs
(
type
=
"video_embeds"
,
type
=
"video_embeds"
,
video_embeds
=
video_embeds
,
video_embeds
=
video_embeds
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment