Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
56d04089
Unverified
Commit
56d04089
authored
Sep 01, 2025
by
Benji Beck
Committed by
GitHub
Sep 02, 2025
Browse files
Migrate Interns1 inputs to TensorSchema (#23510)
Signed-off-by:
Benji Beck
<
benjibeck@meta.com
>
parent
7be0cb8e
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
50 additions
and
51 deletions
+50
-51
vllm/model_executor/models/interns1.py
vllm/model_executor/models/interns1.py
+50
-51
No files found.
vllm/model_executor/models/interns1.py
View file @
56d04089
...
...
@@ -7,7 +7,7 @@
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
typing
import
Literal
,
Optional
,
TypedDict
,
Union
from
typing
import
Annotated
,
Literal
,
Optional
,
Union
import
regex
as
re
import
torch
...
...
@@ -32,6 +32,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
PromptUpdate
,
PromptUpdateDetails
)
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
(
MultiModalEmbeddings
,
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
)
...
...
@@ -62,51 +63,60 @@ class InternS1MultiModalProjector(nn.Module):
return
hidden_states
class
InternS1ImagePixelInputs
(
TypedDict
):
type
:
Literal
[
"pixel_values"
]
pixel_values
:
torch
.
Tensor
class
InternS1ImagePixelInputs
(
TensorSchema
):
"""
Shape:
`(batch_size * num_images * (1 + num_patches), num_channels, height, width)`
Dimensions:
- bnp: Batch size * number of images * (1 + num_patches)
- c: Number of channels (3)
- h: Height
- w: Width
- bn: Batch size * number of images
"""
type
:
Literal
[
"pixel_values"
]
=
"pixel_values"
pixel_values
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bnp"
,
3
,
"h"
,
"w"
)]
num_patches
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bn"
)]
class
InternS1ImageEmbeddingInputs
(
TypedDict
):
type
:
Literal
[
"image_embeds"
]
data
:
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]]
class
InternS1ImageEmbeddingInputs
(
TensorSchema
):
"""
A tensor of shape `(num_images, total_image_feature_size, hidden_size)`
or a list of tensors of shape `(total_image_feature_size, hidden_size)`
`hidden_size` must match the hidden size of language model backbone.
Dimensions:
- ni: Number of images
- tifs: Total image feature size
- hs: Hidden size (must match language model backbone)
"""
type
:
Literal
[
"image_embeds"
]
=
"image_embeds"
data
:
Annotated
[
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]],
TensorShape
(
"ni"
,
"tifs"
,
"hs"
)]
InternS1ImageInputs
=
Union
[
InternS1ImagePixelInputs
,
InternS1ImageEmbeddingInputs
]
class
InternS1VideoPixelInputs
(
TypedDict
):
type
:
Literal
[
"pixel_values_videos"
]
pixel_values
:
torch
.
Tensor
class
InternS1VideoPixelInputs
(
TensorSchema
):
"""
Shape:
`(batch_size * num_video * num_frames, num_channels, height, width)`
Dimensions:
- bnv: Batch size * number of videos * number of frames
- bn: Batch size * number of images
- c: Number of channels (3)
- h: Height
- w: Width
"""
num_patches
:
torch
.
Tensor
"""Shape: `(batch_size * num_images)`"""
type
:
Literal
[
"pixel_values_videos"
]
=
"pixel_values_videos"
pixel_values
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bnv"
,
3
,
"h"
,
"w"
)]
num_patches
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bn"
)]
class
InternS1VideoEmbeddingInputs
(
TypedDict
):
type
:
Literal
[
"video_embeds"
]
data
:
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]]
class
InternS1VideoEmbeddingInputs
(
TensorSchema
):
"""
A tensor of shape `(num_videos, total_video_feature_size, hidden_size)`
or a list of tensors of shape `(total_video_feature_size, hidden_size)`
`hidden_size` must match the hidden size of language model backbone.
Dimensions:
- nv: Number of videos
- tvfs: Total video feature size
- hs: Hidden size (must match language model backbone)
"""
type
:
Literal
[
"video_embeds"
]
=
"video_embeds"
data
:
Annotated
[
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]],
TensorShape
(
"nv"
,
"tvfs"
,
"hs"
)]
InternS1VideoInputs
=
Union
[
InternS1VideoPixelInputs
,
...
...
@@ -572,26 +582,6 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
vit_embeds
=
self
.
multi_modal_projector
(
vit_embeds
)
return
vit_embeds
def
_validate_pixel_values
(
self
,
data
:
torch
.
Tensor
)
->
torch
.
Tensor
:
h
,
w
=
self
.
config
.
vision_config
.
image_size
expected_dims
=
(
3
,
h
,
w
)
def
_validate_shape
(
d
:
torch
.
Tensor
):
actual_dims
=
tuple
(
d
.
shape
)
if
actual_dims
!=
expected_dims
:
expected_expr
=
str
(
expected_dims
)
raise
ValueError
(
"The expected shape of pixel values per image per batch "
f
" per patch is
{
expected_expr
}
. "
f
"You supplied
{
tuple
(
d
.
shape
)
}
."
)
for
d
in
data
:
_validate_shape
(
d
)
return
data
def
_parse_and_validate_image_input
(
self
,
**
kwargs
:
object
)
->
Optional
[
InternS1ImageInputs
]:
pixel_values
=
kwargs
.
pop
(
"pixel_values"
,
None
)
...
...
@@ -627,10 +617,15 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
pixel_values
=
flatten_bn
(
pixel_values
,
concat
=
True
)
image_num_patches
=
flatten_bn
(
image_num_patches
,
concat
=
True
)
h
,
w
=
self
.
config
.
vision_config
.
image_size
return
InternS1ImagePixelInputs
(
type
=
"pixel_values"
,
pixel_values
=
self
.
_validate_pixel_values
(
pixel_values
)
,
pixel_values
=
pixel_values
,
num_patches
=
image_num_patches
,
resolve_bindings
=
{
"h"
:
h
,
"w"
:
w
,
},
)
raise
AssertionError
(
"This line should be unreachable."
)
...
...
@@ -671,11 +666,15 @@ class InternS1ForConditionalGeneration(nn.Module, SupportsMultiModal,
concat
=
True
)
video_num_patches
=
flatten_bn
(
video_num_patches
,
concat
=
True
)
h
,
w
=
self
.
config
.
vision_config
.
image_size
return
InternS1VideoPixelInputs
(
type
=
"pixel_values_videos"
,
pixel_values
=
self
.
_validate_pixel_values
(
pixel_values_flat_video
),
num_patches
=
video_num_patches
,
pixel_values
=
pixel_values_flat_video
,
resolve_bindings
=
{
"h"
:
h
,
"w"
:
w
,
},
)
raise
AssertionError
(
"This line should be unreachable."
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment