Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a482e4e7
Unverified
Commit
a482e4e7
authored
Aug 21, 2025
by
Benji Beck
Committed by
GitHub
Aug 21, 2025
Browse files
Migrate MolmoImageInputs to TensorSchema (#22022)
Signed-off-by:
Benji Beck
<
benjibeck@meta.com
>
parent
e0b056e4
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
23 additions
and
33 deletions
+23
-33
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+23
-33
No files found.
vllm/model_executor/models/molmo.py
View file @
a482e4e7
...
...
@@ -5,7 +5,7 @@ import math
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
dataclasses
import
dataclass
from
functools
import
cached_property
,
partial
from
typing
import
Optional
,
TypedDict
,
Union
from
typing
import
Annotated
,
Optional
,
Union
import
numpy
as
np
import
torch
...
...
@@ -51,6 +51,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
PromptUpdateDetails
)
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
(
MultiModalEmbeddings
,
SupportsLoRA
,
SupportsMultiModal
,
SupportsPP
,
SupportsQuant
)
...
...
@@ -70,23 +71,25 @@ IM_END_TOKEN = "<im_end>"
POOLING_SIZE
=
2
class
MolmoImageInputs
(
TypedDict
):
images
:
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]]
"""Shape: `(batch_size * num_images, num_crops, num_patch, patch_dim)`"""
image_masks
:
Optional
[
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]]]
"""Shape: `(batch_size * num_images, num_crops, num_patch)`"""
feat_is_patch
:
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]]
class
MolmoImageInputs
(
TensorSchema
):
"""
A boolean mask indicating which image features correspond
to patch tokens.
Shape: `(batch_size * num_images, num_crops, num_patch)`
Dimensions:
- bn: Batch size * number of images
- nc: Number of crops
- np: Number of patches
- pd: Patch dimension
"""
images
:
Annotated
[
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]],
TensorShape
(
"bn"
,
"nc"
,
"np"
,
"pd"
)]
image_masks
:
Annotated
[
Optional
[
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]]],
TensorShape
(
"bn"
,
"nc"
,
"np"
)]
feat_is_patch
:
Annotated
[
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]],
TensorShape
(
"bn"
,
"nc"
,
"np"
)]
# A boolean mask indicating which image features correspond to patch tokens.
num_crops
:
torch
.
Tensor
"""Shape: `(batch_size * num_images)`"""
num_crops
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bn"
)]
@
dataclass
...
...
@@ -1410,28 +1413,17 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
**
kwargs
:
object
,
)
->
Optional
[
MolmoImageInputs
]:
images
=
kwargs
.
pop
(
"images"
,
None
)
if
images
is
None
:
return
None
if
not
isinstance
(
images
,
(
torch
.
Tensor
,
list
)):
raise
ValueError
(
"Incorrect type of images. "
f
"Got type:
{
type
(
images
)
}
"
)
image_masks
=
kwargs
.
pop
(
"image_masks"
,
None
)
if
not
(
image_masks
is
None
or
isinstance
(
image_masks
,
(
torch
.
Tensor
,
list
))):
raise
ValueError
(
"Incorrect type of image_masks. "
f
"Got type:
{
type
(
image_masks
)
}
"
)
feat_is_patch
=
kwargs
.
pop
(
"feat_is_patch"
,
None
)
if
not
isinstance
(
feat_is_patch
,
(
torch
.
Tensor
,
list
)):
raise
ValueError
(
"Incorrect type of feat_is_patch. "
f
"Got type:
{
type
(
feat_is_patch
)
}
"
)
num_crops
=
kwargs
.
pop
(
"num_crops"
,
None
)
if
images
is
None
:
return
None
if
not
isinstance
(
num_crops
,
(
torch
.
Tensor
,
list
)):
raise
ValueError
(
"Incorrect type of num_crops. "
f
"Got type:
{
type
(
num_crops
)
}
"
)
num_crops
=
flatten_bn
(
num_crops
,
concat
=
True
)
img_patch_id
=
kwargs
.
pop
(
"img_patch_id"
,
None
)
if
not
isinstance
(
img_patch_id
,
torch
.
Tensor
):
...
...
@@ -1439,8 +1431,6 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA,
f
"Got type:
{
type
(
img_patch_id
)
}
"
)
self
.
img_patch_id
=
img_patch_id
.
flatten
().
unique
().
item
()
num_crops
=
flatten_bn
(
num_crops
,
concat
=
True
)
return
MolmoImageInputs
(
images
=
images
,
image_masks
=
image_masks
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment