Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
08d26a1b
Unverified
Commit
08d26a1b
authored
Oct 07, 2025
by
Isotr0py
Committed by
GitHub
Oct 07, 2025
Browse files
[Model] Use `merge_by_field_config` for MM models (Ovis family) (#26308)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
parent
63773a62
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing 5 changed files with 80 additions and 75 deletions
+80
-75
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+2
-6
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+3
-5
vllm/model_executor/models/ovis.py
vllm/model_executor/models/ovis.py
+12
-13
vllm/model_executor/models/ovis2_5.py
vllm/model_executor/models/ovis2_5.py
+62
-50
vllm/transformers_utils/processors/ovis.py
vllm/transformers_utils/processors/ovis.py
+1
-1
No files found.
examples/offline_inference/vision_language.py
View file @
08d26a1b
...
...
@@ -1140,14 +1140,10 @@ def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
elif
modality
==
"video"
:
placeholder
=
"<video>"
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
messages
=
[
[{
"role"
:
"user"
,
"content"
:
f
"
{
placeholder
}
\n
{
question
}
"
}]
prompts
=
[
f
"<|im_start|>user
\n\n
{
placeholder
}
\n
{
question
}
<|im_end|>
\n
<|im_start|>assistant
\n
"
for
question
in
questions
]
prompts
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
)
return
ModelRequestData
(
engine_args
=
engine_args
,
...
...
examples/offline_inference/vision_language_multi_image.py
View file @
08d26a1b
...
...
@@ -713,11 +713,9 @@ def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
placeholders
=
"
\n
"
.
join
(
f
"Image-
{
i
}
: <image>
\n
"
for
i
,
_
in
enumerate
(
image_urls
,
start
=
1
)
)
messages
=
[{
"role"
:
"user"
,
"content"
:
f
"
{
placeholders
}
\n
{
question
}
"
}]
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
trust_remote_code
=
True
)
prompt
=
tokenizer
.
apply_chat_template
(
messages
,
tokenize
=
False
,
add_generation_prompt
=
True
prompt
=
(
f
"<|im_start|>user
\n\n
{
placeholders
}
\n
{
question
}
<|im_end|>
\n
"
"<|im_start|>assistant
\n
"
)
return
ModelRequestData
(
...
...
vllm/model_executor/models/ovis.py
View file @
08d26a1b
...
...
@@ -217,17 +217,17 @@ class VisualTokenizer(torch.nn.Module):
class
OvisImagePatchInputs
(
TensorSchema
):
"""
Dimensions:
- batch_patches: Batch size * number of patches
- patch_size: patch_size_x * patch_size_y * num_channels
- bnp: Batch size * number of images * number of patches
- h: Height of each patch
- w: Width of each patch
- patch_indicators: Batch size * (number of patches + 1)
- patches_per_image: List of number of total patches for each image
in the batch.
- bn: Batch size * number of images
"""
type
:
Literal
[
"image_patches"
]
flat_data
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"b
atch_patches"
,
"patch_size
"
)]
flat_data
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"b
np"
,
3
,
"h"
,
"w
"
)]
indicator_tokens
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"patch_indicators"
)]
patches_per_image
:
Annotated
[
list
[
int
],
TensorShape
(
"n
um_patches_per_image
"
)]
patches_per_image
:
Annotated
[
list
[
int
],
TensorShape
(
"
b
n"
)]
# This is used to restore the first two dimensions of `flat_data`.
...
...
@@ -366,7 +366,7 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
self
.
image_indicators_to_visual_tokens
(
indicator
)
for
indicator
in
image_indicators
]
processed_outputs
[
"indicator_tokens"
]
=
indicator_tokens
processed_outputs
[
"indicator_tokens"
]
=
torch
.
tensor
(
indicator_tokens
)
return
processed_outputs
def
_apply_hf_processor_tokens_only
(
...
...
@@ -414,6 +414,8 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
dummy_inputs
=
OvisDummyInputsBuilder
,
)
class
Ovis
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
merge_by_field_config
=
True
@
classmethod
def
get_placeholder_str
(
cls
,
modality
:
str
,
i
:
int
)
->
Optional
[
str
]:
if
modality
.
startswith
(
"image"
):
...
...
@@ -470,14 +472,11 @@ class Ovis(nn.Module, SupportsMultiModal, SupportsPP):
f
"Got type:
{
type
(
pixel_values
)
}
"
)
flat_data
=
flatten_bn
(
pixel_values
,
concat
=
True
)
if
flat_data
.
ndim
>=
3
:
flat_data
=
flat_data
.
flatten
(
start_dim
=
1
)
return
OvisImagePatchInputs
(
type
=
"image_patches"
,
flat_data
=
flat
_data
,
patches_per_image
=
[
x
.
shape
[
0
]
for
x
in
flatten_bn
(
pixel_values
)
],
indicator_tokens
=
flatten_bn
(
flatten_bn
(
indicator_tokens
)
,
concat
=
True
),
flat_data
=
flat
ten_bn
(
pixel_values
,
concat
=
True
)
,
patches_per_image
=
[
x
.
shape
[
0
]
for
x
in
pixel_values
],
indicator_tokens
=
flatten_bn
(
indicator_tokens
,
concat
=
True
),
)
raise
AssertionError
(
"This line should be unreachable."
)
...
...
vllm/model_executor/models/ovis2_5.py
View file @
08d26a1b
...
...
@@ -4,7 +4,7 @@
from
collections.abc
import
Iterable
,
Mapping
from
functools
import
partial
from
typing
import
Literal
,
Optional
,
TypedDict
,
Union
from
typing
import
Annotated
,
Literal
,
Optional
,
Union
import
torch
import
torch.nn
as
nn
...
...
@@ -14,7 +14,7 @@ from vllm.config import VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.models.ovis
import
OvisImagePatchInputs
,
VisualEmbedding
from
vllm.model_executor.models.ovis
import
VisualEmbedding
from
vllm.model_executor.models.siglip2navit
import
Siglip2NavitModel
from
vllm.model_executor.models.utils
import
(
AutoWeightsLoader
,
...
...
@@ -37,6 +37,7 @@ from vllm.multimodal.processing import (
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.processors.ovis2_5
import
Ovis2_5Processor
from
vllm.utils.tensor_schema
import
TensorSchema
,
TensorShape
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
...
...
@@ -58,36 +59,38 @@ IMAGE_PAD_TOKEN_ID_MAP = {
}
class
OvisVideoPatchInputs
(
TypedDict
):
type
:
Literal
[
"video_patches"
]
flat_data
:
torch
.
Tensor
class
Ovis2_5ImagePatchInputs
(
TensorSchema
):
"""
Shape:
`(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)`
Dimensions:
- bnp: Batch size * number of images * number of patches
- patch_size: patch_size_x * patch_size_y * num_channels
- patch_indicators: Batch size * (number of patches + 1)
- bn: Batch size * number of images
"""
indicator_tokens
:
torch
.
Tensor
"""
Shape:
`(batch_size * (num_patches + 1))`
"""
type
:
Literal
[
"image_patches"
]
flat_data
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bnp"
,
"patch_size"
)]
indicator_tokens
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"patch_indicators"
)]
patches_per_item
:
Annotated
[
list
[
int
],
TensorShape
(
"bn"
)]
grids
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bn"
,
3
)]
# This is used to restore the first two dimensions of `flat_data`.
patches_per_image
:
list
[
int
]
class
Ovis2_5VideoPatchInputs
(
TensorSchema
):
"""
List of number of total patches for each frame in the video.
This is used to restore the first two dimensions of `flat_data`.
Dimensions:
- bnp: Batch size * number of videos * number of patches
- patch_size: patch_size_x * patch_size_y * num_channels
- patch_indicators: Batch size * (number of patches + 1)
- bn: Batch size * number of videos
"""
def
_ovis2_5_field_config
():
return
dict
(
pixel_values
=
MultiModalFieldConfig
.
batched
(
"image"
),
grids
=
MultiModalFieldConfig
.
batched
(
"image"
),
indicator_tokens
=
MultiModalFieldConfig
.
batched
(
"image"
),
video_pixel_values
=
MultiModalFieldConfig
.
batched
(
"video"
),
video_indicator_tokens
=
MultiModalFieldConfig
.
batched
(
"video"
),
video_grids
=
MultiModalFieldConfig
.
batched
(
"video"
),
)
type
:
Literal
[
"video_patches"
]
flat_data
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bnp"
,
"patch_size"
)]
indicator_tokens
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"patch_indicators"
)]
patches_per_item
:
Annotated
[
list
[
int
],
TensorShape
(
"bn"
)]
grids
:
Annotated
[
torch
.
Tensor
,
TensorShape
(
"bn"
,
3
)]
# This is used to restore the first two dimensions of `flat_data`.
class
VisualTokenizer
(
torch
.
nn
.
Module
):
...
...
@@ -380,7 +383,7 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo])
self
.
visual_indicators_to_visual_tokens
(
indicator
)
for
indicator
in
visual_indicators
]
processed_outputs
[
"video_indicator_tokens"
]
=
indicator_tokens
processed_outputs
[
"video_indicator_tokens"
]
=
torch
.
tensor
(
indicator_tokens
)
if
"images"
in
mm_data
:
visual_indicators
=
[
hf_processor
.
construct_visual_indicators
((
1
,
1
,
1
),
False
)
...
...
@@ -391,7 +394,7 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo])
for
indicator
in
visual_indicators
]
processed_outputs
[
"indicator_tokens"
]
=
indicator_tokens
processed_outputs
[
"indicator_tokens"
]
=
torch
.
tensor
(
indicator_tokens
)
return
processed_outputs
def
_apply_hf_processor_tokens_only
(
...
...
@@ -405,7 +408,14 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo])
hf_inputs
:
BatchFeature
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
)
->
Mapping
[
str
,
MultiModalFieldConfig
]:
return
_ovis2_5_field_config
()
return
dict
(
pixel_values
=
MultiModalFieldConfig
.
batched
(
"image"
),
grids
=
MultiModalFieldConfig
.
batched
(
"image"
),
indicator_tokens
=
MultiModalFieldConfig
.
batched
(
"image"
),
video_pixel_values
=
MultiModalFieldConfig
.
batched
(
"video"
),
video_indicator_tokens
=
MultiModalFieldConfig
.
batched
(
"video"
),
video_grids
=
MultiModalFieldConfig
.
batched
(
"video"
),
)
def
_get_prompt_updates
(
self
,
...
...
@@ -441,6 +451,8 @@ class Ovis2_5MultiModalProcessor(BaseMultiModalProcessor[Ovis2_5ProcessingInfo])
dummy_inputs
=
Ovis2_5DummyInputsBuilder
,
)
class
Ovis2_5
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
merge_by_field_config
=
True
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
...
...
@@ -470,7 +482,7 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
def
_parse_and_validate_image_input
(
self
,
**
kwargs
:
object
)
->
Optional
[
OvisImagePatchInputs
]:
)
->
Optional
[
Ovis
2_5
ImagePatchInputs
]:
pixel_values
=
kwargs
.
pop
(
"pixel_values"
,
None
)
indicator_tokens
=
kwargs
.
pop
(
"indicator_tokens"
,
None
)
grids
=
kwargs
.
pop
(
"grids"
,
None
)
...
...
@@ -489,22 +501,22 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
f
"Got type:
{
type
(
indicator_tokens
)
}
"
)
return
OvisImagePatchInputs
(
return
Ovis
2_5
ImagePatchInputs
(
type
=
"image_patches"
,
flat_data
=
flatten_bn
(
flatten_bn
(
pixel_values
)
,
concat
=
True
),
patches_per_i
mage
=
[
flat_data
=
flatten_bn
(
pixel_values
,
concat
=
True
),
patches_per_i
tem
=
[
x
.
shape
[
0
]
//
(
self
.
config
.
vit_config
.
hidden_stride
**
2
)
for
x
in
flatten_bn
(
pixel_values
)
for
x
in
pixel_values
],
indicator_tokens
=
flatten_bn
(
flatten_bn
(
indicator_tokens
)
,
concat
=
True
),
grids
=
flatten_bn
(
flatten_bn
(
grids
)
,
concat
=
True
),
indicator_tokens
=
flatten_bn
(
indicator_tokens
,
concat
=
True
),
grids
=
flatten_bn
(
grids
,
concat
=
True
),
)
raise
AssertionError
(
"This line should be unreachable."
)
def
_parse_and_validate_video_input
(
self
,
**
kwargs
:
object
)
->
Optional
[
Ovis
Image
PatchInputs
]:
)
->
Optional
[
Ovis
2_5Video
PatchInputs
]:
pixel_values
=
kwargs
.
pop
(
"video_pixel_values"
,
None
)
indicator_tokens
=
kwargs
.
pop
(
"video_indicator_tokens"
,
None
)
grids
=
kwargs
.
pop
(
"video_grids"
,
None
)
...
...
@@ -523,26 +535,26 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
f
"Got type:
{
type
(
indicator_tokens
)
}
"
)
return
OvisVideoPatchInputs
(
return
Ovis
2_5
VideoPatchInputs
(
type
=
"video_patches"
,
flat_data
=
flatten_bn
(
flatten_bn
(
pixel_values
)
,
concat
=
True
),
patches_per_i
mage
=
[
flat_data
=
flatten_bn
(
pixel_values
,
concat
=
True
),
patches_per_i
tem
=
[
x
.
shape
[
0
]
//
(
self
.
config
.
vit_config
.
hidden_stride
**
2
)
for
x
in
flatten_bn
(
pixel_values
)
for
x
in
pixel_values
],
indicator_tokens
=
flatten_bn
(
flatten_bn
(
indicator_tokens
)
,
concat
=
True
),
grids
=
flatten_bn
(
flatten_bn
(
grids
)
,
concat
=
True
),
indicator_tokens
=
flatten_bn
(
indicator_tokens
,
concat
=
True
),
grids
=
flatten_bn
(
grids
,
concat
=
True
),
)
raise
AssertionError
(
"This line should be unreachable."
)
def
_process_
image
_input
(
self
,
image
_input
:
Union
[
OvisImagePatchInputs
,
OvisVideoPatchInputs
]
def
_process_
visual
_input
(
self
,
visual
_input
:
Union
[
Ovis
2_5
ImagePatchInputs
,
Ovis
2_5
VideoPatchInputs
]
)
->
MultiModalEmbeddings
:
image_patches_flat
=
image
_input
[
"flat_data"
]
patches_per_image
=
image
_input
[
"patches_per_i
mage
"
]
indicator_tokens
=
image
_input
[
"indicator_tokens"
]
grid_thws
=
image
_input
[
"grids"
]
image_patches_flat
=
visual
_input
[
"flat_data"
]
patches_per_image
=
visual
_input
[
"patches_per_i
tem
"
]
indicator_tokens
=
visual
_input
[
"indicator_tokens"
]
grid_thws
=
visual
_input
[
"grids"
]
indicator_per_image
=
list
(
map
(
lambda
x
:
2
if
x
>
1
else
x
+
2
,
patches_per_image
)
...
...
@@ -604,11 +616,11 @@ class Ovis2_5(nn.Module, SupportsMultiModal, SupportsPP):
for
modality
in
modalities
:
if
modality
==
"images"
:
image_input
=
modalities
[
"images"
]
vision_embeddings
=
self
.
_process_
image
_input
(
image_input
)
vision_embeddings
=
self
.
_process_
visual
_input
(
image_input
)
multimodal_embeddings
+=
vision_embeddings
if
modality
==
"videos"
:
video_input
=
modalities
[
"videos"
]
video_embeddings
=
self
.
_process_
image
_input
(
video_input
)
video_embeddings
=
self
.
_process_
visual
_input
(
video_input
)
multimodal_embeddings
+=
video_embeddings
return
multimodal_embeddings
...
...
vllm/transformers_utils/processors/ovis.py
View file @
08d26a1b
...
...
@@ -408,7 +408,7 @@ class OvisProcessor(ProcessorMixin):
crops
.
insert
(
0
,
image
)
pixel_values
=
torch
.
cat
([
_preprocess
(
crop
,
side
)
for
crop
in
crops
],
dim
=
0
)
image_placeholders
=
self
.
construct_image_placeholders
(
grid
)
return
pixel_values
,
image_placeholders
,
grid
return
torch
.
tensor
(
pixel_values
)
,
image_placeholders
,
torch
.
tensor
(
grid
)
def
batch_decode
(
self
,
*
args
,
**
kwargs
):
"""
...
...
Write
Preview
Markdown
is supported
0%
Try again or attach a new file.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment