Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
68fcd3fa
Unverified
Commit
68fcd3fa
authored
Aug 20, 2025
by
Cyrus Leung
Committed by
GitHub
Aug 20, 2025
Browse files
[Bugfix] Ensure correctness of Cohere2Vision processing (#23245)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
83e69a09
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
56 additions
and
19 deletions
+56
-19
tests/models/multimodal/processing/test_common.py
tests/models/multimodal/processing/test_common.py
+1
-0
vllm/model_executor/models/aya_vision.py
vllm/model_executor/models/aya_vision.py
+1
-2
vllm/model_executor/models/cohere2_vision.py
vllm/model_executor/models/cohere2_vision.py
+54
-17
No files found.
tests/models/multimodal/processing/test_common.py
View file @
68fcd3fa
...
@@ -268,6 +268,7 @@ def _test_processing_correctness_one(
...
@@ -268,6 +268,7 @@ def _test_processing_correctness_one(
"CohereForAI/aya-vision-8b"
,
"CohereForAI/aya-vision-8b"
,
"Salesforce/blip2-opt-2.7b"
,
"Salesforce/blip2-opt-2.7b"
,
"facebook/chameleon-7b"
,
"facebook/chameleon-7b"
,
"CohereLabs/command-a-vision-07-2025"
,
"deepseek-ai/deepseek-vl2-tiny"
,
"deepseek-ai/deepseek-vl2-tiny"
,
"microsoft/Florence-2-base"
,
"microsoft/Florence-2-base"
,
"adept/fuyu-8b"
,
"adept/fuyu-8b"
,
...
...
vllm/model_executor/models/aya_vision.py
View file @
68fcd3fa
...
@@ -250,8 +250,7 @@ class AyaVisionMultiModalProcessor(
...
@@ -250,8 +250,7 @@ class AyaVisionMultiModalProcessor(
image_processor
=
hf_processor
.
image_processor
image_processor
=
hf_processor
.
image_processor
def
get_replacement
(
item_idx
:
int
):
def
get_replacement
(
item_idx
:
int
):
images
:
ImageProcessorItems
=
mm_items
.
get
(
"image"
,
images
=
mm_items
.
get_items
(
"image"
,
ImageProcessorItems
)
ImageProcessorItems
)
image_size
:
ImageSize
=
images
.
get_image_size
(
item_idx
)
image_size
:
ImageSize
=
images
.
get_image_size
(
item_idx
)
num_patches
=
self
.
info
.
get_num_patches
(
num_patches
=
self
.
info
.
get_num_patches
(
image_width
=
image_size
.
width
,
image_width
=
image_size
.
width
,
...
...
vllm/model_executor/models/cohere2_vision.py
View file @
68fcd3fa
...
@@ -10,6 +10,8 @@ import torch
...
@@ -10,6 +10,8 @@ import torch
from
torch
import
nn
from
torch
import
nn
from
transformers
import
BatchFeature
,
PretrainedConfig
from
transformers
import
BatchFeature
,
PretrainedConfig
from
transformers.models.cohere2_vision
import
Cohere2VisionConfig
from
transformers.models.cohere2_vision
import
Cohere2VisionConfig
from
transformers.models.cohere2_vision.image_processing_cohere2_vision_fast
import
(
# noqa: E501
get_optimal_tiled_canvas
)
from
transformers.models.cohere2_vision.processing_cohere2_vision
import
(
from
transformers.models.cohere2_vision.processing_cohere2_vision
import
(
Cohere2VisionProcessor
)
Cohere2VisionProcessor
)
...
@@ -150,14 +152,46 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo):
...
@@ -150,14 +152,46 @@ class Cohere2VisionProcessingInfo(BaseProcessingInfo):
max_patches
=
image_processor
.
max_patches
max_patches
=
image_processor
.
max_patches
return
ImageSize
(
height
=
height
*
max_patches
,
width
=
width
)
return
ImageSize
(
height
=
height
*
max_patches
,
width
=
width
)
def
get_num_patches
(
self
,
image_width
:
int
,
image_height
:
int
)
->
int
:
def
get_num_patches
(
self
,
*
,
image_width
:
int
,
image_height
:
int
,
processor
:
Optional
[
Cohere2VisionProcessor
],
)
->
int
:
"""
"""
Calculate the number of image patches for a given image.
Calculate the number of image patches for a given image.
Uses the HF processor to determine the actual number of patches.
Uses the HF processor to determine the actual number of patches.
"""
"""
return
self
.
get_hf_processor
(
if
processor
is
None
:
).
image_processor
.
get_number_of_image_patches
(
image_height
,
processor
=
self
.
get_hf_processor
()
image_width
,
{})
image_processor
=
processor
.
image_processor
# The current implementation of get_number_of_image_patches
# is incorrect, so we patch it here.
# return image_processor.get_number_of_image_patches(image_height,
# image_width, {})
min_patches
=
image_processor
.
min_patches
max_patches
=
image_processor
.
max_patches
patch_size
=
image_processor
.
size
crop_to_patches
=
image_processor
.
crop_to_patches
if
not
crop_to_patches
:
return
1
num_columns
,
num_rows
=
get_optimal_tiled_canvas
(
(
image_height
,
image_width
),
(
patch_size
[
"height"
],
patch_size
[
"width"
]),
min_patches
,
max_patches
,
)
num_patches
=
num_columns
*
num_rows
if
num_patches
>
1
:
num_patches
+=
1
# Thumbnail image
return
num_patches
class
Cohere2VisionDummyInputsBuilder
(
class
Cohere2VisionDummyInputsBuilder
(
...
@@ -208,6 +242,8 @@ class Cohere2VisionMultiModalProcessor(
...
@@ -208,6 +242,8 @@ class Cohere2VisionMultiModalProcessor(
# Ensure num_patches is available for proper tensor splitting
# Ensure num_patches is available for proper tensor splitting
if
"num_patches"
not
in
processed_outputs
and
(
if
"num_patches"
not
in
processed_outputs
and
(
images
:
=
mm_data
.
get
(
"images"
))
is
not
None
:
images
:
=
mm_data
.
get
(
"images"
))
is
not
None
:
hf_processor
=
self
.
info
.
get_hf_processor
(
**
mm_kwargs
)
# Fallback calculation if HF processor didn't provide num_patches
# Fallback calculation if HF processor didn't provide num_patches
parsed_images
=
self
.
_get_data_parser
().
parse_mm_data
({
parsed_images
=
self
.
_get_data_parser
().
parse_mm_data
({
"image"
:
"image"
:
...
@@ -217,8 +253,9 @@ class Cohere2VisionMultiModalProcessor(
...
@@ -217,8 +253,9 @@ class Cohere2VisionMultiModalProcessor(
num_patches
=
[
num_patches
=
[
self
.
info
.
get_num_patches
(
self
.
info
.
get_num_patches
(
image_width
=
parsed_images
.
get_image_size
(
i
).
width
,
image_width
=
parsed_images
.
get_image_size
(
i
).
width
,
image_height
=
parsed_images
.
get_image_size
(
i
).
height
)
image_height
=
parsed_images
.
get_image_size
(
i
).
height
,
for
i
in
range
(
len
(
parsed_images
))
processor
=
hf_processor
,
)
for
i
in
range
(
len
(
parsed_images
))
]
]
processed_outputs
[
"num_patches"
]
=
torch
.
tensor
(
num_patches
)
processed_outputs
[
"num_patches"
]
=
torch
.
tensor
(
num_patches
)
...
@@ -245,25 +282,25 @@ class Cohere2VisionMultiModalProcessor(
...
@@ -245,25 +282,25 @@ class Cohere2VisionMultiModalProcessor(
)
->
Sequence
[
PromptUpdate
]:
)
->
Sequence
[
PromptUpdate
]:
hf_processor
=
self
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
hf_processor
=
self
.
info
.
get_hf_processor
(
**
hf_processor_mm_kwargs
)
image_token
=
hf_processor
.
image_token
image_token
=
hf_processor
.
image_token
img_tokens_per_tile
=
int
(
hf_processor
.
patch_size
**
2
)
img_line_break_token
=
hf_processor
.
img_line_break_token
img_line_break_token
=
hf_processor
.
img_line_break_token
boi_token
=
hf_processor
.
boi_token
boi_token
=
hf_processor
.
boi_token
eoi_token
=
hf_processor
.
eoi_token
eoi_token
=
hf_processor
.
eoi_token
def
get_replacement
(
item_idx
:
int
):
def
get_replacement
(
item_idx
:
int
):
images
:
ImageProcessorItems
=
mm_items
.
get
(
"image"
,
images
=
mm_items
.
get_items
(
"image"
,
ImageProcessorItems
)
ImageProcessorItems
)
image_size
:
ImageSize
=
images
.
get_image_size
(
item_idx
)
image_size
:
ImageSize
=
images
.
get_image_size
(
item_idx
)
num_patches
=
self
.
info
.
get_num_patches
(
image_size
.
height
,
num_patches
=
self
.
info
.
get_num_patches
(
image_size
.
width
)
image_width
=
image_size
.
width
,
img_tokens_per_tile
=
int
(
hf_processor
.
patch_size
**
2
)
image_height
=
image_size
.
height
,
single_tile_tokens
=
image_token
*
img_tokens_per_tile
+
\
processor
=
hf_processor
,
img_line_break_token
)
img_string
=
f
"
{
boi_token
}
\
patch_tokens
=
(
image_token
*
img_tokens_per_tile
+
{
single_tile_tokens
*
num_patches
}
\
img_line_break_token
)
{
eoi_token
}
"
repl
=
f
"
{
boi_token
}{
patch_tokens
*
num_patches
}
{
eoi_token
}
"
return
PromptUpdateDetails
.
select_text
(
img_string
,
image_token
)
return
PromptUpdateDetails
.
select_text
(
repl
,
image_token
)
return
[
return
[
PromptReplacement
(
PromptReplacement
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment