Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f192ca90
Unverified
Commit
f192ca90
authored
May 01, 2025
by
Michael Goin
Committed by
GitHub
May 01, 2025
Browse files
Fix PixtralHF missing spatial_merge_size (#17571)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
f89d0e11
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
18 additions
and
25 deletions
+18
-25
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+2
-3
vllm/model_executor/models/mistral3.py
vllm/model_executor/models/mistral3.py
+2
-6
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/pixtral.py
+3
-2
vllm/model_executor/models/vision.py
vllm/model_executor/models/vision.py
+11
-14
No files found.
vllm/model_executor/models/llava.py
View file @
f192ca90
...
@@ -354,9 +354,8 @@ class PixtralHFMultiModalProcessor(
...
@@ -354,9 +354,8 @@ class PixtralHFMultiModalProcessor(
image_token_id
=
hf_config
.
image_token_index
image_token_id
=
hf_config
.
image_token_index
image_end_id
=
vocab
[
processor
.
image_end_token
]
image_end_id
=
vocab
[
processor
.
image_end_token
]
vision_config
=
hf_config
.
vision_config
assert
isinstance
(
hf_config
.
vision_config
,
PixtralVisionConfig
)
assert
isinstance
(
vision_config
,
PixtralVisionConfig
)
encoder_info
=
PixtralHFEncoderInfo
(
hf_config
)
encoder_info
=
PixtralHFEncoderInfo
(
vision_config
)
def
get_replacement
(
item_idx
:
int
):
def
get_replacement
(
item_idx
:
int
):
images
=
mm_items
.
get_items
(
"image"
,
ImageProcessorItems
)
images
=
mm_items
.
get_items
(
"image"
,
ImageProcessorItems
)
...
...
vllm/model_executor/models/mistral3.py
View file @
f192ca90
...
@@ -272,12 +272,8 @@ class Mistral3MultiModalProcessor(
...
@@ -272,12 +272,8 @@ class Mistral3MultiModalProcessor(
image_token_id
=
hf_config
.
image_token_index
image_token_id
=
hf_config
.
image_token_index
image_end_id
=
vocab
[
processor
.
image_end_token
]
image_end_id
=
vocab
[
processor
.
image_end_token
]
vision_config
=
hf_config
.
vision_config
assert
isinstance
(
hf_config
.
vision_config
,
PixtralVisionConfig
)
assert
isinstance
(
vision_config
,
PixtralVisionConfig
)
encoder_info
=
PixtralHFEncoderInfo
(
hf_config
)
# Need to sneak in spatial_merge_size for Mistral3
vision_config
.
spatial_merge_size
=
getattr
(
hf_config
,
"spatial_merge_size"
,
1
)
encoder_info
=
PixtralHFEncoderInfo
(
vision_config
)
def
get_replacement
(
item_idx
:
int
):
def
get_replacement
(
item_idx
:
int
):
images
=
mm_items
.
get_items
(
"image"
,
ImageProcessorItems
)
images
=
mm_items
.
get_items
(
"image"
,
ImageProcessorItems
)
...
...
vllm/model_executor/models/pixtral.py
View file @
f192ca90
...
@@ -916,8 +916,9 @@ class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]):
...
@@ -916,8 +916,9 @@ class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]):
return
self
.
vision_config
.
image_size
return
self
.
vision_config
.
image_size
def
get_patch_size
(
self
)
->
int
:
def
get_patch_size
(
self
)
->
int
:
return
(
self
.
vision_config
.
patch_size
*
# spatial_merge_size is needed for Mistral3
self
.
vision_config
.
spatial_merge_size
)
spatial_merge_size
=
getattr
(
self
.
hf_config
,
"spatial_merge_size"
,
1
)
return
self
.
vision_config
.
patch_size
*
spatial_merge_size
def
get_patch_grid_length
(
self
)
->
int
:
def
get_patch_grid_length
(
self
)
->
int
:
image_size
,
patch_size
=
self
.
get_image_size
(),
self
.
get_patch_size
()
image_size
,
patch_size
=
self
.
get_image_size
(),
self
.
get_patch_size
()
...
...
vllm/model_executor/models/vision.py
View file @
f192ca90
...
@@ -19,10 +19,11 @@ _C = TypeVar("_C", bound=PretrainedConfig)
...
@@ -19,10 +19,11 @@ _C = TypeVar("_C", bound=PretrainedConfig)
class
VisionEncoderInfo
(
ABC
,
Generic
[
_C
]):
class
VisionEncoderInfo
(
ABC
,
Generic
[
_C
]):
def
__init__
(
self
,
vision
_config
:
_C
)
->
None
:
def
__init__
(
self
,
hf
_config
:
_C
)
->
None
:
super
().
__init__
()
super
().
__init__
()
self
.
vision_config
=
vision_config
self
.
hf_config
=
hf_config
self
.
vision_config
=
hf_config
.
vision_config
@
abstractmethod
@
abstractmethod
def
get_num_image_tokens
(
def
get_num_image_tokens
(
...
@@ -57,18 +58,14 @@ def get_vision_encoder_info(
...
@@ -57,18 +58,14 @@ def get_vision_encoder_info(
from
.pixtral
import
PixtralHFEncoderInfo
,
PixtralVisionConfig
from
.pixtral
import
PixtralHFEncoderInfo
,
PixtralVisionConfig
from
.siglip
import
SiglipEncoderInfo
,
SiglipVisionConfig
from
.siglip
import
SiglipEncoderInfo
,
SiglipVisionConfig
vision_config
=
hf_config
.
vision_config
if
isinstance
(
hf_config
.
vision_config
,
CLIPVisionConfig
):
if
isinstance
(
vision_config
,
CLIPVisionConfig
):
return
CLIPEncoderInfo
(
hf_config
)
return
CLIPEncoderInfo
(
vision_config
)
if
isinstance
(
hf_config
.
vision_config
,
PixtralVisionConfig
):
if
isinstance
(
vision_config
,
PixtralVisionConfig
):
return
PixtralHFEncoderInfo
(
hf_config
)
# Need to sneak in spatial_merge_size for Mistral3
if
isinstance
(
hf_config
.
vision_config
,
SiglipVisionConfig
):
vision_config
.
spatial_merge_size
=
getattr
(
hf_config
,
return
SiglipEncoderInfo
(
hf_config
)
"spatial_merge_size"
,
1
)
return
PixtralHFEncoderInfo
(
vision_config
)
msg
=
f
"Unsupported vision config:
{
type
(
hf_config
.
vision_config
)
}
"
if
isinstance
(
vision_config
,
SiglipVisionConfig
):
return
SiglipEncoderInfo
(
vision_config
)
msg
=
f
"Unsupported vision config:
{
type
(
vision_config
)
}
"
raise
NotImplementedError
(
msg
)
raise
NotImplementedError
(
msg
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment