Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6ff51862
Unverified
Commit
6ff51862
authored
Feb 25, 2025
by
Isotr0py
Committed by
GitHub
Feb 25, 2025
Browse files
[Bugfix] Fix deepseek-vl2 inference with more than 2 images (#13818)
parent
fa820741
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing 2 changed files with 46 additions and 10 deletions
+46
-10
vllm/model_executor/models/deepseek_vl2.py
vllm/model_executor/models/deepseek_vl2.py
+42
-8
vllm/model_executor/models/h2ovl.py
vllm/model_executor/models/h2ovl.py
+4
-2
No files found.
vllm/model_executor/models/deepseek_vl2.py
View file @
6ff51862
...
...
@@ -25,7 +25,8 @@ from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
from
vllm.multimodal.parse
import
(
ImageEmbeddingItems
,
ImageProcessorItems
,
ImageSize
,
MultiModalDataItems
)
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
BaseProcessingInfo
,
PromptReplacement
)
BaseProcessingInfo
,
ProcessingCache
,
PromptReplacement
)
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs.deepseek_vl2
import
(
DeepseekVLV2Config
,
...
...
@@ -138,18 +139,24 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
    """Return the per-modality limits supported by this model.

    Returns:
        A mapping containing the single modality ``"image"`` with the
        value ``None``.
    """
    # NOTE(review): ``None`` presumably means "no fixed upper bound" for the
    # modality -- confirm against the base processing-info contract.
    supported_limits = {}
    supported_limits["image"] = None
    return supported_limits
def
get_num_image_tokens
(
self
,
*
,
image_width
:
int
,
image_height
:
int
)
->
int
:
def
get_num_image_tokens
(
self
,
*
,
image_width
:
int
,
image_height
:
int
,
cropping
:
bool
=
True
)
->
int
:
hf_processor
=
self
.
get_hf_processor
()
image_size
=
hf_processor
.
image_size
patch_size
=
hf_processor
.
patch_size
downsample_ratio
=
hf_processor
.
downsample_ratio
if
cropping
:
best_width
,
best_height
=
hf_processor
.
select_best_resolution
(
(
image_width
,
image_height
))
num_width_tiles
,
num_height_tiles
=
(
best_width
//
image_size
,
best_height
//
image_size
)
else
:
num_width_tiles
=
num_height_tiles
=
1
h
=
w
=
math
.
ceil
((
image_size
//
patch_size
)
/
downsample_ratio
)
global_views_tokens
=
h
*
(
w
+
1
)
...
...
@@ -169,10 +176,12 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
seq_len
:
int
,
mm_counts
:
Mapping
[
str
,
int
],
)
->
Mapping
[
str
,
int
]:
num_images
=
mm_counts
.
get
(
"image"
,
0
)
max_image_size
=
self
.
get_image_size_with_most_features
()
max_image_tokens
=
self
.
get_num_image_tokens
(
image_height
=
max_image_size
.
height
,
image_width
=
max_image_size
.
width
)
image_width
=
max_image_size
.
width
,
cropping
=
num_images
<=
2
)
return
{
"image"
:
max_image_tokens
}
...
...
@@ -207,6 +216,30 @@ class DeepseekVL2DummyInputsBuilder(
class
DeepseekVL2MultiModalProcessor
(
BaseMultiModalProcessor
[
DeepseekVL2ProcessingInfo
]):
def __init__(
    self,
    info: DeepseekVL2ProcessingInfo,
    dummy_inputs: "BaseDummyInputsBuilder[DeepseekVL2ProcessingInfo]",
    *,
    cache: Optional[ProcessingCache] = None,
    enable_sanity_checks: bool = True,
) -> None:
    """Set up the processor, dropping the processing cache when unsafe.

    All arguments are forwarded unchanged to the parent initializer.
    Afterwards, if a cache is present and the configured per-prompt image
    limit exceeds 2, the cache is discarded and a one-time warning is
    logged.

    Args:
        info: Processing info object for this model.
        dummy_inputs: Dummy-inputs builder passed through to the parent.
        cache: Optional processing cache passed through to the parent.
        enable_sanity_checks: Passed through to the parent.
    """
    super().__init__(
        info,
        dummy_inputs,
        cache=cache,
        enable_sanity_checks=enable_sanity_checks,
    )

    mm_limit = self.info.ctx.model_config.multimodal_config.limit_per_prompt
    # Look the limit up with ``.get`` so that a configuration without an
    # explicit "image" entry does not raise ``KeyError``; a missing entry
    # is treated as not exceeding the threshold, so the cache is kept.
    image_limit = mm_limit.get("image", 0)
    if self.cache is not None and image_limit > 2:
        # The processor output depends on the number of images passed,
        # making it incompatible with processing cache which is supposed
        # to be invariant of how many images are passed per prompt
        self.cache = None
        logger.warning_once(
            f"{type(self).__name__} does not support processing cache with "
            "image limit larger than 2.")
def
_call_hf_processor
(
self
,
prompt
:
str
,
...
...
@@ -271,6 +304,7 @@ class DeepseekVL2MultiModalProcessor(
num_image_tokens
=
self
.
info
.
get_num_image_tokens
(
image_width
=
image_size
.
width
,
image_height
=
image_size
.
height
,
cropping
=
len
(
images
)
<=
2
,
)
return
[
image_token_id
]
*
num_image_tokens
...
...
vllm/model_executor/models/h2ovl.py
View file @
6ff51862
...
...
@@ -477,13 +477,15 @@ class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo]
enable_sanity_checks
=
enable_sanity_checks
,
)
if
self
.
cache
is
not
None
:
mm_limit
=
self
.
info
.
ctx
.
model_config
.
multimodal_config
.
limit_per_prompt
if
self
.
cache
is
not
None
and
mm_limit
[
"image"
]
>=
2
:
# The processor output depends on the number of images passed,
# making it incompatible with processing cache which is supposed
# to be invariant of how many images are passed per prompt
self
.
cache
=
None
logger
.
warning_once
(
f
"
{
type
(
self
).
__name__
}
does not support processing cache."
)
f
"
{
type
(
self
).
__name__
}
does not support processing cache with "
"multi-image support enabled."
)
def
_get_prompt_replacements
(
self
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment