Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8693e47e
Unverified
Commit
8693e47e
authored
Mar 28, 2025
by
Cyrus Leung
Committed by
GitHub
Mar 28, 2025
Browse files
[Bugfix] Fix `mm_hashes` forgetting to be passed (#15668)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
cec8c7d7
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
15 additions
and
10 deletions
+15
-10
vllm/inputs/preprocess.py
vllm/inputs/preprocess.py
+2
-0
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+2
-0
vllm/model_executor/models/mllama.py
vllm/model_executor/models/mllama.py
+1
-1
vllm/model_executor/models/phi4mm.py
vllm/model_executor/models/phi4mm.py
+8
-8
vllm/model_executor/models/prithvi_geospatial_mae.py
vllm/model_executor/models/prithvi_geospatial_mae.py
+1
-0
vllm/multimodal/inputs.py
vllm/multimodal/inputs.py
+1
-1
No files found.
vllm/inputs/preprocess.py
View file @
8693e47e
...
...
@@ -528,6 +528,7 @@ class InputPreprocessor:
prompt_token_ids
=
decoder_inputs_to_override
[
"prompt_token_ids"
],
mm_kwargs
=
inputs
[
"mm_kwargs"
],
mm_hashes
=
inputs
[
"mm_hashes"
],
mm_placeholders
=
inputs
[
"mm_placeholders"
],
)
else
:
...
...
@@ -536,6 +537,7 @@ class InputPreprocessor:
prompt
=
inputs
[
"prompt"
],
prompt_token_ids
=
inputs
[
"prompt_token_ids"
],
mm_kwargs
=
inputs
[
"mm_kwargs"
],
mm_hashes
=
inputs
[
"mm_hashes"
],
mm_placeholders
=
inputs
[
"mm_placeholders"
],
)
elif
inputs
[
"type"
]
==
"token"
:
...
...
vllm/model_executor/models/llava.py
View file @
8693e47e
...
...
@@ -868,6 +868,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
mm_items
=
self
.
_to_mm_items
(
mm_data
)
mm_item_counts
=
mm_items
.
get_all_counts
()
mm_kwargs
=
result
[
"mm_kwargs"
]
mm_hashes
=
result
[
"mm_hashes"
]
# We reimplement the functionality of MLlavaProcessor from
# https://github.com/TIGER-AI-Lab/Mantis.git
...
...
@@ -916,6 +917,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
prompt
=
prompt
,
prompt_token_ids
=
prompt_ids
,
mm_kwargs
=
mm_kwargs
,
mm_hashes
=
mm_hashes
,
mm_placeholders
=
mm_placeholder_ranges
,
)
...
...
vllm/model_executor/models/mllama.py
View file @
8693e47e
...
...
@@ -1378,7 +1378,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
# Because attn_metadata.encoder_seq_lens only counts the last
# group of images for each sample, which is used to cheat the
# block manager to allocate blocks for those images only.
# See
input_processor_for_mllama()
for more details.
# See
MllamaMultiModalProcessor
for more details.
num_tiles_tensor
=
kwargs
.
pop
(
"num_tiles"
)
num_tiles
=
[
t
.
tolist
()
for
t
in
num_tiles_tensor
]
num_tokens_per_tile
=
calc_token_per_chunk
(
self
.
image_size
)
...
...
vllm/model_executor/models/phi4mm.py
View file @
8693e47e
...
...
@@ -28,7 +28,7 @@ from vllm.model_executor.models.llama import LlamaModel
from
vllm.model_executor.models.module_mapping
import
MultiModelKeys
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
MultiModal
Input
s
,
NestedTensors
from
vllm.multimodal.inputs
import
MultiModal
Kwarg
s
,
NestedTensors
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
from
vllm.transformers_utils.tokenizer
import
cached_tokenizer_from_config
...
...
@@ -1319,9 +1319,9 @@ def dummy_data_for_phi4mm(ctx: InputContext, seq_len: int,
def
input_mapper_for_phi4mm_audio
(
ctx
:
InputContext
,
data
:
object
)
->
MultiModal
Input
s
:
data
:
object
)
->
MultiModal
Kwarg
s
:
"""
This function is used to create the MultiModal
Input
s for the Phi4MM
This function is used to create the MultiModal
Kwarg
s for the Phi4MM
(audio) model.
Specifically, for audio, we extract the audio features from the sound
file and create pairs of audio features and audio embed lengths (the
...
...
@@ -1338,13 +1338,13 @@ def input_mapper_for_phi4mm_audio(ctx: InputContext,
data (object): Audio data.
Returns:
MultiModal
Input
s: Multi-modal inputs.
MultiModal
Kwarg
s: Multi-modal inputs.
"""
if
not
isinstance
(
data
,
list
):
data
=
[
data
]
if
len
(
data
)
==
0
:
return
MultiModal
Input
s
()
return
MultiModal
Kwarg
s
()
audio_features
=
[]
for
audio_input
in
data
:
...
...
@@ -1365,7 +1365,7 @@ def input_mapper_for_phi4mm_audio(ctx: InputContext,
[
single_audio_embed_size
],
)
audio_features
.
append
(
single_audio_feature_audio_len_pair
)
return
MultiModal
Input
s
({
"audio_features"
:
audio_features
})
return
MultiModal
Kwarg
s
({
"audio_features"
:
audio_features
})
def
input_mapper_for_phi4mm_image
(
ctx
:
InputContext
,
data
:
object
):
...
...
@@ -1373,7 +1373,7 @@ def input_mapper_for_phi4mm_image(ctx: InputContext, data: object):
data
=
[
data
]
# data: list of PIL images
if
len
(
data
)
==
0
:
return
MultiModal
Input
s
()
return
MultiModal
Kwarg
s
()
hf_config
=
ctx
.
get_hf_config
()
vision_encoder_name
=
hf_config
.
img_processor
if
vision_encoder_name
is
None
:
...
...
@@ -1385,7 +1385,7 @@ def input_mapper_for_phi4mm_image(ctx: InputContext, data: object):
image_input_dict
=
preprocess
(
data
,
dynamic_hd_size
,
vit_image_size
,
vit_patch_size
)
return
MultiModal
Input
s
({
return
MultiModal
Kwarg
s
({
"pixel_values"
:
image_input_dict
[
"pixel_values"
],
"image_sizes"
:
...
...
vllm/model_executor/models/prithvi_geospatial_mae.py
View file @
8693e47e
...
...
@@ -105,6 +105,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
prompt
=
prompt
,
prompt_token_ids
=
[
1
],
mm_kwargs
=
MultiModalKwargs
(
mm_kwargs
),
mm_hashes
=
None
,
mm_placeholders
=
{},
)
...
...
vllm/multimodal/inputs.py
View file @
8693e47e
...
...
@@ -743,7 +743,7 @@ class MultiModalInputs(TypedDict):
mm_kwargs
:
MultiModalKwargs
"""Keyword arguments to be directly passed to the model after batching."""
mm_hashes
:
NotRequired
[
Optional
[
"MultiModalHashDict"
]
]
mm_hashes
:
Optional
[
"MultiModalHashDict"
]
"""The hashes of the multi-modal data."""
mm_placeholders
:
MultiModalPlaceholderDict
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment