Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
696259ca
Unverified
Commit
696259ca
authored
May 27, 2025
by
Cyrus Leung
Committed by
GitHub
May 27, 2025
Browse files
[Core] Automatically cast multi-modal input dtype (#18756)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
6b6d4961
Changes
16
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
91 additions
and
44 deletions
+91
-44
vllm/model_executor/models/deepseek_vl2.py
vllm/model_executor/models/deepseek_vl2.py
+1
-3
vllm/model_executor/models/gemma3_mm.py
vllm/model_executor/models/gemma3_mm.py
+0
-5
vllm/multimodal/inputs.py
vllm/multimodal/inputs.py
+7
-1
vllm/spec_decode/draft_model_runner.py
vllm/spec_decode/draft_model_runner.py
+5
-2
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+9
-3
vllm/v1/worker/tpu_model_runner.py
vllm/v1/worker/tpu_model_runner.py
+10
-4
vllm/worker/cpu_enc_dec_model_runner.py
vllm/worker/cpu_enc_dec_model_runner.py
+5
-2
vllm/worker/cpu_model_runner.py
vllm/worker/cpu_model_runner.py
+4
-1
vllm/worker/cpu_pooling_model_runner.py
vllm/worker/cpu_pooling_model_runner.py
+5
-2
vllm/worker/enc_dec_model_runner.py
vllm/worker/enc_dec_model_runner.py
+7
-3
vllm/worker/model_runner.py
vllm/worker/model_runner.py
+5
-2
vllm/worker/multi_step_neuron_model_runner.py
vllm/worker/multi_step_neuron_model_runner.py
+5
-2
vllm/worker/multi_step_neuronx_distributed_model_runner.py
vllm/worker/multi_step_neuronx_distributed_model_runner.py
+5
-2
vllm/worker/neuron_model_runner.py
vllm/worker/neuron_model_runner.py
+10
-6
vllm/worker/pooling_model_runner.py
vllm/worker/pooling_model_runner.py
+7
-3
vllm/worker/xpu_model_runner.py
vllm/worker/xpu_model_runner.py
+6
-3
No files found.
vllm/model_executor/models/deepseek_vl2.py
View file @
696259ca
...
...
@@ -210,9 +210,7 @@ class DeepseekVL2MultiModalProcessor(
dict
(
prompt
=
prompt
,
**
mm_data
),
mm_kwargs
,
)
target_dtype
=
self
.
info
.
ctx
.
model_config
.
dtype
pixel_values
=
processed_outputs
.
pop
(
"pixel_values"
).
to
(
target_dtype
)
pixel_values
=
processed_outputs
[
"pixel_values"
]
# split pixel values into patches corresponding to each image
images_spatial_crop
=
processed_outputs
[
"images_spatial_crop"
]
patches_per_image
=
[
...
...
vllm/model_executor/models/gemma3_mm.py
View file @
696259ca
...
...
@@ -263,11 +263,6 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
mm_data
,
mm_kwargs
,
)
if
"pixel_values"
in
processed_outputs
:
# Cast pixel values to model dtype already here,
# so we need to transfer less data to the GPU
processed_outputs
[
"pixel_values"
]
=
processed_outputs
[
"pixel_values"
].
to
(
self
.
info
.
ctx
.
model_config
.
dtype
)
# HF processor pops the `num_crops` kwarg, which is needed by vLLM
if
(
images
:
=
mm_data
.
get
(
"images"
))
is
not
None
:
...
...
vllm/multimodal/inputs.py
View file @
696259ca
...
...
@@ -746,11 +746,17 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
batched_inputs
:
BatchedTensorInputs
,
*
,
device
:
torch
.
types
.
Device
,
dtype
:
Optional
[
torch
.
dtype
]
=
None
,
)
->
BatchedTensorInputs
:
json_inputs
=
cast
(
JSONTree
[
torch
.
Tensor
],
batched_inputs
)
def
maybe_cast_dtype
(
x
:
torch
.
Tensor
):
# This mimics the behavior of transformers.BatchFeature
return
x
.
to
(
dtype
=
dtype
)
if
x
.
is_floating_point
()
else
x
json_mapped
=
json_map_leaves
(
lambda
x
:
x
.
to
(
device
,
non_blocking
=
True
),
# NOTE: Cast the dtype before sending it to device
lambda
x
:
maybe_cast_dtype
(
x
).
to
(
device
=
device
,
non_blocking
=
True
),
json_inputs
,
)
...
...
vllm/spec_decode/draft_model_runner.py
View file @
696259ca
...
...
@@ -294,8 +294,11 @@ class TP1DraftModelRunner(ModelRunnerWrapperBase):
inputs_embeds
=
None
,
positions
=
model_input
.
input_positions
,
intermediate_tensors
=
intermediate_tensors
,
**
MultiModalKwargs
.
as_kwargs
(
multi_modal_kwargs
,
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
multi_modal_kwargs
,
dtype
=
self
.
model_runner
.
model_config
.
dtype
,
device
=
self
.
device
,
),
**
model_execute_kwargs
,
)
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
696259ca
...
...
@@ -929,8 +929,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
encoder_outputs
=
[]
for
grouped_mm_inputs
in
grouped_mm_inputs_list
:
batched_mm_inputs
=
MultiModalKwargs
.
batch
(
grouped_mm_inputs
)
batched_mm_inputs
=
MultiModalKwargs
.
as_kwargs
(
batched_mm_inputs
,
device
=
self
.
device
)
batched_mm_inputs
=
MultiModalKwargs
.
as_kwargs
(
batched_mm_inputs
,
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
)
# Run the encoder.
# `curr_group_outputs` is either of the following:
...
...
@@ -1874,7 +1877,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
batched_dummy_mm_inputs
=
MultiModalKwargs
.
batch
(
[
dummy_mm_kwargs
]
*
max_num_mm_items
)
batched_dummy_mm_inputs
=
MultiModalKwargs
.
as_kwargs
(
batched_dummy_mm_inputs
,
device
=
self
.
device
)
batched_dummy_mm_inputs
,
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
)
# Run multimodal encoder.
dummy_encoder_outputs
=
self
.
model
.
get_multimodal_embeddings
(
...
...
vllm/v1/worker/tpu_model_runner.py
View file @
696259ca
...
...
@@ -652,8 +652,11 @@ class TPUModelRunner(LoRAModelRunnerMixin):
encoder_outputs
=
[]
for
grouped_mm_inputs
in
grouped_mm_inputs_list
:
batched_mm_inputs
=
MultiModalKwargs
.
batch
(
grouped_mm_inputs
)
batched_mm_inputs
=
MultiModalKwargs
.
as_kwargs
(
batched_mm_inputs
,
device
=
self
.
device
)
batched_mm_inputs
=
MultiModalKwargs
.
as_kwargs
(
batched_mm_inputs
,
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
)
# Run the encoder.
# `curr_group_outputs` is either of the following:
...
...
@@ -1435,8 +1438,11 @@ class TPUModelRunner(LoRAModelRunnerMixin):
batched_dummy_mm_inputs
=
MultiModalKwargs
.
batch
([
dummy_mm_kwargs
]
*
batch_size
)
return
MultiModalKwargs
.
as_kwargs
(
batched_dummy_mm_inputs
,
device
=
self
.
device
)
return
MultiModalKwargs
.
as_kwargs
(
batched_dummy_mm_inputs
,
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
)
def
_get_req_paddings
(
min_req_size
:
int
,
max_req_size
:
int
)
->
list
[
int
]:
...
...
vllm/worker/cpu_enc_dec_model_runner.py
View file @
696259ca
...
...
@@ -297,8 +297,11 @@ class CPUEncoderDecoderModelRunner(
model_input
.
encoder_input_tokens
,
"encoder_positions"
:
model_input
.
encoder_input_positions
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
"intermediate_tensors"
:
intermediate_tensors
,
}
...
...
vllm/worker/cpu_model_runner.py
View file @
696259ca
...
...
@@ -628,7 +628,10 @@ class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]):
multimodal_kwargs
=
{}
if
model_input
.
multi_modal_kwargs
is
not
None
:
multimodal_kwargs
=
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
,
device
=
self
.
device
)
model_input
.
multi_modal_kwargs
,
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
)
execute_model_kwargs
=
{}
if
previous_hidden_states
is
not
None
:
execute_model_kwargs
.
update
(
...
...
vllm/worker/cpu_pooling_model_runner.py
View file @
696259ca
...
...
@@ -50,8 +50,11 @@ class CPUPoolingModelRunner(
model_input
.
input_tokens
,
"positions"
:
model_input
.
input_positions
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
**
cross_enc_kwargs
,
"intermediate_tensors"
:
intermediate_tensors
,
...
...
vllm/worker/enc_dec_model_runner.py
View file @
696259ca
...
...
@@ -202,9 +202,13 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
encoder_input_ids
=
model_input
.
encoder_input_tokens
,
encoder_positions
=
model_input
.
encoder_input_positions
,
intermediate_tensors
=
intermediate_tensors
,
**
MultiModalKwargs
.
as_kwargs
(
multi_modal_kwargs
,
device
=
self
.
device
),
**
seqlen_agnostic_kwargs
)
**
MultiModalKwargs
.
as_kwargs
(
multi_modal_kwargs
,
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
**
seqlen_agnostic_kwargs
,
)
logits
=
self
.
model
.
compute_logits
(
hidden_or_intermediate_states
,
model_input
.
sampling_metadata
)
...
...
vllm/worker/model_runner.py
View file @
696259ca
...
...
@@ -1845,8 +1845,11 @@ class ModelRunner(GPUModelRunnerBase[ModelInputForGPUWithSamplingMetadata]):
inputs_embeds
=
model_input
.
inputs_embeds
,
positions
=
model_input
.
input_positions
,
intermediate_tensors
=
intermediate_tensors
,
**
MultiModalKwargs
.
as_kwargs
(
multi_modal_kwargs
,
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
multi_modal_kwargs
,
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
**
seqlen_agnostic_kwargs
,
**
model_kwargs
,
)
...
...
vllm/worker/multi_step_neuron_model_runner.py
View file @
696259ca
...
...
@@ -70,8 +70,11 @@ class MultiStepNeuronModelRunner(NeuronModelRunner):
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
input_block_ids
=
model_input
.
input_block_ids
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
output
=
self
.
model
.
sample
(
...
...
vllm/worker/multi_step_neuronx_distributed_model_runner.py
View file @
696259ca
...
...
@@ -49,8 +49,11 @@ class MultiStepNeuronxDistributedModelRunner(NeuronxDistributedModelRunner):
positions
=
model_input
.
input_positions
,
input_block_ids
=
model_input
.
input_block_ids
,
sampling_params
=
sampling_params
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
output
=
self
.
model
.
sample
(
...
...
vllm/worker/neuron_model_runner.py
View file @
696259ca
...
...
@@ -378,9 +378,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
positions
=
model_input
.
input_positions
,
input_block_ids
=
model_input
.
input_block_ids
,
sampling_params
=
sampling_params
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
elif
current_platform
.
use_transformers_neuronx
():
# [TODO] validate on-device sampling
...
...
@@ -389,9 +391,11 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]):
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
input_block_ids
=
model_input
.
input_block_ids
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
# Compute the logits only if the on-device sampling is turned off as
...
...
vllm/worker/pooling_model_runner.py
View file @
696259ca
...
...
@@ -119,10 +119,14 @@ class PoolingModelRunner(
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
intermediate_tensors
=
intermediate_tensors
,
**
MultiModalKwargs
.
as_kwargs
(
multi_modal_kwargs
,
device
=
self
.
device
),
**
MultiModalKwargs
.
as_kwargs
(
multi_modal_kwargs
,
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
**
cross_enc_kwargs
,
**
seqlen_agnostic_kwargs
)
**
seqlen_agnostic_kwargs
,
)
if
(
self
.
observability_config
is
not
None
and
self
.
observability_config
.
collect_model_forward_time
):
...
...
vllm/worker/xpu_model_runner.py
View file @
696259ca
...
...
@@ -562,9 +562,12 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]):
input_ids
=
model_input
.
input_tokens
,
positions
=
model_input
.
input_positions
,
intermediate_tensors
=
intermediate_tensors
,
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
device
=
self
.
device
))
**
MultiModalKwargs
.
as_kwargs
(
model_input
.
multi_modal_kwargs
or
{},
dtype
=
self
.
model_config
.
dtype
,
device
=
self
.
device
,
),
)
# Compute the logits in the last pipeline stage.
if
not
get_pp_group
().
is_last_rank
:
return
hidden_or_intermediate_states
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment