Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
319cb1e3
Unverified
Commit
319cb1e3
authored
Jun 10, 2025
by
Lukas Geiger
Committed by
GitHub
Jun 10, 2025
Browse files
[Core] Batch multi modal input using pinned memory (#19169)
Signed-off-by:
Lukas Geiger
<
lukas.geiger94@gmail.com
>
parent
1efef716
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
18 additions
and
7 deletions
+18
-7
vllm/multimodal/inputs.py
vllm/multimodal/inputs.py
+14
-5
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+4
-2
No files found.
vllm/multimodal/inputs.py
View file @
319cb1e3
...
@@ -680,7 +680,8 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
...
@@ -680,7 +680,8 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
return
self
.
_items_by_modality
.
keys
()
return
self
.
_items_by_modality
.
keys
()
@
staticmethod
@
staticmethod
def
_try_stack
(
nested_tensors
:
NestedTensors
)
->
NestedTensors
:
def
_try_stack
(
nested_tensors
:
NestedTensors
,
pin_memory
:
bool
=
False
)
->
NestedTensors
:
"""
"""
Stack the inner dimensions that have the same shape in
Stack the inner dimensions that have the same shape in
a nested list of tensors.
a nested list of tensors.
...
@@ -697,7 +698,9 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
...
@@ -697,7 +698,9 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
if
isinstance
(
nested_tensors
,
(
int
,
float
)):
if
isinstance
(
nested_tensors
,
(
int
,
float
)):
return
torch
.
tensor
(
nested_tensors
)
return
torch
.
tensor
(
nested_tensors
)
stacked
=
[
MultiModalKwargs
.
_try_stack
(
t
)
for
t
in
nested_tensors
]
stacked
=
[
MultiModalKwargs
.
_try_stack
(
t
,
pin_memory
)
for
t
in
nested_tensors
]
if
not
is_list_of
(
stacked
,
torch
.
Tensor
,
check
=
"all"
):
if
not
is_list_of
(
stacked
,
torch
.
Tensor
,
check
=
"all"
):
# Only tensors (not lists) can be stacked.
# Only tensors (not lists) can be stacked.
return
stacked
return
stacked
...
@@ -713,10 +716,16 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
...
@@ -713,10 +716,16 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
# The tensors have incompatible shapes and can't be stacked.
# The tensors have incompatible shapes and can't be stacked.
return
tensors_
return
tensors_
return
torch
.
stack
(
tensors_
)
outputs
=
torch
.
empty
(
len
(
tensors_
),
*
tensors_
[
0
].
shape
,
dtype
=
tensors_
[
0
].
dtype
,
device
=
tensors_
[
0
].
device
,
pin_memory
=
pin_memory
)
return
torch
.
stack
(
tensors_
,
out
=
outputs
)
@
staticmethod
@
staticmethod
def
batch
(
inputs_list
:
list
[
"MultiModalKwargs"
])
->
BatchedTensorInputs
:
def
batch
(
inputs_list
:
list
[
"MultiModalKwargs"
],
pin_memory
:
bool
=
False
)
->
BatchedTensorInputs
:
"""
"""
Batch multiple inputs together into a dictionary.
Batch multiple inputs together into a dictionary.
...
@@ -738,7 +747,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
...
@@ -738,7 +747,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]):
item_lists
[
k
].
append
(
v
)
item_lists
[
k
].
append
(
v
)
return
{
return
{
k
:
MultiModalKwargs
.
_try_stack
(
item_list
)
k
:
MultiModalKwargs
.
_try_stack
(
item_list
,
pin_memory
)
for
k
,
item_list
in
item_lists
.
items
()
for
k
,
item_list
in
item_lists
.
items
()
}
}
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
319cb1e3
...
@@ -962,7 +962,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -962,7 +962,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
encoder_outputs
=
[]
encoder_outputs
=
[]
for
grouped_mm_inputs
in
grouped_mm_inputs_list
:
for
grouped_mm_inputs
in
grouped_mm_inputs_list
:
batched_mm_inputs
=
MultiModalKwargs
.
batch
(
grouped_mm_inputs
)
batched_mm_inputs
=
MultiModalKwargs
.
batch
(
grouped_mm_inputs
,
pin_memory
=
self
.
pin_memory
)
batched_mm_inputs
=
MultiModalKwargs
.
as_kwargs
(
batched_mm_inputs
=
MultiModalKwargs
.
as_kwargs
(
batched_mm_inputs
,
batched_mm_inputs
,
device
=
self
.
device
,
device
=
self
.
device
,
...
@@ -1989,7 +1990,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -1989,7 +1990,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
).
multi_modal_data
).
multi_modal_data
batched_dummy_mm_inputs
=
MultiModalKwargs
.
batch
(
batched_dummy_mm_inputs
=
MultiModalKwargs
.
batch
(
[
dummy_mm_kwargs
]
*
max_num_mm_items
)
[
dummy_mm_kwargs
]
*
max_num_mm_items
,
pin_memory
=
self
.
pin_memory
)
batched_dummy_mm_inputs
=
MultiModalKwargs
.
as_kwargs
(
batched_dummy_mm_inputs
=
MultiModalKwargs
.
as_kwargs
(
batched_dummy_mm_inputs
,
batched_dummy_mm_inputs
,
device
=
self
.
device
,
device
=
self
.
device
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment