Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
14561fab
Unverified
Commit
14561fab
authored
Feb 24, 2026
by
Wentao Ye
Committed by
GitHub
Feb 24, 2026
Browse files
[Perf] Optimize pooling model redundant copy, 1.8% throughput improvement (#35127)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
c77f3e12
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
51 additions
and
15 deletions
+51
-15
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+51
-15
No files found.
vllm/v1/worker/gpu_model_runner.py
View file @
14561fab
...
@@ -95,7 +95,6 @@ from vllm.sequence import IntermediateTensors
...
@@ -95,7 +95,6 @@ from vllm.sequence import IntermediateTensors
from
vllm.tasks
import
GenerationTask
,
PoolingTask
,
SupportedTask
from
vllm.tasks
import
GenerationTask
,
PoolingTask
,
SupportedTask
from
vllm.tracing
import
instrument
from
vllm.tracing
import
instrument
from
vllm.utils
import
length_from_prompt_token_ids_or_embeds
from
vllm.utils
import
length_from_prompt_token_ids_or_embeds
from
vllm.utils.jsontree
import
json_map_leaves
from
vllm.utils.math_utils
import
cdiv
,
round_up
from
vllm.utils.math_utils
import
cdiv
,
round_up
from
vllm.utils.mem_utils
import
DeviceMemoryProfiler
,
format_gib
from
vllm.utils.mem_utils
import
DeviceMemoryProfiler
,
format_gib
from
vllm.utils.nvtx_pytorch_hooks
import
PytHooks
from
vllm.utils.nvtx_pytorch_hooks
import
PytHooks
...
@@ -268,6 +267,51 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
...
@@ -268,6 +267,51 @@ class AsyncGPUModelRunnerOutput(AsyncModelRunnerOutput):
return
output
return
output
def
_copy_pooler_output_to_cpu
(
raw_pooler_output
:
PoolerOutput
,
finished_mask
:
list
[
bool
]
)
->
list
[
torch
.
Tensor
|
None
]:
num_reqs
=
len
(
finished_mask
)
if
isinstance
(
raw_pooler_output
,
torch
.
Tensor
):
if
raw_pooler_output
.
shape
[
0
]
!=
num_reqs
:
raise
ValueError
(
"Pooler output batch size does not match finished mask size: "
f
"
{
raw_pooler_output
.
shape
[
0
]
}
!=
{
num_reqs
}
."
)
num_finished
=
sum
(
finished_mask
)
if
num_finished
==
0
:
return
[
None
]
*
num_reqs
if
num_finished
==
num_reqs
:
return
list
(
raw_pooler_output
.
to
(
"cpu"
,
non_blocking
=
True
))
# partial finished
finished_indices
=
[
i
for
i
,
include
in
enumerate
(
finished_mask
)
if
include
]
index_tensor
=
torch
.
tensor
(
finished_indices
,
device
=
raw_pooler_output
.
device
,
dtype
=
torch
.
long
)
finished_outputs
=
raw_pooler_output
.
index_select
(
0
,
index_tensor
).
to
(
"cpu"
,
non_blocking
=
True
)
partial_pooler_output
:
list
[
torch
.
Tensor
|
None
]
=
[
None
]
*
num_reqs
for
i
,
out
in
zip
(
finished_indices
,
finished_outputs
):
partial_pooler_output
[
i
]
=
out
return
partial_pooler_output
assert
isinstance
(
raw_pooler_output
,
list
)
if
len
(
raw_pooler_output
)
!=
num_reqs
:
raise
ValueError
(
"Pooler output batch size does not match finished mask size: "
f
"
{
len
(
raw_pooler_output
)
}
!=
{
num_reqs
}
."
)
pooler_output
:
list
[
torch
.
Tensor
|
None
]
=
[
None
]
*
num_reqs
for
i
,
(
out
,
include
)
in
enumerate
(
zip
(
raw_pooler_output
,
finished_mask
)):
if
include
and
out
is
not
None
:
pooler_output
[
i
]
=
out
.
to
(
"cpu"
,
non_blocking
=
True
)
return
pooler_output
class
AsyncGPUPoolingModelRunnerOutput
(
AsyncModelRunnerOutput
):
class
AsyncGPUPoolingModelRunnerOutput
(
AsyncModelRunnerOutput
):
def
__init__
(
def
__init__
(
self
,
self
,
...
@@ -289,15 +333,11 @@ class AsyncGPUPoolingModelRunnerOutput(AsyncModelRunnerOutput):
...
@@ -289,15 +333,11 @@ class AsyncGPUPoolingModelRunnerOutput(AsyncModelRunnerOutput):
default_stream
=
torch
.
cuda
.
current_stream
()
default_stream
=
torch
.
cuda
.
current_stream
()
with
torch
.
cuda
.
stream
(
async_output_copy_stream
):
with
torch
.
cuda
.
stream
(
async_output_copy_stream
):
async_output_copy_stream
.
wait_stream
(
default_stream
)
async_output_copy_stream
.
wait_stream
(
default_stream
)
raw_pooler_output_cpu
=
json_map_leaves
(
self
.
_model_runner_output
.
pooler_output
=
_copy_pooler_output_to_cpu
(
lambda
x
:
None
if
x
is
None
else
x
.
to
(
"cpu"
,
non_blocking
=
True
)
,
raw_pooler_output
=
self
.
_raw_pooler_output
,
self
.
_raw_pooler_output
,
finished_mask
=
finished_mask
,
)
)
self
.
async_copy_ready_event
.
record
()
self
.
async_copy_ready_event
.
record
()
self
.
_model_runner_output
.
pooler_output
=
[
out
if
include
else
None
for
out
,
include
in
zip
(
raw_pooler_output_cpu
,
finished_mask
)
]
def
get_output
(
self
)
->
ModelRunnerOutput
:
def
get_output
(
self
)
->
ModelRunnerOutput
:
"""Copy the device tensors to the host and return a ModelRunnerOutput.
"""Copy the device tensors to the host and return a ModelRunnerOutput.
...
@@ -2705,14 +2745,10 @@ class GPUModelRunner(
...
@@ -2705,14 +2745,10 @@ class GPUModelRunner(
async_output_copy_stream
=
self
.
async_output_copy_stream
,
async_output_copy_stream
=
self
.
async_output_copy_stream
,
)
)
raw_pooler_output
=
json_map_leaves
(
model_runner_output
.
pooler_output
=
_copy_pooler_output_to_cpu
(
lambda
x
:
None
if
x
is
None
else
x
.
to
(
"cpu"
,
non_blocking
=
True
)
,
raw_pooler_output
=
raw_pooler_output
,
raw_pooler_output
,
finished_mask
=
finished_mask
,
)
)
model_runner_output
.
pooler_output
=
[
out
if
include
else
None
for
out
,
include
in
zip
(
raw_pooler_output
,
finished_mask
)
]
self
.
_sync_device
()
self
.
_sync_device
()
return
model_runner_output
return
model_runner_output
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment