Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
fa63e710
Unverified
Commit
fa63e710
Authored Jan 26, 2025 by Keyun Tong
Committed by GitHub on Jan 26, 2025
Browse files
[V1][Perf] Reduce scheduling overhead in model runner after cuda sync (#12094)
Signed-off-by: Keyun Tong <tongkeyun@gmail.com>
parent
2a0309a6
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing 3 changed files with 21 additions and 13 deletions
+21
-13
vllm/v1/outputs.py
vllm/v1/outputs.py
+1
-1
vllm/v1/sample/sampler.py
vllm/v1/sample/sampler.py
+1
-2
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+19
-10
No files found.
vllm/v1/outputs.py
View file @
fa63e710
...
...
@@ -8,7 +8,7 @@ import torch
class
SamplerOutput
:
# [num_reqs]
sampled_token_ids
:
List
[
int
]
sampled_token_ids
:
torch
.
Tensor
# [num_reqs, max_num_logprobs + 1]
logprob_token_ids
:
Optional
[
torch
.
Tensor
]
...
...
vllm/v1/sample/sampler.py
View file @
fa63e710
...
...
@@ -50,9 +50,8 @@ class Sampler(nn.Module):
# Use int32 to reduce the tensor size.
sampled
=
sampled
.
to
(
torch
.
int32
)
# NOTE: CPU-GPU synchronization happens here.
sampler_output
=
SamplerOutput
(
sampled_token_ids
=
sampled
.
tolist
()
,
sampled_token_ids
=
sampled
,
logprob_token_ids
=
topk_indices
,
logprobs
=
topk_logprobs
,
prompt_logprob_token_ids
=
None
,
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
fa63e710
...
...
@@ -775,10 +775,10 @@ class GPUModelRunner:
sampling_metadata
=
sampling_metadata
,
)
sampled_token_ids
=
sampler_output
.
sampled_token_ids
# TODO(woosuk): The following loop can be slow since it iterates over
# the requests one by one. Optimize.
num_reqs
=
self
.
input_batch
.
num_reqs
request_seq_lens
:
List
[
Tuple
[
int
,
CachedRequestState
,
int
]]
=
[]
for
i
,
req_id
in
enumerate
(
self
.
input_batch
.
req_ids
[:
num_reqs
]):
assert
req_id
is
not
None
req_state
=
self
.
requests
[
req_id
]
...
...
@@ -787,10 +787,10 @@ class GPUModelRunner:
assert
seq_len
<=
req_state
.
num_tokens
if
seq_len
==
req_state
.
num_tokens
:
# Append the sampled token to the output token ids.
token_id
=
sampled_token_ids
[
i
]
self
.
input_batch
.
token_ids_cpu
[
i
,
seq_len
]
=
token_id
self
.
input_batch
.
num_tokens
[
i
]
+=
1
req_state
.
output_token_ids
.
append
(
token_id
)
# OPTIMIZATION: Priming the state updates for later updates.
req_state
.
output_token_ids
.
append
(
0
)
request_seq_lens
.
append
((
i
,
req_state
,
seq_len
))
else
:
# Ignore the sampled token from the partial request.
# Rewind the generator state as if the token was not sampled.
...
...
@@ -799,6 +799,21 @@ class GPUModelRunner:
# This relies on cuda-specific torch-internal impl details
generator
.
set_offset
(
generator
.
get_offset
()
-
4
)
# num_reqs entries should be non-None
assert
all
(
req_id
is
not
None
for
req_id
in
self
.
input_batch
.
req_ids
[:
num_reqs
]),
"req_ids contains None"
req_ids
=
cast
(
List
[
str
],
self
.
input_batch
.
req_ids
[:
num_reqs
])
# NOTE: GPU -> CPU Sync happens here.
# Move as many CPU operations as possible before this sync point.
sampled_token_ids
=
sampler_output
.
sampled_token_ids
.
tolist
()
# Update with the actual token ids
for
i
,
req_state
,
seq_len
in
request_seq_lens
:
token_id
=
sampled_token_ids
[
i
]
self
.
input_batch
.
token_ids_cpu
[
i
,
seq_len
]
=
token_id
req_state
.
output_token_ids
[
-
1
]
=
token_id
if
sampler_output
.
logprob_token_ids
is
None
:
logprob_token_ids
=
None
else
:
...
...
@@ -808,12 +823,6 @@ class GPUModelRunner:
else
:
logprobs
=
sampler_output
.
logprobs
.
cpu
()
# num_reqs entries should be non-None
assert
all
(
req_id
is
not
None
for
req_id
in
self
.
input_batch
.
req_ids
[:
num_reqs
]),
"req_ids contains None"
req_ids
=
cast
(
List
[
str
],
self
.
input_batch
.
req_ids
[:
num_reqs
])
model_runner_output
=
ModelRunnerOutput
(
req_ids
=
req_ids
,
req_id_to_index
=
self
.
input_batch
.
req_id_to_index
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment