Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b5571368
Unverified
Commit
b5571368
authored
Aug 31, 2025
by
Woosuk Kwon
Committed by
GitHub
Sep 01, 2025
Browse files
[Misc] Move fast prefill logic to separate method (#24013)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
acc1a6e1
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
27 additions
and
21 deletions
+27
-21
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+27
-21
No files found.
vllm/v1/worker/gpu_model_runner.py
View file @
b5571368
...
...
@@ -783,28 +783,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
logits_indices_padded
=
None
if
self
.
cache_config
.
kv_sharing_fast_prefill
:
assert
self
.
kv_sharing_fast_prefill_logits_indices
is
not
None
num_logits
=
logits_indices
.
shape
[
0
]
assert
num_logits
>
0
self
.
kv_sharing_fast_prefill_logits_indices
[:
num_logits
].
copy_
(
logits_indices_padded
=
self
.
_prepare_kv_sharing_fast_prefill
(
logits_indices
)
# There might be leftover indices in logits_indices[num_logits:]
# from previous iterations, whose values may be greater than the
# batch size in the current iteration. To ensure indices are always
# valid, we fill the padded indices with the last index.
self
.
kv_sharing_fast_prefill_logits_indices
[
num_logits
:].
fill_
(
logits_indices
[
-
1
].
item
())
if
(
self
.
compilation_config
.
cudagraph_mode
!=
CUDAGraphMode
.
NONE
and
num_logits
<=
self
.
cudagraph_batch_sizes
[
-
1
]):
# Use piecewise CUDA graphs.
# Add padding to the batch size.
num_logits_padded
=
self
.
vllm_config
.
pad_for_cudagraph
(
num_logits
)
else
:
num_logits_padded
=
num_logits
logits_indices_padded
=
(
self
.
kv_sharing_fast_prefill_logits_indices
[:
num_logits_padded
]
)
attn_metadata
:
dict
[
str
,
Any
]
=
{}
...
...
@@ -1109,6 +1089,32 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
)
return
metadata
def _prepare_kv_sharing_fast_prefill(
    self,
    logits_indices: torch.Tensor,
) -> torch.Tensor:
    """Stage ``logits_indices`` in the persistent fast-prefill buffer.

    The indices are copied into the front of
    ``self.kv_sharing_fast_prefill_logits_indices`` and every remaining
    slot of that buffer is overwritten with the last index of
    ``logits_indices``.

    Args:
        logits_indices: Non-empty tensor of logits indices for the
            current iteration.

    Returns:
        The leading slice of the persistent buffer. Its length is
        ``num_logits`` padded via ``vllm_config.pad_for_cudagraph`` when
        CUDA graphs are enabled and ``num_logits`` does not exceed the
        largest entry of ``cudagraph_batch_sizes``; otherwise it is
        exactly ``num_logits``.
    """
    buffer = self.kv_sharing_fast_prefill_logits_indices
    assert buffer is not None
    num_logits = logits_indices.shape[0]
    assert num_logits > 0

    buffer[:num_logits].copy_(logits_indices)

    # Slots past num_logits may still hold indices written by earlier
    # iterations, and those values can exceed the current batch size.
    # Overwrite them with the last index of this iteration so that every
    # entry in the buffer stays valid.
    last_index = logits_indices[-1].item()
    buffer[num_logits:].fill_(last_index)

    cudagraph_applies = (
        self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
        and num_logits <= self.cudagraph_batch_sizes[-1]
    )
    if not cudagraph_applies:
        # No padding needed: hand back exactly the populated prefix.
        return buffer[:num_logits]

    # Use piecewise CUDA graphs: pad the batch size accordingly.
    num_logits_padded = self.vllm_config.pad_for_cudagraph(num_logits)
    return buffer[:num_logits_padded]
def
_execute_mm_encoder
(
self
,
scheduler_output
:
"SchedulerOutput"
):
scheduled_encoder_inputs
=
scheduler_output
.
scheduled_encoder_inputs
if
not
scheduled_encoder_inputs
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment