Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0377b831
Unverified
Commit
0377b831
authored
Apr 17, 2025
by
Nick Hill
Committed by
GitHub
Apr 17, 2025
Browse files
[MLA] Simplification to batch P/D reordering (#16673)
Signed-off-by:
Nick Hill
<
nhill@redhat.com
>
parent
e4755f7f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
12 additions
and
16 deletions
+12
-16
vllm/v1/attention/backends/mla/common.py
vllm/v1/attention/backends/mla/common.py
+5
-7
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+7
-9
No files found.
vllm/v1/attention/backends/mla/common.py
View file @
0377b831
...
@@ -415,20 +415,18 @@ class MLACommonMetadataBuilder(Generic[M]):
...
@@ -415,20 +415,18 @@ class MLACommonMetadataBuilder(Generic[M]):
# the above loop
# the above loop
num_decodes
=
len
(
decodes
)
num_decodes
=
len
(
decodes
)
num_prefills
=
len
(
prefills
)
num_prefills
=
len
(
prefills
)
first_prefill
=
0
modified_batch
=
False
modified_batch
=
False
for
i
in
range
(
1
,
min
(
num_decodes
,
num_prefills
)
+
1
):
for
i
in
range
(
1
,
min
(
num_decodes
,
num_prefills
)
+
1
):
# If the decode is at the "back" of the batch, i, we can swap it
# If the decode is at the "back" of the batch, i, we can swap it
# with the prefill closest to the front of the batch
# with the prefill closest to the front of the batch
if
decodes
[
num_decodes
-
i
]
>=
num_decodes
:
decode_idx
=
decodes
[
num_decodes
-
i
]
input_batch
.
swap_states
(
prefills
[
first_prefill
],
if
decode_idx
<
num_decodes
:
decodes
[
num_decodes
-
i
])
first_prefill
+=
1
modified_batch
=
True
else
:
break
break
input_batch
.
swap_states
(
prefills
[
i
-
1
],
decode_idx
)
modified_batch
=
True
# Save for next `build` call
# Save for next `build` call
# TODO(lucas): this is a bit of a hack, we should probably have a
# TODO(lucas): this is a bit of a hack, we should probably have a
# better way of doing this
# better way of doing this
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
0377b831
...
@@ -458,7 +458,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -458,7 +458,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
if
removed_req_indices
:
if
removed_req_indices
:
self
.
input_batch
.
condense
(
removed_req_indices
)
self
.
input_batch
.
condense
(
removed_req_indices
)
if
batch_changed
:
# Some attention backends (namely MLA) may want to separate requests
# based on if the attention computation will be compute-bound or
# memory-bound. This gives them a hook to do that.
batch_reordered
=
self
.
attn_metadata_builder
.
reorder_batch
(
self
.
input_batch
,
scheduler_output
)
if
batch_changed
or
batch_reordered
:
self
.
input_batch
.
refresh_sampling_metadata
()
self
.
input_batch
.
refresh_sampling_metadata
()
def
_prepare_inputs
(
def
_prepare_inputs
(
...
@@ -471,14 +477,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -471,14 +477,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
num_reqs
=
self
.
input_batch
.
num_reqs
num_reqs
=
self
.
input_batch
.
num_reqs
assert
num_reqs
>
0
assert
num_reqs
>
0
# Some attention backends (namely MLA) may want to separate requests
# based on if the attention computation will be compute-bound or
# memory-bound. This gives them a hook to do that.
modified_batch
=
self
.
attn_metadata_builder
.
reorder_batch
(
self
.
input_batch
,
scheduler_output
)
if
modified_batch
:
self
.
input_batch
.
refresh_sampling_metadata
()
# OPTIMIZATION: Start copying the block table first.
# OPTIMIZATION: Start copying the block table first.
# This way, we can overlap the copy with the following CPU operations.
# This way, we can overlap the copy with the following CPU operations.
self
.
input_batch
.
block_table
.
commit
(
num_reqs
)
self
.
input_batch
.
block_table
.
commit
(
num_reqs
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment