Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ae9d0e7d
Unverified
Commit
ae9d0e7d
authored
Oct 10, 2025
by
Sage Moore
Committed by
GitHub
Oct 10, 2025
Browse files
[Bugfix] Make DP padding optional in coordinate_batch_across_dp (#26375)
Signed-off-by:
Sage Moore
<
sage@neuralmagic.com
>
parent
0e67102d
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
123 additions
and
42 deletions
+123
-42
vllm/forward_context.py
vllm/forward_context.py
+14
-1
vllm/v1/worker/dp_utils.py
vllm/v1/worker/dp_utils.py
+80
-26
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+29
-15
No files found.
vllm/forward_context.py
View file @
ae9d0e7d
...
@@ -12,6 +12,7 @@ import torch
...
@@ -12,6 +12,7 @@ import torch
import
vllm.envs
as
envs
import
vllm.envs
as
envs
from
vllm.config
import
CUDAGraphMode
,
ParallelConfig
,
VllmConfig
from
vllm.config
import
CUDAGraphMode
,
ParallelConfig
,
VllmConfig
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.v1.worker.dp_utils
import
coordinate_batch_across_dp
from
vllm.v1.worker.ubatch_utils
import
UBatchSlices
from
vllm.v1.worker.ubatch_utils
import
UBatchSlices
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
...
@@ -278,7 +279,19 @@ def set_forward_context(
...
@@ -278,7 +279,19 @@ def set_forward_context(
if
vllm_config
.
parallel_config
.
data_parallel_size
>
1
and
(
if
vllm_config
.
parallel_config
.
data_parallel_size
>
1
and
(
attn_metadata
is
not
None
or
num_tokens
is
not
None
attn_metadata
is
not
None
or
num_tokens
is
not
None
):
):
assert
num_tokens_across_dp
is
not
None
# If num_tokens_across_dp hasn't already been initialized, then
# initialize it here. Both DP padding and Microbatching will be
# disabled.
if
num_tokens_across_dp
is
None
:
assert
ubatch_slices
is
None
assert
num_tokens
is
not
None
_
,
num_tokens_across_dp
=
coordinate_batch_across_dp
(
num_tokens_unpadded
=
num_tokens
,
parallel_config
=
vllm_config
.
parallel_config
,
allow_microbatching
=
False
,
allow_dp_padding
=
False
,
)
assert
num_tokens_across_dp
is
not
None
dp_metadata
=
DPMetadata
.
make
(
dp_metadata
=
DPMetadata
.
make
(
vllm_config
.
parallel_config
,
num_tokens
or
0
,
num_tokens_across_dp
vllm_config
.
parallel_config
,
num_tokens
or
0
,
num_tokens_across_dp
)
)
...
...
vllm/v1/worker/dp_utils.py
View file @
ae9d0e7d
...
@@ -7,7 +7,7 @@ import torch
...
@@ -7,7 +7,7 @@ import torch
import
torch.distributed
as
dist
import
torch.distributed
as
dist
from
vllm.config
import
ParallelConfig
from
vllm.config
import
ParallelConfig
from
vllm.distributed.parallel_state
import
get_dp_group
from
vllm.distributed.parallel_state
import
get_dp_group
,
is_global_first_rank
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.v1.worker.ubatch_utils
import
(
from
vllm.v1.worker.ubatch_utils
import
(
...
@@ -37,6 +37,7 @@ def _get_device_and_group(parallel_config: ParallelConfig):
...
@@ -37,6 +37,7 @@ def _get_device_and_group(parallel_config: ParallelConfig):
def
_run_ar
(
def
_run_ar
(
should_ubatch
:
bool
,
should_ubatch
:
bool
,
should_dp_pad
:
bool
,
orig_num_tokens_per_ubatch
:
int
,
orig_num_tokens_per_ubatch
:
int
,
padded_num_tokens_per_ubatch
:
int
,
padded_num_tokens_per_ubatch
:
int
,
parallel_config
:
ParallelConfig
,
parallel_config
:
ParallelConfig
,
...
@@ -44,10 +45,11 @@ def _run_ar(
...
@@ -44,10 +45,11 @@ def _run_ar(
dp_size
=
parallel_config
.
data_parallel_size
dp_size
=
parallel_config
.
data_parallel_size
dp_rank
=
parallel_config
.
data_parallel_rank
dp_rank
=
parallel_config
.
data_parallel_rank
device
,
group
=
_get_device_and_group
(
parallel_config
)
device
,
group
=
_get_device_and_group
(
parallel_config
)
tensor
=
torch
.
zeros
(
3
,
dp_size
,
device
=
device
,
dtype
=
torch
.
int32
)
tensor
=
torch
.
zeros
(
4
,
dp_size
,
device
=
device
,
dtype
=
torch
.
int32
)
tensor
[
0
][
dp_rank
]
=
orig_num_tokens_per_ubatch
tensor
[
0
][
dp_rank
]
=
orig_num_tokens_per_ubatch
tensor
[
1
][
dp_rank
]
=
padded_num_tokens_per_ubatch
tensor
[
1
][
dp_rank
]
=
padded_num_tokens_per_ubatch
tensor
[
2
][
dp_rank
]
=
1
if
should_ubatch
else
0
tensor
[
2
][
dp_rank
]
=
1
if
should_ubatch
else
0
tensor
[
3
][
dp_rank
]
=
1
if
should_dp_pad
else
0
dist
.
all_reduce
(
tensor
,
group
=
group
)
dist
.
all_reduce
(
tensor
,
group
=
group
)
return
tensor
return
tensor
...
@@ -72,10 +74,26 @@ def _post_process_ubatch(tensor: torch.Tensor) -> bool:
...
@@ -72,10 +74,26 @@ def _post_process_ubatch(tensor: torch.Tensor) -> bool:
return
should_ubatch
return
should_ubatch
def
_post_process_dp_padding
(
tensor
:
torch
.
Tensor
,
should_dp_pad
:
bool
)
->
torch
.
Tensor
:
num_tokens_across_dp
=
tensor
[
1
,
:]
if
should_dp_pad
:
# If DP padding is enabled, ensure that each rank is processing the same number
# of tokens
max_num_tokens
=
int
(
num_tokens_across_dp
.
max
().
item
())
return
torch
.
tensor
(
[
max_num_tokens
]
*
len
(
num_tokens_across_dp
),
device
=
"cpu"
,
dtype
=
torch
.
int32
,
)
else
:
return
num_tokens_across_dp
.
cpu
()
def
_synchronize_dp_ranks
(
def
_synchronize_dp_ranks
(
num_tokens_unpadded
:
int
,
num_tokens_unpadded
:
int
,
num_tokens_padded
:
int
,
num_tokens_padded
:
int
,
should_attempt_ubatching
:
bool
,
should_attempt_ubatching
:
bool
,
should_attempt_dp_padding
:
bool
,
parallel_config
:
ParallelConfig
,
parallel_config
:
ParallelConfig
,
)
->
tuple
[
bool
,
Optional
[
torch
.
Tensor
]]:
)
->
tuple
[
bool
,
Optional
[
torch
.
Tensor
]]:
"""
"""
...
@@ -83,57 +101,88 @@ def _synchronize_dp_ranks(
...
@@ -83,57 +101,88 @@ def _synchronize_dp_ranks(
run with microbatching or none of them do.
run with microbatching or none of them do.
2. Determines the total number of tokens that each rank will run.
2. Determines the total number of tokens that each rank will run.
All ranks will be padded out so that the run with the same number
When running microbatched or if should_attempt_dp_padding is True, all
of tokens
ranks will be padded out so that the run with the same number
of tokens
Returns: tuple[
Returns: tuple[
should_ubatch: Are all DP ranks going to microbatch
should_ubatch: Are all DP ranks going to microbatch
num_tokens_after_padding: A tensor containing the total number of
num_tokens_after_padding: A tensor containing the total number of
tokens per-microbatch for each DP rank including padding.
tokens per-microbatch for each DP rank including
any DP
padding.
]
]
"""
"""
assert
num_tokens_padded
>=
num_tokens_unpadded
assert
num_tokens_padded
>=
num_tokens_unpadded
#
First we c
oordinate between the DP ranks via an All Reduce
#
C
oordinate between the DP ranks via an All Reduce
# to determine the total number of tokens that each rank
# to determine the total number of tokens that each rank
# will run and if we are using ubatching or not.
# will run and if we are using ubatching or not.
tensor
=
_run_ar
(
tensor
=
_run_ar
(
should_ubatch
=
should_attempt_ubatching
,
should_ubatch
=
should_attempt_ubatching
,
should_dp_pad
=
should_attempt_dp_padding
,
orig_num_tokens_per_ubatch
=
num_tokens_unpadded
,
orig_num_tokens_per_ubatch
=
num_tokens_unpadded
,
padded_num_tokens_per_ubatch
=
num_tokens_padded
,
padded_num_tokens_per_ubatch
=
num_tokens_padded
,
parallel_config
=
parallel_config
,
parallel_config
=
parallel_config
,
)
)
# Ensure that each rank is processing the same nuber of tokens
should_dp_pad
=
bool
(
torch
.
all
(
tensor
[
3
]
==
1
).
item
())
num_tokens_across_dp
=
tensor
[
1
,
:]
max_num_tokens
=
int
(
num_tokens_across_dp
.
max
().
item
())
# DP ranks should all have the same value for should_attempt_dp_padding.
num_tokens_after_padding
=
torch
.
tensor
(
assert
should_attempt_dp_padding
==
should_dp_pad
[
max_num_tokens
]
*
len
(
num_tokens_across_dp
),
device
=
"cpu"
,
dtype
=
torch
.
int32
)
# Check conditions for microbatching
should_ubatch
=
_post_process_ubatch
(
tensor
)
should_ubatch
=
_post_process_ubatch
(
tensor
)
if
should_ubatch
and
not
should_dp_pad
:
if
is_global_first_rank
():
logger
.
debug
(
"Microbatching has been triggered and requires DP padding. "
"Enabling DP padding even though it has been explicitly "
"disabled."
)
should_dp_pad
=
True
# Pad all DP ranks up to the maximum token count across ranks if
# should_dp_pad is True
num_tokens_after_padding
=
_post_process_dp_padding
(
tensor
,
should_dp_pad
,
)
return
should_ubatch
,
num_tokens_after_padding
return
should_ubatch
,
num_tokens_after_padding
def
coordinate_batch_across_dp
(
def
coordinate_batch_across_dp
(
num_scheduled_tokens_per_request
:
np
.
ndarray
,
num_tokens_unpadded
:
int
,
num_tokens_unpadded
:
int
,
num_tokens_padded
:
int
,
parallel_config
:
ParallelConfig
,
allow_microbatching
:
bool
,
allow_microbatching
:
bool
,
uniform_decode
:
bool
,
allow_dp_padding
:
bool
,
parallel_config
:
ParallelConfig
,
num_tokens_padded
:
Optional
[
int
]
=
None
,
uniform_decode
:
Optional
[
bool
]
=
None
,
num_scheduled_tokens_per_request
:
Optional
[
np
.
ndarray
]
=
None
,
)
->
tuple
[
Optional
[
UBatchSlices
],
Optional
[
torch
.
Tensor
]]:
)
->
tuple
[
Optional
[
UBatchSlices
],
Optional
[
torch
.
Tensor
]]:
"""
"""
Coordinates amongst all DP ranks to determine if and how the full batch
Coordinates amongst all DP ranks to determine if and how the full batch
should be split into microbatches.
should be split into microbatches.
Args:
num_tokens_unpadded: Number of tokens without accounting for padding
allow_microbatching: If microbatching should be attempted
allow_dp_padding: If all DP ranks should be padded up to the same value
parallel_config: The parallel config
num_tokens_padded: Number of tokens including any non-DP padding (CUDA graphs,
TP, etc)
uniform_decode: Only used if allow_microbatching is True. True if the batch
only contains single token decodes
num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The
number of tokens per request.
Returns: tuple[
Returns: tuple[
ubatch_slices: if this is set then all DP ranks have agreed to
ubatch_slices: if this is set then all DP ranks have agreed to
microbatch
microbatch
num_tokens_after_padding: A tensor containing the total number of
num_tokens_after_padding: A tensor containing the total number of
tokens per-microbatch for each DP rank including padding.
tokens per-microbatch for each DP rank including padding. Will be
padded up to the max value across all DP ranks when allow_dp_padding
is True.
]
]
"""
"""
...
@@ -141,21 +190,25 @@ def coordinate_batch_across_dp(
...
@@ -141,21 +190,25 @@ def coordinate_batch_across_dp(
# Early exit.
# Early exit.
return
None
,
None
return
None
,
None
# Check preconditions for microbatching
# If the caller has explicitly enabled microbatching.
should_attempt_ubatching
=
check_ubatch_thresholds
(
should_attempt_ubatching
=
False
parallel_config
,
if
allow_microbatching
:
num_tokens_unpadded
,
# Check preconditions for microbatching
uniform_decode
=
uniform_decode
,
assert
uniform_decode
is
not
None
)
should_attempt_ubatching
=
check_ubatch_thresholds
(
parallel_config
,
num_tokens_unpadded
,
uniform_decode
=
uniform_decode
,
)
# If the caller has explicitly disabled microbatching.
if
num_tokens_padded
is
None
:
if
not
allow_microbatching
:
num_tokens_padded
=
num_tokens_unpadded
should_attempt_ubatching
=
False
(
should_ubatch
,
num_tokens_after_padding
)
=
_synchronize_dp_ranks
(
(
should_ubatch
,
num_tokens_after_padding
)
=
_synchronize_dp_ranks
(
num_tokens_unpadded
,
num_tokens_unpadded
,
num_tokens_padded
,
num_tokens_padded
,
should_attempt_ubatching
,
should_attempt_ubatching
,
allow_dp_padding
,
parallel_config
,
parallel_config
,
)
)
...
@@ -170,6 +223,7 @@ def coordinate_batch_across_dp(
...
@@ -170,6 +223,7 @@ def coordinate_batch_across_dp(
assert
num_tokens_after_padding
is
not
None
assert
num_tokens_after_padding
is
not
None
token_split_point
=
int
(
num_tokens_after_padding
[
0
].
item
())
//
2
token_split_point
=
int
(
num_tokens_after_padding
[
0
].
item
())
//
2
assert
num_scheduled_tokens_per_request
is
not
None
ubatch_slices
=
create_ubatch_slices
(
ubatch_slices
=
create_ubatch_slices
(
num_scheduled_tokens_per_request
,
token_split_point
num_scheduled_tokens_per_request
,
token_split_point
)
)
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
ae9d0e7d
...
@@ -1178,13 +1178,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -1178,13 +1178,21 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
uniform_decode
=
(
uniform_decode
=
(
max_num_scheduled_tokens
==
self
.
uniform_decode_query_len
max_num_scheduled_tokens
==
self
.
uniform_decode_query_len
)
and
(
total_num_scheduled_tokens
==
num_reqs
*
max_num_scheduled_tokens
)
)
and
(
total_num_scheduled_tokens
==
num_reqs
*
max_num_scheduled_tokens
)
# Disable DP padding when running eager to avoid excessive padding when
# running prefills. This lets us set enforce_eager on the prefiller in
# a P/D setup and still use CUDA graphs (enabled by this padding) on the
# decoder.
allow_dp_padding
=
self
.
compilation_config
.
cudagraph_mode
!=
CUDAGraphMode
.
NONE
ubatch_slices
,
num_tokens_across_dp
=
coordinate_batch_across_dp
(
ubatch_slices
,
num_tokens_across_dp
=
coordinate_batch_across_dp
(
num_scheduled_tokens
,
num_tokens_unpadded
=
num_tokens_unpadded
,
num_tokens_unpadded
,
parallel_config
=
self
.
parallel_config
,
num_tokens_padded
,
allow_microbatching
=
True
,
self
.
parallel_config
,
allow_dp_padding
=
allow_dp_padding
,
True
,
num_tokens_padded
=
num_tokens_padded
,
uniform_decode
,
uniform_decode
=
uniform_decode
,
num_scheduled_tokens_per_request
=
num_scheduled_tokens
,
)
)
self
.
seq_lens
.
np
[:
num_reqs
]
=
(
self
.
seq_lens
.
np
[:
num_reqs
]
=
(
...
@@ -2436,12 +2444,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -2436,12 +2444,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
use_cascade_attn
,
use_cascade_attn
,
)
=
self
.
_prepare_inputs
(
scheduler_output
)
)
=
self
.
_prepare_inputs
(
scheduler_output
)
dp_rank
=
self
.
parallel_config
.
data_parallel_rank
if
ubatch_slices
:
if
ubatch_slices
:
assert
num_tokens_across_dp
is
not
None
assert
num_tokens_across_dp
is
not
None
num_input_tokens
=
int
(
num_tokens_across_dp
[
0
].
item
())
num_input_tokens
=
int
(
num_tokens_across_dp
[
dp_rank
].
item
())
self
.
pad_out_ubatch_slice
(
ubatch_slices
,
num_input_tokens
)
self
.
pad_out_ubatch_slice
(
ubatch_slices
,
num_input_tokens
)
elif
num_tokens_across_dp
is
not
None
:
elif
num_tokens_across_dp
is
not
None
:
num_input_tokens
=
int
(
num_tokens_across_dp
[
0
].
item
())
num_input_tokens
=
int
(
num_tokens_across_dp
[
dp_rank
].
item
())
else
:
else
:
num_input_tokens
=
self
.
_get_num_input_tokens
(
num_input_tokens
=
self
.
_get_num_input_tokens
(
scheduler_output
.
total_num_scheduled_tokens
scheduler_output
.
total_num_scheduled_tokens
...
@@ -3256,19 +3265,24 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -3256,19 +3265,24 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
num_scheduled_tokens
=
np
.
array
(
num_scheduled_tokens_list
,
dtype
=
np
.
int32
)
num_scheduled_tokens
=
np
.
array
(
num_scheduled_tokens_list
,
dtype
=
np
.
int32
)
total_num_scheduled_tokens
=
int
(
num_scheduled_tokens
.
sum
())
total_num_scheduled_tokens
=
int
(
num_scheduled_tokens
.
sum
())
# Disable DP padding when running eager
allow_dp_padding
=
self
.
compilation_config
.
cudagraph_mode
!=
CUDAGraphMode
.
NONE
# We currently only microbatch if the number of tokens is
# We currently only microbatch if the number of tokens is
# over a certain threshold.
# over a certain threshold.
ubatch_slices
,
num_tokens_across_dp
=
coordinate_batch_across_dp
(
ubatch_slices
,
num_tokens_across_dp
=
coordinate_batch_across_dp
(
num_scheduled_tokens
,
num_tokens_unpadded
=
total_num_scheduled_tokens
,
total_num_scheduled_tokens
,
parallel_config
=
self
.
vllm_config
.
parallel_config
,
total_num_scheduled_tokens
,
allow_microbatching
=
allow_microbatching
,
self
.
vllm_config
.
parallel_config
,
allow_dp_padding
=
allow_dp_padding
,
allow_microbatching
,
num_tokens_padded
=
total_num_scheduled_tokens
,
uniform_decode
,
uniform_decode
=
uniform_decode
,
num_scheduled_tokens_per_request
=
num_scheduled_tokens
,
)
)
num_tokens_after_padding
=
num_tokens
num_tokens_after_padding
=
num_tokens
if
num_tokens_across_dp
is
not
None
:
if
num_tokens_across_dp
is
not
None
:
num_tokens_after_padding
=
int
(
num_tokens_across_dp
[
0
])
dp_rank
=
self
.
parallel_config
.
data_parallel_rank
num_tokens_after_padding
=
int
(
num_tokens_across_dp
[
dp_rank
])
attn_metadata
:
Optional
[
PerLayerAttnMetadata
]
=
None
attn_metadata
:
Optional
[
PerLayerAttnMetadata
]
=
None
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment