Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7b7cdce9
Unverified
Commit
7b7cdce9
authored
Jan 19, 2026
by
Woosuk Kwon
Committed by
GitHub
Jan 19, 2026
Browse files
[Model Runner V2] Refactor get_cudagraph_and_dp_padding (#32625)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
12dab78f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
54 additions
and
67 deletions
+54
-67
vllm/v1/worker/gpu/dp_utils.py
vllm/v1/worker/gpu/dp_utils.py
+40
-5
vllm/v1/worker/gpu/model_runner.py
vllm/v1/worker/gpu/model_runner.py
+14
-62
No files found.
vllm/v1/worker/gpu/dp_utils.py
View file @
7b7cdce9
...
@@ -6,6 +6,12 @@ import torch.distributed as dist
...
@@ -6,6 +6,12 @@ import torch.distributed as dist
from
vllm.distributed.parallel_state
import
get_dp_group
from
vllm.distributed.parallel_state
import
get_dp_group
def
make_num_tokens_across_dp
(
dp_size
:
int
,
num_tokens
:
int
)
->
torch
.
Tensor
|
None
:
if
dp_size
==
1
:
return
None
return
torch
.
full
((
dp_size
,),
num_tokens
,
dtype
=
torch
.
int32
,
device
=
"cpu"
)
def
get_batch_metadata_across_dp
(
def
get_batch_metadata_across_dp
(
num_tokens
:
int
,
num_tokens
:
int
,
cudagraph_size
:
int
,
cudagraph_size
:
int
,
...
@@ -22,10 +28,39 @@ def get_batch_metadata_across_dp(
...
@@ -22,10 +28,39 @@ def get_batch_metadata_across_dp(
return
tensor
[
0
],
tensor
[
1
]
return
tensor
[
0
],
tensor
[
1
]
def
make_num_tokens_across_dp
(
def
get_cudagraph_and_dp_padding
(
dp_size
:
int
,
num_tokens
:
int
,
num_tokens
:
int
,
)
->
torch
.
Tensor
|
None
:
cudagraph_size
:
int
|
None
,
dp_size
:
int
,
dp_rank
:
int
,
)
->
tuple
[
bool
,
int
,
torch
.
Tensor
|
None
]:
if
dp_size
==
1
:
if
dp_size
==
1
:
return
None
if
cudagraph_size
is
not
None
:
return
torch
.
full
((
dp_size
,),
num_tokens
,
dtype
=
torch
.
int32
,
device
=
"cpu"
)
return
True
,
cudagraph_size
,
None
else
:
return
False
,
num_tokens
,
None
if
num_tokens
==
0
:
cudagraph_size
=
0
elif
cudagraph_size
is
None
:
cudagraph_size
=
-
1
num_tokens_across_dp
,
cudagraph_size_across_dp
=
get_batch_metadata_across_dp
(
num_tokens
,
cudagraph_size
,
dp_size
,
dp_rank
)
if
torch
.
all
(
num_tokens_across_dp
==
0
).
item
():
# All ranks have zero tokens to run.
return
False
,
0
,
None
if
torch
.
all
(
cudagraph_size_across_dp
!=
-
1
).
item
():
# All ranks use CUDA graph or have zero tokens.
# Use CUDA graph for all ranks.
# Pad all ranks to the maximum CUDA graph size.
max_cudagraph_size
=
int
(
cudagraph_size_across_dp
.
max
().
item
())
num_tokens_across_dp
[:]
=
max_cudagraph_size
return
True
,
max_cudagraph_size
,
num_tokens_across_dp
else
:
# Some ranks do not use CUDA graph. Use eager mode for all ranks.
# No padding is needed except for ranks that have no tokens to run.
num_tokens_across_dp
=
torch
.
clamp
(
num_tokens_across_dp
,
min
=
1
)
num_tokens_after_padding
=
int
(
num_tokens_across_dp
[
dp_rank
].
item
())
return
False
,
num_tokens_after_padding
,
num_tokens_across_dp
vllm/v1/worker/gpu/model_runner.py
View file @
7b7cdce9
...
@@ -2,7 +2,6 @@
...
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
gc
import
gc
import
time
import
time
from
collections.abc
import
Iterable
from
copy
import
deepcopy
from
copy
import
deepcopy
from
typing
import
Any
from
typing
import
Any
...
@@ -37,7 +36,7 @@ from vllm.v1.worker.gpu.block_table import BlockTables
...
@@ -37,7 +36,7 @@ from vllm.v1.worker.gpu.block_table import BlockTables
from
vllm.v1.worker.gpu.buffer_utils
import
UvaBufferPool
from
vllm.v1.worker.gpu.buffer_utils
import
UvaBufferPool
from
vllm.v1.worker.gpu.cudagraph_utils
import
CudaGraphManager
from
vllm.v1.worker.gpu.cudagraph_utils
import
CudaGraphManager
from
vllm.v1.worker.gpu.dp_utils
import
(
from
vllm.v1.worker.gpu.dp_utils
import
(
get_
batch_metadata_across_dp
,
get_
cudagraph_and_dp_padding
,
make_num_tokens_across_dp
,
make_num_tokens_across_dp
,
)
)
from
vllm.v1.worker.gpu.input_batch
import
(
from
vllm.v1.worker.gpu.input_batch
import
(
...
@@ -877,60 +876,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -877,60 +876,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
)
)
return
draft_tokens
return
draft_tokens
def
get_cudagraph_and_dp_padding
(
self
,
total_num_scheduled_tokens
:
int
,
num_tokens_per_request
:
Iterable
[
int
],
)
->
tuple
[
CUDAGraphMode
,
int
,
torch
.
Tensor
|
None
]:
dp_size
=
self
.
parallel_config
.
data_parallel_size
if
dp_size
==
1
:
# No DP. Only consider CUDA graphs.
if
total_num_scheduled_tokens
==
0
:
# Special case: no tokens to run.
return
CUDAGraphMode
.
NONE
,
0
,
None
cudagraph_size
=
self
.
cudagraph_manager
.
get_cudagraph_size
(
total_num_scheduled_tokens
,
num_tokens_per_request
)
if
cudagraph_size
is
not
None
:
# Use full CUDA graph.
return
CUDAGraphMode
.
FULL
,
cudagraph_size
,
None
# Fall back to eager mode.
# TODO(woosuk): Support piecewise CUDA graphs.
return
CUDAGraphMode
.
NONE
,
total_num_scheduled_tokens
,
None
# Consider DP padding and CUDA graph.
if
total_num_scheduled_tokens
==
0
:
# Special handling is needed for 0.
cudagraph_size_before_dp
:
int
|
None
=
0
else
:
cudagraph_size_before_dp
=
self
.
cudagraph_manager
.
get_cudagraph_size
(
total_num_scheduled_tokens
,
num_tokens_per_request
)
if
cudagraph_size_before_dp
is
None
:
cudagraph_size_before_dp
=
-
1
assert
cudagraph_size_before_dp
is
not
None
dp_rank
=
self
.
parallel_config
.
data_parallel_rank
num_tokens_across_dp
,
cudagraph_size_across_dp
=
get_batch_metadata_across_dp
(
total_num_scheduled_tokens
,
cudagraph_size_before_dp
,
dp_size
,
dp_rank
,
)
if
all
(
cudagraph_size_across_dp
>=
0
):
# If all ranks can use CUDA graph, pad to the maximum number of tokens
# across DP and use CUDA graph.
num_tokens_after_padding
=
int
(
cudagraph_size_across_dp
.
max
().
item
())
cudagraph_mode
=
CUDAGraphMode
.
FULL
else
:
# If any of the ranks cannot use CUDA graph, use eager mode for all ranks.
# No padding is needed except for ranks that have no tokens to run.
num_tokens_across_dp
=
torch
.
clamp
(
num_tokens_across_dp
,
min
=
1
)
num_tokens_after_padding
=
num_tokens_across_dp
[
dp_rank
]
cudagraph_mode
=
CUDAGraphMode
.
NONE
return
cudagraph_mode
,
num_tokens_after_padding
,
num_tokens_across_dp
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
execute_model
(
def
execute_model
(
self
,
self
,
...
@@ -951,10 +896,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -951,10 +896,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# No need to run the model.
# No need to run the model.
return
EMPTY_MODEL_RUNNER_OUTPUT
return
EMPTY_MODEL_RUNNER_OUTPUT
cudagraph_mode
,
num_tokens_after_padding
,
num_tokens_across_dp
=
(
# Get the CUDA graph size. None means no CUDA graph is used.
self
.
get_cudagraph_and_dp_padding
(
cudagraph_size
=
self
.
cudagraph_manager
.
get_cudagraph_size
(
scheduler_output
.
total_num_scheduled_tokens
,
scheduler_output
.
num_scheduled_tokens
.
values
(),
)
use_cudagraph
,
num_tokens_after_padding
,
num_tokens_across_dp
=
(
get_cudagraph_and_dp_padding
(
scheduler_output
.
total_num_scheduled_tokens
,
scheduler_output
.
total_num_scheduled_tokens
,
scheduler_output
.
num_scheduled_tokens
.
values
(),
cudagraph_size
,
self
.
parallel_config
.
data_parallel_size
,
self
.
parallel_config
.
data_parallel_rank
,
)
)
)
)
if
num_tokens_after_padding
==
0
:
if
num_tokens_after_padding
==
0
:
...
@@ -1006,7 +958,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -1006,7 +958,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# FIXME(woosuk): Fix warmup for LoRA.
# FIXME(woosuk): Fix warmup for LoRA.
# Run model.
# Run model.
if
cudagraph
_mode
==
CUDAGraphMode
.
FULL
:
if
use_
cudagraph
:
# Run CUDA graph.
# Run CUDA graph.
# NOTE(woosuk): Here, we don't need to pass the input tensors,
# NOTE(woosuk): Here, we don't need to pass the input tensors,
# because they are already copied to the CUDA graph input buffers.
# because they are already copied to the CUDA graph input buffers.
...
@@ -1015,7 +967,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -1015,7 +967,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
)
)
else
:
else
:
# Run PyTorch model in eager mode.
# Run PyTorch model in eager mode.
# TODO(woosuk): Support piecewise CUDA graph.
positions
=
input_batch
.
positions
positions
=
input_batch
.
positions
if
self
.
uses_mrope
:
if
self
.
uses_mrope
:
assert
input_batch
.
mrope_positions
is
not
None
assert
input_batch
.
mrope_positions
is
not
None
...
@@ -1024,7 +975,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -1024,7 +975,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
input_batch
.
attn_metadata
,
input_batch
.
attn_metadata
,
self
.
vllm_config
,
self
.
vllm_config
,
num_tokens
=
input_batch
.
num_tokens_after_padding
,
num_tokens
=
input_batch
.
num_tokens_after_padding
,
cudagraph_runtime_mode
=
cudagraph_mode
,
# TODO(woosuk): Support piecewise CUDA graph.
cudagraph_runtime_mode
=
CUDAGraphMode
.
NONE
,
num_tokens_across_dp
=
num_tokens_across_dp
,
num_tokens_across_dp
=
num_tokens_across_dp
,
):
):
hidden_states
=
self
.
model
(
hidden_states
=
self
.
model
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment