Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
43fada53
Unverified
Commit
43fada53
authored
Jan 19, 2026
by
Woosuk Kwon
Committed by
GitHub
Jan 19, 2026
Browse files
[Model Runner V2] Refactor `dummy_run` (#32533)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
4a5299c9
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
34 additions
and
55 deletions
+34
-55
vllm/v1/worker/gpu/cudagraph_utils.py
vllm/v1/worker/gpu/cudagraph_utils.py
+2
-3
vllm/v1/worker/gpu/model_runner.py
vllm/v1/worker/gpu/model_runner.py
+31
-46
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+1
-6
No files found.
vllm/v1/worker/gpu/cudagraph_utils.py
View file @
43fada53
...
@@ -13,7 +13,6 @@ from vllm.config.compilation import CUDAGraphMode
...
@@ -13,7 +13,6 @@ from vllm.config.compilation import CUDAGraphMode
from
vllm.distributed.parallel_state
import
graph_capture
,
is_global_first_rank
from
vllm.distributed.parallel_state
import
graph_capture
,
is_global_first_rank
from
vllm.forward_context
import
set_forward_context
from
vllm.forward_context
import
set_forward_context
from
vllm.v1.attention.backend
import
AttentionMetadataBuilder
from
vllm.v1.attention.backend
import
AttentionMetadataBuilder
from
vllm.v1.core.sched.output
import
SchedulerOutput
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
from
vllm.v1.worker.gpu.attn_utils
import
build_attn_metadata
from
vllm.v1.worker.gpu.attn_utils
import
build_attn_metadata
from
vllm.v1.worker.gpu.block_table
import
BlockTables
from
vllm.v1.worker.gpu.block_table
import
BlockTables
...
@@ -60,12 +59,12 @@ class CudaGraphManager:
...
@@ -60,12 +59,12 @@ class CudaGraphManager:
def
get_cudagraph_size
(
def
get_cudagraph_size
(
self
,
self
,
scheduler_output
:
SchedulerOutput
,
num_tokens_after_padding
:
int
,
num_tokens_after_padding
:
int
,
num_tokens_per_request
:
Iterable
[
int
],
)
->
int
|
None
:
)
->
int
|
None
:
return
get_cudagraph_size
(
return
get_cudagraph_size
(
num_tokens_after_padding
,
num_tokens_after_padding
,
scheduler_output
.
num_scheduled_tokens
.
values
()
,
num_tokens_per_request
,
self
.
cudagraph_sizes
,
self
.
cudagraph_sizes
,
self
.
cudagraph_mode
,
self
.
cudagraph_mode
,
)
)
...
...
vllm/v1/worker/gpu/model_runner.py
View file @
43fada53
...
@@ -2,6 +2,7 @@
...
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
gc
import
gc
import
time
import
time
from
collections.abc
import
Iterable
from
copy
import
deepcopy
from
copy
import
deepcopy
from
typing
import
Any
from
typing
import
Any
...
@@ -288,46 +289,24 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -288,46 +289,24 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
skip_attn
:
bool
=
True
,
skip_attn
:
bool
=
True
,
**
kwargs
,
**
kwargs
,
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
# Create a dummy scheduler output.
num_reqs
=
min
(
num_tokens
,
self
.
max_num_reqs
)
num_reqs
=
min
(
num_tokens
,
self
.
max_num_reqs
)
input_batch
=
InputBatch
.
make_dummy
(
num_tokens_per_request
=
[
num_tokens
//
num_reqs
]
*
num_reqs
num_reqs
=
num_reqs
,
num_tokens_per_request
[
-
1
]
+=
num_tokens
%
num_reqs
num_tokens
=
num_tokens
,
assert
sum
(
num_tokens_per_request
)
==
num_tokens
input_buffers
=
self
.
input_buffers
,
num_scheduled_tokens
=
{
device
=
self
.
device
,
f
"_dummy_req_
{
i
}
"
:
num_tokens_per_request
[
i
]
for
i
in
range
(
num_reqs
)
)
}
if
self
.
uses_mrope
:
dummy_scheduler_output
=
SchedulerOutput
.
make_empty
()
input_batch
.
mrope_positions
=
self
.
mrope_states
.
mrope_positions
[
dummy_scheduler_output
.
total_num_scheduled_tokens
=
num_tokens
:,
:
num_tokens
dummy_scheduler_output
.
num_scheduled_tokens
=
num_scheduled_tokens
]
if
self
.
supports_mm_inputs
:
# Execute the model.
input_batch
.
inputs_embeds
=
self
.
encoder_runner
.
inputs_embeds
[:
num_tokens
]
self
.
execute_model
(
if
not
skip_attn
:
dummy_scheduler_output
,
dummy_run
=
True
,
skip_attn_for_dummy_run
=
skip_attn
self
.
prepare_dummy_attn_metadata
(
input_batch
)
dp_size
=
self
.
parallel_config
.
data_parallel_size
num_tokens_across_dp
=
make_num_tokens_across_dp
(
dp_size
,
num_tokens
)
num_sampled_tokens
=
np
.
ones
(
input_batch
.
num_reqs
,
dtype
=
np
.
int32
)
positions
=
input_batch
.
positions
if
self
.
uses_mrope
:
positions
=
input_batch
.
mrope_positions
with
(
self
.
maybe_dummy_run_with_lora
(
self
.
lora_config
,
input_batch
.
num_scheduled_tokens
,
num_sampled_tokens
,
),
set_forward_context
(
input_batch
.
attn_metadata
,
self
.
vllm_config
,
num_tokens
=
num_tokens
,
num_tokens_across_dp
=
num_tokens_across_dp
,
),
):
hidden_states
=
self
.
model
(
input_ids
=
input_batch
.
input_ids
,
positions
=
positions
,
inputs_embeds
=
input_batch
.
inputs_embeds
,
)
)
assert
self
.
execute_model_state
is
not
None
hidden_states
,
input_batch
=
self
.
execute_model_state
sample_hidden_states
=
hidden_states
[
input_batch
.
logits_indices
]
sample_hidden_states
=
hidden_states
[
input_batch
.
logits_indices
]
return
hidden_states
,
sample_hidden_states
return
hidden_states
,
sample_hidden_states
...
@@ -893,9 +872,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -893,9 +872,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
def
get_cudagraph_and_dp_padding
(
def
get_cudagraph_and_dp_padding
(
self
,
self
,
scheduler_output
:
SchedulerOutput
,
total_num_scheduled_tokens
:
int
,
num_tokens_per_request
:
Iterable
[
int
],
)
->
tuple
[
CUDAGraphMode
,
int
,
torch
.
Tensor
|
None
]:
)
->
tuple
[
CUDAGraphMode
,
int
,
torch
.
Tensor
|
None
]:
total_num_scheduled_tokens
=
scheduler_output
.
total_num_scheduled_tokens
dp_size
=
self
.
parallel_config
.
data_parallel_size
dp_size
=
self
.
parallel_config
.
data_parallel_size
if
dp_size
==
1
:
if
dp_size
==
1
:
# No DP. Only consider CUDA graphs.
# No DP. Only consider CUDA graphs.
...
@@ -904,7 +883,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -904,7 +883,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
return
CUDAGraphMode
.
NONE
,
0
,
None
return
CUDAGraphMode
.
NONE
,
0
,
None
cudagraph_size
=
self
.
cudagraph_manager
.
get_cudagraph_size
(
cudagraph_size
=
self
.
cudagraph_manager
.
get_cudagraph_size
(
scheduler_output
,
total_num_scheduled_tokens
total_num_scheduled_tokens
,
num_tokens_per_request
)
)
if
cudagraph_size
is
not
None
:
if
cudagraph_size
is
not
None
:
# Use full CUDA graph.
# Use full CUDA graph.
...
@@ -919,7 +898,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -919,7 +898,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
cudagraph_size_before_dp
:
int
|
None
=
0
cudagraph_size_before_dp
:
int
|
None
=
0
else
:
else
:
cudagraph_size_before_dp
=
self
.
cudagraph_manager
.
get_cudagraph_size
(
cudagraph_size_before_dp
=
self
.
cudagraph_manager
.
get_cudagraph_size
(
scheduler_output
,
total_num_scheduled_tokens
total_num_scheduled_tokens
,
num_tokens_per_request
)
)
if
cudagraph_size_before_dp
is
None
:
if
cudagraph_size_before_dp
is
None
:
cudagraph_size_before_dp
=
-
1
cudagraph_size_before_dp
=
-
1
...
@@ -951,6 +930,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -951,6 +930,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
scheduler_output
:
SchedulerOutput
,
scheduler_output
:
SchedulerOutput
,
intermediate_tensors
:
Any
|
None
=
None
,
intermediate_tensors
:
Any
|
None
=
None
,
dummy_run
:
bool
=
False
,
dummy_run
:
bool
=
False
,
skip_attn_for_dummy_run
:
bool
=
False
,
)
->
ModelRunnerOutput
|
None
:
)
->
ModelRunnerOutput
|
None
:
assert
intermediate_tensors
is
None
assert
intermediate_tensors
is
None
if
not
dummy_run
:
if
not
dummy_run
:
...
@@ -965,7 +945,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -965,7 +945,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
return
EMPTY_MODEL_RUNNER_OUTPUT
return
EMPTY_MODEL_RUNNER_OUTPUT
cudagraph_mode
,
num_tokens_after_padding
,
num_tokens_across_dp
=
(
cudagraph_mode
,
num_tokens_after_padding
,
num_tokens_across_dp
=
(
self
.
get_cudagraph_and_dp_padding
(
scheduler_output
)
self
.
get_cudagraph_and_dp_padding
(
scheduler_output
.
total_num_scheduled_tokens
,
scheduler_output
.
num_scheduled_tokens
.
values
(),
)
)
)
if
num_tokens_after_padding
==
0
:
if
num_tokens_after_padding
==
0
:
# All DP ranks have zero tokens to run.
# All DP ranks have zero tokens to run.
...
@@ -999,7 +982,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -999,7 +982,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
:
input_batch
.
num_tokens_after_padding
:
input_batch
.
num_tokens_after_padding
]
]
else
:
else
:
# No actual tokens to run. A dummy run for DP.
# No actual tokens to run. A dummy run for DP or memory profiling.
num_reqs
=
min
(
num_tokens_after_padding
,
self
.
max_num_reqs
)
num_reqs
=
min
(
num_tokens_after_padding
,
self
.
max_num_reqs
)
input_batch
=
InputBatch
.
make_dummy
(
input_batch
=
InputBatch
.
make_dummy
(
num_reqs
=
num_reqs
,
num_reqs
=
num_reqs
,
...
@@ -1011,7 +994,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
...
@@ -1011,7 +994,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
input_batch
.
mrope_positions
=
self
.
mrope_states
.
mrope_positions
[
input_batch
.
mrope_positions
=
self
.
mrope_states
.
mrope_positions
[
:,
:
num_tokens_after_padding
:,
:
num_tokens_after_padding
]
]
if
not
skip_attn_for_dummy_run
:
self
.
prepare_dummy_attn_metadata
(
input_batch
)
self
.
prepare_dummy_attn_metadata
(
input_batch
)
# FIXME(woosuk): Fix warmup for LoRA.
# Run model.
# Run model.
if
cudagraph_mode
==
CUDAGraphMode
.
FULL
:
if
cudagraph_mode
==
CUDAGraphMode
.
FULL
:
...
...
vllm/v1/worker/gpu_worker.py
View file @
43fada53
...
@@ -662,11 +662,6 @@ class Worker(WorkerBase):
...
@@ -662,11 +662,6 @@ class Worker(WorkerBase):
self
.
profiler
.
stop
()
self
.
profiler
.
stop
()
def
execute_dummy_batch
(
self
)
->
None
:
def
execute_dummy_batch
(
self
)
->
None
:
if
self
.
use_v2_model_runner
:
self
.
model_runner
.
execute_model
(
SchedulerOutput
.
make_empty
(),
dummy_run
=
True
)
else
:
self
.
model_runner
.
_dummy_run
(
1
,
uniform_decode
=
True
)
self
.
model_runner
.
_dummy_run
(
1
,
uniform_decode
=
True
)
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
bool
:
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
bool
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment