Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
43fada53
Unverified
Commit
43fada53
authored
Jan 19, 2026
by
Woosuk Kwon
Committed by
GitHub
Jan 19, 2026
Browse files
[Model Runner V2] Refactor `dummy_run` (#32533)
Signed-off-by:
Woosuk Kwon
<
woosuk.kwon@berkeley.edu
>
parent
4a5299c9
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
34 additions
and
55 deletions
+34
-55
vllm/v1/worker/gpu/cudagraph_utils.py
vllm/v1/worker/gpu/cudagraph_utils.py
+2
-3
vllm/v1/worker/gpu/model_runner.py
vllm/v1/worker/gpu/model_runner.py
+31
-46
vllm/v1/worker/gpu_worker.py
vllm/v1/worker/gpu_worker.py
+1
-6
No files found.
vllm/v1/worker/gpu/cudagraph_utils.py
View file @
43fada53
...
...
@@ -13,7 +13,6 @@ from vllm.config.compilation import CUDAGraphMode
from
vllm.distributed.parallel_state
import
graph_capture
,
is_global_first_rank
from
vllm.forward_context
import
set_forward_context
from
vllm.v1.attention.backend
import
AttentionMetadataBuilder
from
vllm.v1.core.sched.output
import
SchedulerOutput
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
from
vllm.v1.worker.gpu.attn_utils
import
build_attn_metadata
from
vllm.v1.worker.gpu.block_table
import
BlockTables
...
...
@@ -60,12 +59,12 @@ class CudaGraphManager:
def
get_cudagraph_size
(
self
,
scheduler_output
:
SchedulerOutput
,
num_tokens_after_padding
:
int
,
num_tokens_per_request
:
Iterable
[
int
],
)
->
int
|
None
:
return
get_cudagraph_size
(
num_tokens_after_padding
,
scheduler_output
.
num_scheduled_tokens
.
values
()
,
num_tokens_per_request
,
self
.
cudagraph_sizes
,
self
.
cudagraph_mode
,
)
...
...
vllm/v1/worker/gpu/model_runner.py
View file @
43fada53
...
...
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
gc
import
time
from
collections.abc
import
Iterable
from
copy
import
deepcopy
from
typing
import
Any
...
...
@@ -288,47 +289,25 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
skip_attn
:
bool
=
True
,
**
kwargs
,
)
->
tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
# Create a dummy scheduler output.
num_reqs
=
min
(
num_tokens
,
self
.
max_num_reqs
)
input_batch
=
InputBatch
.
make_dummy
(
num_reqs
=
num_reqs
,
num_tokens
=
num_tokens
,
input_buffers
=
self
.
input_buffers
,
device
=
self
.
device
,
num_tokens_per_request
=
[
num_tokens
//
num_reqs
]
*
num_reqs
num_tokens_per_request
[
-
1
]
+=
num_tokens
%
num_reqs
assert
sum
(
num_tokens_per_request
)
==
num_tokens
num_scheduled_tokens
=
{
f
"_dummy_req_
{
i
}
"
:
num_tokens_per_request
[
i
]
for
i
in
range
(
num_reqs
)
}
dummy_scheduler_output
=
SchedulerOutput
.
make_empty
()
dummy_scheduler_output
.
total_num_scheduled_tokens
=
num_tokens
dummy_scheduler_output
.
num_scheduled_tokens
=
num_scheduled_tokens
# Execute the model.
self
.
execute_model
(
dummy_scheduler_output
,
dummy_run
=
True
,
skip_attn_for_dummy_run
=
skip_attn
)
if
self
.
uses_mrope
:
input_batch
.
mrope_positions
=
self
.
mrope_states
.
mrope_positions
[
:,
:
num_tokens
]
if
self
.
supports_mm_inputs
:
input_batch
.
inputs_embeds
=
self
.
encoder_runner
.
inputs_embeds
[:
num_tokens
]
if
not
skip_attn
:
self
.
prepare_dummy_attn_metadata
(
input_batch
)
dp_size
=
self
.
parallel_config
.
data_parallel_size
num_tokens_across_dp
=
make_num_tokens_across_dp
(
dp_size
,
num_tokens
)
num_sampled_tokens
=
np
.
ones
(
input_batch
.
num_reqs
,
dtype
=
np
.
int32
)
positions
=
input_batch
.
positions
if
self
.
uses_mrope
:
positions
=
input_batch
.
mrope_positions
with
(
self
.
maybe_dummy_run_with_lora
(
self
.
lora_config
,
input_batch
.
num_scheduled_tokens
,
num_sampled_tokens
,
),
set_forward_context
(
input_batch
.
attn_metadata
,
self
.
vllm_config
,
num_tokens
=
num_tokens
,
num_tokens_across_dp
=
num_tokens_across_dp
,
),
):
hidden_states
=
self
.
model
(
input_ids
=
input_batch
.
input_ids
,
positions
=
positions
,
inputs_embeds
=
input_batch
.
inputs_embeds
,
)
sample_hidden_states
=
hidden_states
[
input_batch
.
logits_indices
]
assert
self
.
execute_model_state
is
not
None
hidden_states
,
input_batch
=
self
.
execute_model_state
sample_hidden_states
=
hidden_states
[
input_batch
.
logits_indices
]
return
hidden_states
,
sample_hidden_states
@
torch
.
inference_mode
()
...
...
@@ -893,9 +872,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
def
get_cudagraph_and_dp_padding
(
self
,
scheduler_output
:
SchedulerOutput
,
total_num_scheduled_tokens
:
int
,
num_tokens_per_request
:
Iterable
[
int
],
)
->
tuple
[
CUDAGraphMode
,
int
,
torch
.
Tensor
|
None
]:
total_num_scheduled_tokens
=
scheduler_output
.
total_num_scheduled_tokens
dp_size
=
self
.
parallel_config
.
data_parallel_size
if
dp_size
==
1
:
# No DP. Only consider CUDA graphs.
...
...
@@ -904,7 +883,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
return
CUDAGraphMode
.
NONE
,
0
,
None
cudagraph_size
=
self
.
cudagraph_manager
.
get_cudagraph_size
(
scheduler_output
,
total_num_scheduled_tokens
total_num_scheduled_tokens
,
num_tokens_per_request
)
if
cudagraph_size
is
not
None
:
# Use full CUDA graph.
...
...
@@ -919,7 +898,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
cudagraph_size_before_dp
:
int
|
None
=
0
else
:
cudagraph_size_before_dp
=
self
.
cudagraph_manager
.
get_cudagraph_size
(
scheduler_output
,
total_num_scheduled_tokens
total_num_scheduled_tokens
,
num_tokens_per_request
)
if
cudagraph_size_before_dp
is
None
:
cudagraph_size_before_dp
=
-
1
...
...
@@ -951,6 +930,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
scheduler_output
:
SchedulerOutput
,
intermediate_tensors
:
Any
|
None
=
None
,
dummy_run
:
bool
=
False
,
skip_attn_for_dummy_run
:
bool
=
False
,
)
->
ModelRunnerOutput
|
None
:
assert
intermediate_tensors
is
None
if
not
dummy_run
:
...
...
@@ -965,7 +945,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
return
EMPTY_MODEL_RUNNER_OUTPUT
cudagraph_mode
,
num_tokens_after_padding
,
num_tokens_across_dp
=
(
self
.
get_cudagraph_and_dp_padding
(
scheduler_output
)
self
.
get_cudagraph_and_dp_padding
(
scheduler_output
.
total_num_scheduled_tokens
,
scheduler_output
.
num_scheduled_tokens
.
values
(),
)
)
if
num_tokens_after_padding
==
0
:
# All DP ranks have zero tokens to run.
...
...
@@ -999,7 +982,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
:
input_batch
.
num_tokens_after_padding
]
else
:
# No actual tokens to run. A dummy run for DP.
# No actual tokens to run. A dummy run for DP or memory profiling.
num_reqs
=
min
(
num_tokens_after_padding
,
self
.
max_num_reqs
)
input_batch
=
InputBatch
.
make_dummy
(
num_reqs
=
num_reqs
,
...
...
@@ -1011,7 +994,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
input_batch
.
mrope_positions
=
self
.
mrope_states
.
mrope_positions
[
:,
:
num_tokens_after_padding
]
self
.
prepare_dummy_attn_metadata
(
input_batch
)
if
not
skip_attn_for_dummy_run
:
self
.
prepare_dummy_attn_metadata
(
input_batch
)
# FIXME(woosuk): Fix warmup for LoRA.
# Run model.
if
cudagraph_mode
==
CUDAGraphMode
.
FULL
:
...
...
vllm/v1/worker/gpu_worker.py
View file @
43fada53
...
...
@@ -662,12 +662,7 @@ class Worker(WorkerBase):
self
.
profiler
.
stop
()
def
execute_dummy_batch
(
self
)
->
None
:
if
self
.
use_v2_model_runner
:
self
.
model_runner
.
execute_model
(
SchedulerOutput
.
make_empty
(),
dummy_run
=
True
)
else
:
self
.
model_runner
.
_dummy_run
(
1
,
uniform_decode
=
True
)
self
.
model_runner
.
_dummy_run
(
1
,
uniform_decode
=
True
)
def
add_lora
(
self
,
lora_request
:
LoRARequest
)
->
bool
:
return
self
.
model_runner
.
add_lora
(
lora_request
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment