Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
20ee418a
Unverified
Commit
20ee418a
authored
Nov 22, 2025
by
Woosuk Kwon
Committed by
GitHub
Nov 22, 2025
Browse files
[Model Runner V2] Minor fix for cudagraph_utils (#29256)
parent
389aa1b2
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
6 additions
and
14 deletions
+6
-14
vllm/v1/worker/gpu/cudagraph_utils.py
vllm/v1/worker/gpu/cudagraph_utils.py
+5
-14
vllm/v1/worker/gpu/model_runner.py
vllm/v1/worker/gpu/model_runner.py
+1
-0
No files found.
vllm/v1/worker/gpu/cudagraph_utils.py
View file @
20ee418a
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
gc
from
contextlib
import
contextmanager
from
unittest.mock
import
patch
import
numpy
as
np
import
torch
...
...
@@ -140,6 +139,7 @@ class CudaGraphManager:
attn_metadata
,
self
.
vllm_config
,
num_tokens
=
batch_size
,
cudagraph_runtime_mode
=
CUDAGraphMode
.
NONE
,
num_tokens_across_dp
=
num_tokens_across_dp
,
):
hidden_states
=
model
(
...
...
@@ -148,15 +148,16 @@ class CudaGraphManager:
)
if
self
.
hidden_states
is
None
:
self
.
hidden_states
=
torch
.
empty_like
(
hidden_states
)
torch
.
cuda
.
synchronize
()
# Capture the graph.
graph
=
torch
.
cuda
.
CUDAGraph
()
with
(
patch
(
"torch.cuda.empty_cache"
,
lambda
:
None
),
set_forward_context
(
attn_metadata
,
self
.
vllm_config
,
num_tokens
=
batch_size
,
cudagraph_runtime_mode
=
CUDAGraphMode
.
NONE
,
num_tokens_across_dp
=
num_tokens_across_dp
,
),
torch
.
cuda
.
graph
(
graph
,
self
.
pool
),
...
...
@@ -183,7 +184,7 @@ class CudaGraphManager:
if
is_global_first_rank
():
sizes_to_capture
=
tqdm
(
sizes_to_capture
,
desc
=
"Capturing CUDA graphs"
)
with
freeze_gc
(),
graph_capture
(
device
=
self
.
device
):
with
graph_capture
(
device
=
self
.
device
):
for
batch_size
in
sizes_to_capture
:
self
.
capture_graph
(
batch_size
,
...
...
@@ -199,13 +200,3 @@ class CudaGraphManager:
self
.
graphs
[
batch_size
].
replay
()
assert
self
.
hidden_states
is
not
None
return
self
.
hidden_states
[:
batch_size
]
@
contextmanager
def
freeze_gc
():
gc
.
collect
()
gc
.
freeze
()
try
:
yield
finally
:
gc
.
unfreeze
()
vllm/v1/worker/gpu/model_runner.py
View file @
20ee418a
...
...
@@ -298,6 +298,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
return
0
start_time
=
time
.
perf_counter
()
torch
.
cuda
.
empty_cache
()
start_free_gpu_memory
=
torch
.
cuda
.
mem_get_info
()[
0
]
with
self
.
maybe_setup_dummy_loras
(
self
.
lora_config
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment