[Model Runner V2] Minor fix for cudagraph_utils (#29256)

20ee418a · Woosuk Kwon · GitHub · 389aa1b2 · 20ee418a · 20ee418a
Unverified Commit 20ee418a authored Nov 22, 2025 by Woosuk Kwon Committed by GitHub Nov 22, 2025
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 14 deletions

vllm/v1/worker/gpu/cudagraph_utils.py vllm/v1/worker/gpu/cudagraph_utils.py +5 -14

vllm/v1/worker/gpu/model_runner.py vllm/v1/worker/gpu/model_runner.py +1 -0

No files found.
--- a/vllm/v1/worker/gpu/cudagraph_utils.py
+++ b/vllm/v1/worker/gpu/cudagraph_utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import gc
-from contextlib import contextmanager
+from unittest.mock import patch

 import numpy as np
 import torch
@@ -140,6 +139,7 @@ class CudaGraphManager:
            attn_metadata,
            self.vllm_config,
            num_tokens=batch_size,
+            cudagraph_runtime_mode=CUDAGraphMode.NONE,
            num_tokens_across_dp=num_tokens_across_dp,
        ):
            hidden_states = model(
@@ -148,15 +148,16 @@ class CudaGraphManager:
            )
            if self.hidden_states is None:
                self.hidden_states = torch.empty_like(hidden_states)
-        torch.cuda.synchronize()

        # Capture the graph.
        graph = torch.cuda.CUDAGraph()
        with (
+            patch("torch.cuda.empty_cache", lambda: None),
            set_forward_context(
                attn_metadata,
                self.vllm_config,
                num_tokens=batch_size,
+                cudagraph_runtime_mode=CUDAGraphMode.NONE,
                num_tokens_across_dp=num_tokens_across_dp,
            ),
            torch.cuda.graph(graph, self.pool),
@@ -183,7 +184,7 @@ class CudaGraphManager:
        if is_global_first_rank():
            sizes_to_capture = tqdm(sizes_to_capture, desc="Capturing CUDA graphs")

-        with freeze_gc(), graph_capture(device=self.device):
+        with graph_capture(device=self.device):
            for batch_size in sizes_to_capture:
                self.capture_graph(
                    batch_size,
@@ -199,13 +200,3 @@ class CudaGraphManager:
        self.graphs[batch_size].replay()
        assert self.hidden_states is not None
        return self.hidden_states[:batch_size]
-
-
-@contextmanager
-def freeze_gc():
-    gc.collect()
-    gc.freeze()
-    try:
-        yield
-    finally:
-        gc.unfreeze()
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -298,6 +298,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
            return 0

        start_time = time.perf_counter()
+        torch.cuda.empty_cache()
        start_free_gpu_memory = torch.cuda.mem_get_info()[0]

        with self.maybe_setup_dummy_loras(self.lora_config):