[BugFix] Fix use_cudagraph=False (#19612)

Signed-off-by: Richard Zou <zou3519@gmail.com>

[BugFix] Fix use_cudagraph=False (#19612)
Signed-off-by: Richard Zou <zou3519@gmail.com>
ed333497 · Richard Zou · GitHub · d49adea1 · ed333497 · ed333497
Unverified commit ed333497, authored Jun 18, 2025 by Richard Zou; committed by GitHub on Jun 19, 2025.
Showing with 35 additions and 29 deletions

tests/compile/test_config.py +21 -24

vllm/compilation/counter.py +3 -0

vllm/v1/worker/gpu_model_runner.py +11 -5

No files found.
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
-import torch
 import vllm
 from vllm.compilation.counter import compilation_counter
-from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig,
+from vllm.config import VllmConfig
-                         set_current_vllm_config)
-from .piecewise.test_simple import SillyModel
 def test_use_cudagraphs_dynamic(monkeypatch):
@@ -22,23 +18,24 @@ def test_use_cudagraphs_dynamic(monkeypatch):
 @pytest.mark.parametrize("enabled", [True, False])
-def test_use_cudagraphs(enabled):
+def test_use_cudagraphs(vllm_runner, monkeypatch, enabled):
    assert vllm.envs.VLLM_USE_V1
-    vllm_config = VllmConfig(compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
+    # Disable multiprocessing so that the counter is in the same process
-        use_cudagraph=enabled,
+    monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')
-        cudagraph_capture_sizes=[100],
-    ))
+    compilation_config = {
-    with set_current_vllm_config(vllm_config):
+        "cudagraph_capture_sizes": [100],
-        model = SillyModel(vllm_config=vllm_config, prefix='')
+        "use_cudagraph": enabled,
+    }
-    inputs = torch.randn(100, device="cuda")
+    with (
+            compilation_counter.expect(
-    with compilation_counter.expect(
+                num_graphs_seen=1,
-            num_graphs_seen=1,  # one graph for the model
+                num_gpu_runner_capture_triggers=1 if enabled else 0,
-            num_cudagraph_captured=1 if enabled else 0,
+                num_cudagraph_captured=13 if enabled else 0,
-    ):
+            ),
-        # first run is warmup
+            # loading the model causes compilation (if enabled) to happen
-        model(inputs)
+            vllm_runner('facebook/opt-125m',
-        # second run does CUDAGraphs recording (if enabled)
+                        compilation_config=compilation_config,
-        model(inputs)
+                        gpu_memory_utilization=0.4) as _):
+        pass
--- a/vllm/compilation/counter.py
+++ b/vllm/compilation/counter.py
@@ -15,6 +15,9 @@ class CompilationCounter:
    # not including the splitting ops
    num_piecewise_capturable_graphs_seen: int = 0
    num_backend_compilations: int = 0
+    # Number of gpu_model_runner attempts to trigger CUDAGraphs capture
+    num_gpu_runner_capture_triggers: int = 0
+    # Number of CUDAGraphs captured
    num_cudagraph_captured: int = 0
    # InductorAdapter.compile calls
    num_inductor_compiles: int = 0

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -18,6 +18,7 @@ import vllm.envs as envs
 from vllm.attention import AttentionType, get_attn_backend
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
+from vllm.compilation.counter import compilation_counter
 from vllm.config import (CompilationLevel, VllmConfig,
                         get_layers_from_vllm_config)
 from vllm.distributed.kv_transfer import (get_kv_transfer_group,
@@ -200,9 +201,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            block_sizes=[self.cache_config.block_size],
        )
-        self.use_cuda_graph = (self.compilation_config.level
+        self.use_cuda_graph = (
-                               == CompilationLevel.PIECEWISE
+            self.vllm_config.compilation_config.level
-                               and not self.model_config.enforce_eager)
+            == CompilationLevel.PIECEWISE
+            and self.vllm_config.compilation_config.use_cudagraph
+            and not self.model_config.enforce_eager)
        # TODO(woosuk): Provide an option to tune the max cudagraph batch size.
        # The convention is different.
        # self.cudagraph_batch_sizes sorts in ascending order.
@@ -2058,10 +2061,13 @@ class GPUModelRunner(LoRAModelRunnerMixin):
    def capture_model(self) -> None:
        if not self.use_cuda_graph:
            logger.warning(
-                "Skipping CUDA graph capture. Please add "
+                "Skipping CUDA graph capture. To turn on CUDA graph capture, "
-                "-O %s to use CUDA graphs.", CompilationLevel.PIECEWISE)
+                "set -O %s and ensure `use_cudagraph` was not manually set to "
+                "False", CompilationLevel.PIECEWISE)
            return
+        compilation_counter.num_gpu_runner_capture_triggers += 1
        start_time = time.perf_counter()
        start_free_gpu_memory = torch.cuda.mem_get_info()[0]