[Perf] Change default CUDAGraphMode from PIECEWISE to FULL_AND_PIECEWISE (#25444)

Signed-off-by: mgoin <mgoin64@gmail.com>

[Perf] Change default CUDAGraphMode from PIECEWISE to FULL_AND_PIECEWISE (#25444)
Signed-off-by: mgoin <mgoin64@gmail.com>
24fab45d · Michael Goin · GitHub · 63400259 · 24fab45d · 24fab45d
Unverified Commit 24fab45d authored Sep 23, 2025 by Michael Goin Committed by GitHub Sep 23, 2025
Showing with 32 additions and 7 deletions

vllm/config/__init__.py vllm/config/__init__.py +8 -1

vllm/config/compilation.py vllm/config/compilation.py +3 -4

vllm/v1/worker/gpu_model_runner.py vllm/v1/worker/gpu_model_runner.py +21 -2

No files found.
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -509,8 +509,15 @@ class VllmConfig:
            if self.compilation_config.cudagraph_mode is None:
                if envs.VLLM_USE_V1 and self.compilation_config.level \
                    == CompilationLevel.PIECEWISE:
+                    # default to full and piecewise for most models
                    self.compilation_config.cudagraph_mode = \
-                        CUDAGraphMode.PIECEWISE
+                        CUDAGraphMode.FULL_AND_PIECEWISE
+
+                    # pooling model does not support full cudagraphs
+                    if self.model_config is not None and \
+                        self.model_config.pooler_config is not None:
+                        self.compilation_config.cudagraph_mode = \
+                            CUDAGraphMode.PIECEWISE
                else:
                    self.compilation_config.cudagraph_mode = CUDAGraphMode.NONE


--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -228,15 +228,14 @@ class CompilationConfig:
    The mode of the cudagraph:

    - NONE, no cudagraph capture.
-    - PIECEWISE. (v1 default)
+    - PIECEWISE.
    - FULL.
    - FULL_DECODE_ONLY.
-    - FULL_AND_PIECEWISE.
+    - FULL_AND_PIECEWISE. (v1 default)

    PIECEWISE mode build piecewise cudagraph only, keeping the cudagraph
    incompatible ops (i.e. some attention ops) outside the cudagraph
    for general flexibility.
-    This is the default mode.

    FULL mode: Capture full cudagraph for all batches. Can be good for small
    models or workloads with small prompts; not supported by many backends.
@@ -249,7 +248,7 @@ class CompilationConfig:
    
    FULL_AND_PIECEWISE mode: Capture full cudagraph for decode batches and
    piecewise cudagraph for prefill and mixed prefill-decode batches.
-    This is like the most performant mode for most models.
+    This is the most performant mode for most models and is the default.

    Currently, the cudagraph mode is only used for the v1 engine.
    Note that the cudagraph logic is generally orthogonal to the 

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2947,8 +2947,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                # TODO(luka) better system for describing dummy batches
                seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1]
            else:
-                # Make sure max_model_len is used at the graph capture time.
-                seq_lens = self.max_model_len
+                seq_lens = max_query_len
            self.seq_lens.np[:num_reqs] = seq_lens
            self.seq_lens.np[num_reqs:] = 0
            self.seq_lens.copy_to_gpu()
@@ -3541,6 +3540,26 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                    CUDAGraphMode.FULL_DECODE_ONLY
            logger.warning(msg)

+        # check that if we are doing decode full-cudagraphs it is supported
+        if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
+                and min_cg_support == AttentionCGSupport.NEVER):
+            msg = (f"CUDAGraphMode.{cudagraph_mode.name} is not supported "
+                   f"with {min_cg_builder_name} backend (support: "
+                   f"{min_cg_support})")
+            if (self.compilation_config.level == CompilationLevel.PIECEWISE and
+                (self.compilation_config.splitting_ops_contain_attention()
+                 or self.compilation_config.use_inductor_graph_partition)):
+                msg += "; setting cudagraph_mode=PIECEWISE because "\
+                    "attention is compiled piecewise"
+                cudagraph_mode = self.compilation_config.cudagraph_mode = \
+                    CUDAGraphMode.PIECEWISE
+            else:
+                msg += "; setting cudagraph_mode=NONE because "\
+                    "attention is not compiled piecewise"
+                cudagraph_mode = self.compilation_config.cudagraph_mode = \
+                    CUDAGraphMode.NONE
+            logger.warning(msg)
+
        # check that if we are doing spec-decode + decode full-cudagraphs it is
        # supported
        if (cudagraph_mode.decode_mode() == CUDAGraphMode.FULL