[Perf] Change default CUDAGraphMode from FULL_AND_PIECEWISE to PIECEWISE

d3fa2342 · zhuwenwen · 55989b60
Commit d3fa2342 authored Jan 06, 2026 by zhuwenwen
Showing with 58 additions and 50 deletions

vllm/config/compilation.py +1 -1

vllm/config/vllm.py +24 -21

vllm/v1/worker/gpu_model_runner.py +33 -28

No files found.
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -78,7 +78,7 @@ class CUDAGraphMode(enum.Enum):
        return self.has_mode(CUDAGraphMode.PIECEWISE)

    def max_cudagraph_mode(self) -> "CUDAGraphMode":
-        return CUDAGraphMode(max(self.value) if not envs.VLLM_USE_PIECEWISE else min(self.value)) if self.separate_routine() else self
+        return CUDAGraphMode(max(self.value)) if self.separate_routine() else self

    def has_full_cudagraphs(self) -> bool:
        return self.max_cudagraph_mode() == CUDAGraphMode.FULL

--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -694,6 +694,7 @@ class VllmConfig:
        if current_platform.support_static_graph_mode():
            # if cudagraph_mode has full cudagraphs, we need to check support
            if model_config := self.model_config:
+                if not envs.VLLM_USE_PIECEWISE:
                    if (
                        self.compilation_config.cudagraph_mode.has_full_cudagraphs()
                        and model_config.pooler_config is not None
@@ -716,6 +717,8 @@ class VllmConfig:
                        self.compilation_config.cudagraph_mode = (
                            CUDAGraphMode.FULL_DECODE_ONLY
                        )
+                else:
+                    self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE

            # disable cudagraph when enforce eager execution
            if self.model_config is not None and self.model_config.enforce_eager:

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -4109,7 +4109,11 @@ class GPUModelRunner(
                # TODO(luka) better system for describing dummy batches
                seq_lens = [1] * num_decode_tokens + [num_prefill_tokens + 1]
            else:
-                seq_lens = max_query_len  # type: ignore[assignment]
+                if not envs.VLLM_USE_PIECEWISE:
+                    seq_lens = max_query_len
+                else:
+                    # Make sure max_model_len is used at the graph capture time.
+                    seq_lens = self.max_model_len
            self.seq_lens.np[:num_reqs] = seq_lens
            self.seq_lens.np[num_reqs:] = 0
            self.seq_lens.copy_to_gpu()
@@ -4825,6 +4829,7 @@ class GPUModelRunner(
            logger.warning(msg)

        # check that if we are doing decode full-cudagraphs it is supported
+        if not envs.VLLM_USE_PIECEWISE:
            if (
                cudagraph_mode.decode_mode() == CUDAGraphMode.FULL
                and min_cg_support == AttentionCGSupport.NEVER