Update FlashAttention (FA) full_cuda_graph support

513f17a4 · zhuwenwen · cc6f327a · 513f17a4 · 513f17a4 · 513f17a4
Commit 513f17a4 authored Aug 08, 2025 by zhuwenwen
Showing 3 changed files with 9 additions and 8 deletions

vllm/config.py +1 -1

vllm/v1/attention/backends/flash_attn.py +7 -6

vllm/v1/worker/gpu_model_runner.py +1 -1

No files found.
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4106,7 +4106,7 @@ class CompilationConfig:
    are always used, it can set this to False. Otherwise, it should
    set this to True, and the compiler will copy the input to an
    internally managed buffer. Default is False."""
-    full_cuda_graph: bool = False
+    full_cuda_graph: bool = True
    """whether to use a full cuda graph for the entire forward pass rather than
    splitting certain operations such as attention into subgraphs. Thus this
    flag cannot be used together with splitting_ops. This may provide

--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -163,7 +163,7 @@ def _get_sliding_window_configs(
 class FlashAttentionMetadataBuilder(
        AttentionMetadataBuilder[FlashAttentionMetadata]):
-    full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() == 3
+    full_cudagraph_supported: ClassVar[bool] = get_flash_attn_version() == 3 or current_platform.is_rocm() 
    def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec,
                 block_table: BlockTable):
@@ -183,10 +183,11 @@ class FlashAttentionMetadataBuilder(
        self.max_num_splits = 0  # No upper bound on the number of splits.
        self.aot_schedule = (get_flash_attn_version() == 3)
        self.use_full_cuda_graph = compilation_config.full_cuda_graph
-        if not current_platform.is_rocm() and self.use_full_cuda_graph:
+        if self.use_full_cuda_graph:
-            if not self.aot_schedule:
+            if not current_platform.is_rocm():
-                raise ValueError(
+                if not self.aot_schedule:
-                    "AoT scheduling is required for full cuda graph.")
+                    raise ValueError(
+                        "AoT scheduling is required for full cuda graph.")
            capture_sizes = compilation_config.cudagraph_capture_sizes
            if not capture_sizes:
                raise ValueError(
@@ -373,7 +374,7 @@ class FlashAttentionMetadataBuilder(
            scheduler_metadata = self.scheduler_metadata[:n]
        max_num_splits = 0
-        if (not current_platform.is_rocm() and self.use_full_cuda_graph
+        if (self.use_full_cuda_graph
                and num_actual_tokens <= self.max_cudagraph_size):
            # NOTE(woosuk): Setting num_splits > 1 may increase the memory
            # usage, because the intermediate buffers of size [num_splits,

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2385,7 +2385,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                block_table_i,
            )
-            if (not current_platform.is_rocm() and self.full_cuda_graph
+            if (self.full_cuda_graph
                    and not attn_metadata_builder_i.full_cudagraph_supported):
                raise ValueError(
                    f"Full CUDAGraph not supported for "