[Perf] Increase default max splits for FA3 full cudagraphs (#25495)

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>

[Perf] Increase default max splits for FA3 full cudagraphs (#25495)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
e0b24ea0 · Lucas Wilkinson · GitHub · bde2a1a8 · e0b24ea0
Unverified commit e0b24ea0, authored Sep 23, 2025 by Lucas Wilkinson; committed by GitHub on Sep 23, 2025.
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 2 deletions

vllm/envs.py +2 -2

No files found.
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -119,7 +119,7 @@ if TYPE_CHECKING:
    VLLM_SERVER_DEV_MODE: bool = False
    VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
    VLLM_MLA_DISABLE: bool = False
-    VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 16
+    VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32
    VLLM_RAY_PER_WORKER_GPUS: float = 1.0
    VLLM_RAY_BUNDLE_INDICES: str = ""
    VLLM_CUDART_SO_PATH: Optional[str] = None
@@ -1017,7 +1017,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # max number splits for cuda graph decode
    "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH":
    lambda: int(os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
-                          "16")),
+                          "32")),

    # Number of GPUs per worker in Ray, if it is set to be a fraction,
    # it allows ray to schedule multiple actors on a single GPU,