Unverified commit e0b24ea0, authored by Lucas Wilkinson, committed by GitHub
Browse files

[Perf] Increase default max splits for FA3 full cudagraphs (#25495)


Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
parent bde2a1a8
...@@ -119,7 +119,7 @@ if TYPE_CHECKING: ...@@ -119,7 +119,7 @@ if TYPE_CHECKING:
VLLM_SERVER_DEV_MODE: bool = False VLLM_SERVER_DEV_MODE: bool = False
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
VLLM_MLA_DISABLE: bool = False VLLM_MLA_DISABLE: bool = False
VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 16 VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH: int = 32
VLLM_RAY_PER_WORKER_GPUS: float = 1.0 VLLM_RAY_PER_WORKER_GPUS: float = 1.0
VLLM_RAY_BUNDLE_INDICES: str = "" VLLM_RAY_BUNDLE_INDICES: str = ""
VLLM_CUDART_SO_PATH: Optional[str] = None VLLM_CUDART_SO_PATH: Optional[str] = None
...@@ -1017,7 +1017,7 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -1017,7 +1017,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
# max number splits for cuda graph decode # max number splits for cuda graph decode
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": "VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH":
lambda: int(os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH", lambda: int(os.getenv("VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH",
"16")), "32")),
# Number of GPUs per worker in Ray, if it is set to be a fraction, # Number of GPUs per worker in Ray, if it is set to be a fraction,
# it allows ray to schedule multiple actors on a single GPU, # it allows ray to schedule multiple actors on a single GPU,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment