add VLLM_USE_TRITON_OPT_MLA to use optimized MLA attention

40083064 · zhuwenwen · 0a130908 · 40083064 · 40083064
Commit 40083064 authored Feb 28, 2025 by zhuwenwen
Expand all Show whitespace changes
Inline Side-by-side

Showing with 1064 additions and 18 deletions

vllm/attention/ops/triton_decode_attention.py vllm/attention/ops/triton_decode_attention.py +1059 -18

vllm/envs.py vllm/envs.py +5 -0

No files found.
--- a/vllm/attention/ops/triton_decode_attention.py
+++ b/vllm/attention/ops/triton_decode_attention.py
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -15,6 +15,7 @@ if TYPE_CHECKING:
    VLLM_NCCL_SO_PATH: Optional[str] = None
    LD_LIBRARY_PATH: Optional[str] = None
    VLLM_USE_TRITON_FLASH_ATTN: bool = False
+    VLLM_USE_TRITON_OPT_MLA: bool = False
    VLLM_USE_OPT_OP: bool = False
    VLLM_USE_TC_PAGED_ATTN: bool = False
    VLLM_USE_PA_PRINT_PARAM: bool = False 
@@ -565,6 +566,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_MLA_DISABLE":
    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
    
+    # If set to 1, vLLM will use the optimized Triton MLA attention path.
+    "VLLM_USE_TRITON_OPT_MLA":
+    lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
+
    # Flag that can control whether or not we perform matrix-absorption for MLA
    # decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the
    # matrices reduces the runtime FLOPs needed to compute MLA but requires