Commit 40083064 authored by zhuwenwen's avatar zhuwenwen
Browse files

add VLLM_USE_TRITON_OPT_MLA to use optimized MLA attention

parent 0a130908
...@@ -15,6 +15,7 @@ if TYPE_CHECKING: ...@@ -15,6 +15,7 @@ if TYPE_CHECKING:
VLLM_NCCL_SO_PATH: Optional[str] = None VLLM_NCCL_SO_PATH: Optional[str] = None
LD_LIBRARY_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None
VLLM_USE_TRITON_FLASH_ATTN: bool = False VLLM_USE_TRITON_FLASH_ATTN: bool = False
VLLM_USE_TRITON_OPT_MLA: bool = False
VLLM_USE_OPT_OP: bool = False VLLM_USE_OPT_OP: bool = False
VLLM_USE_TC_PAGED_ATTN: bool = False VLLM_USE_TC_PAGED_ATTN: bool = False
VLLM_USE_PA_PRINT_PARAM: bool = False VLLM_USE_PA_PRINT_PARAM: bool = False
...@@ -564,6 +565,10 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -564,6 +565,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# If set, vLLM will disable the MLA attention optimizations. # If set, vLLM will disable the MLA attention optimizations.
"VLLM_MLA_DISABLE": "VLLM_MLA_DISABLE":
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
# If set to a non-zero integer, vLLM will use the optimized Triton MLA attention path (disabled by default).
"VLLM_USE_TRITON_OPT_MLA":
lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
# Flag that can control whether or not we perform matrix-absorption for MLA # Flag that can control whether or not we perform matrix-absorption for MLA
# decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the # decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment