add VLLM_USE_TRITON_OPT_MLA to use optimized MLA attention

update mla optest

add VLLM_USE_TRITON_OPT_MLA to use optimized MLA attention
update mla optest
1ee40c4d · zhuwenwen · e3a0d6bb · 1ee40c4d · 1ee40c4d · 1ee40c4d
Commit 1ee40c4d authored Mar 11, 2025 by zhuwenwen
5 changed files
--- a/setup.py
+++ b/setup.py
@@ -508,10 +508,10 @@ def get_version_add(sha: Optional[str] = None) -> str:
            if sha is None:
                sha = get_sha(vllm_root)
            if (major, minor) == ('2', '4'):
-                version = 'das.opt1.cust1.' + sha[:7]
+                version = 'das.opt1.' + sha[:7]
    else:
        if (major, minor) == ('2', '4'):
-            version = 'das.opt1.cust1'
+            version = 'das.opt1'


    # dtk version
@@ -696,7 +696,7 @@ package_data = {
        "model_executor/layers/fused_moe/configs/*.json",
        "model_executor/layers/quantization/utils/configs/*.json",
        "benchmarks/*.py",
-        "model_executor/layers/quantization/configs/w8a8/*.json",
+        "attention/backends/configs/*.json",
        "model_executor/layers/quantization/configs/awq/*.json"
    ]
 }

--- a/tests/kernels/test_triton_decode_attention.py
+++ b/tests/kernels/test_triton_decode_attention.py
@@ -2,9 +2,9 @@

 import pytest
 import torch
+import triton

-from vllm.attention.ops.triton_decode_attention import decode_attention_fwd
-
+from vllm.attention.ops.triton_decode_attention import decode_attention_fwd, decode_attention_v1, decode_attention_v2

 def cdiv(a, b):
    return (a + b - 1) // b
@@ -25,13 +25,13 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
    sm_scale = 1.0 / (D_QK**0.5)
    num_kv_splits = 8

-    num_pages_per_batch = cdiv(seq_len, PAGE_SIZE)
+    num_pages_per_batch = cdiv(seq_len, PAGE_SIZE) # round up, e.g. 65 == (1027 + 16 - 1) // 16
    req_to_page = torch.randint(0,
                                CACHE_SIZE // PAGE_SIZE,
-                                (B, num_pages_per_batch, 1),
+                                (B, num_pages_per_batch, 1), # tensor of shape (B, num_pages_per_batch, 1) with values in [0, CACHE_SIZE // PAGE_SIZE)
                                device="cuda")
    req_to_token = req_to_page * PAGE_SIZE
-    req_to_token = req_to_token.expand(B, num_pages_per_batch, PAGE_SIZE)
+    req_to_token = req_to_token.expand(B, num_pages_per_batch, PAGE_SIZE) # expand the last dim, e.g. torch.Size([3, 65, 1]) -> torch.Size([3, 65, 16])
    req_to_token = req_to_token + torch.arange(PAGE_SIZE, device="cuda").view(
        1, 1, -1)
    req_to_token = req_to_token.view(B, -1)
@@ -50,12 +50,20 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
    
    b_seq_len = torch.full((B, ), seq_len, device="cuda")

+    b_start_loc = torch.arange(0, k_buffer.shape[0] * PAGE_SIZE, k_buffer.shape[0] * PAGE_SIZE // q.shape[0], device="cuda").to(torch.int32)
+    attn_logits_v1 = torch.empty(
+               (q.shape[1], k_buffer.shape[0]*PAGE_SIZE),
+               dtype=torch.float16,
+               device="cuda")
+
    attn_logits = torch.empty(
        (B, H_Q, num_kv_splits, D_V + 1),
        dtype=torch.float32,
        device="cuda",
    )
    
+    quantiles = [0.5, 0.2, 0.8]
+
    # Call the original implementation.
    decode_attention_fwd(
        q,
@@ -87,5 +95,81 @@ def test_decode_attention(B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE):
        sm_scale,
        PAGE_SIZE,
    )
-
    assert torch.allclose(o, o1)
+    
+    # v0_tc_ms, v0_tc_min_ms, v0_tc_max_ms = triton.testing.do_bench(lambda:
+    # decode_attention_fwd(
+    #     q,
+    #     k_buffer,
+    #     v_buffer,
+    #     o1,
+    #     req_to_page,
+    #     b_seq_len,
+    #     attn_logits,
+    #     num_kv_splits,
+    #     sm_scale,
+    #     PAGE_SIZE,
+
+    # ), quantiles=quantiles)
+    # print("print mla decode attention ori kernel [B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE] min cost :",[B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE], v0_tc_ms)
+    
+    decode_attention_v1(
+        q,
+        k_buffer,
+        v_buffer,
+        o1,
+        req_to_page,
+        b_start_loc,
+        b_seq_len,
+        attn_logits_v1,
+        num_kv_splits,
+        sm_scale,
+        PAGE_SIZE, 
+    )
+    assert torch.allclose(o, o1, atol=1e-2, rtol=1e-2)
+    
+    # v1_tc_ms, v1_tc_min_ms, v1_tc_max_ms = triton.testing.do_bench(lambda:
+    # decode_attention_v1(
+    #     q,
+    #     k_buffer,
+    #     v_buffer,
+    #     o1,
+    #     req_to_page,
+    #     b_start_loc,
+    #     b_seq_len,
+    #     attn_logits_v1,
+    #     num_kv_splits,
+    #     sm_scale,
+    #     PAGE_SIZE, 
+    # ), quantiles=quantiles)
+    # print("print mla decode attention v1 kernel [B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE] min cost :",[B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE], v1_tc_ms)
+    
+        
+    decode_attention_v2(
+        q,
+        k_buffer,
+        v_buffer,
+        o1,
+        req_to_page,
+        b_seq_len,
+        attn_logits,
+        num_kv_splits,
+        sm_scale,
+        PAGE_SIZE, 
+    )
+    assert torch.allclose(o, o1, atol=1e-2, rtol=1e-2)
+
+    # v2_tc_ms, v2_tc_min_ms, v2_tc_max_ms = triton.testing.do_bench(lambda:
+    # decode_attention_v2(
+    #     q,
+    #     k_buffer,
+    #     v_buffer,
+    #     o1,
+    #     req_to_page,
+    #     b_seq_len,
+    #     attn_logits,
+    #     num_kv_splits,
+    #     sm_scale,
+    #     PAGE_SIZE, 
+    # ), quantiles=quantiles)
+    # print("print mla decode attention v2 kernel [B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE] min cost :",[B, L, H_Q, H_KV, D_QK, D_V, CACHE_SIZE, PAGE_SIZE], v2_tc_ms)
\ No newline at end of file
--- a/vllm/attention/backends/triton_mla.py
+++ b/vllm/attention/backends/triton_mla.py
 # SPDX-License-Identifier: Apache-2.0

+import os
+import functools
+import json
 from collections import defaultdict
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -33,6 +36,57 @@ if TYPE_CHECKING:
    from vllm.worker.model_runner import (ModelInputForGPUBuilder,
                                          ModelInputForGPUWithSamplingMetadata)

+from vllm.logger import init_logger
+logger = init_logger(__name__)
+
+
+def get_mla_config_file_name(QH: int, KVH: int, QKD: int, VD: int, cache_dtype: Optional[str]) -> str:
+    """Build the file name of the decode-attention config JSON for one shape.
+
+    Args:
+        QH: Value embedded in the name as ``QH=`` (presumably the number of
+            query heads -- confirm against the caller).
+        KVH: Value embedded as ``KVH=`` (presumably the number of KV heads).
+        QKD: Value embedded as ``QKD=`` (presumably the QK head dimension).
+        VD: Value embedded as ``VD=`` (presumably the V head dimension).
+        cache_dtype: Tag embedded in the name. The literal ``"default"``
+            selects the device-independent default file name.
+
+    Returns:
+        ``QH=.._KVH=.._QKD=.._VD=.._default.json`` for ``"default"``,
+        otherwise the same prefix followed by ``_{cache_dtype}_K100AI.json``
+        or ``_{cache_dtype}_BW.json`` depending on the CUDA device name.
+
+    Raises:
+        ValueError: If the CUDA device name contains neither ``K100_AI``
+            nor ``BW`` (after spaces are replaced by underscores).
+    """
+    prefix = f"QH={QH}_KVH={KVH}_QKD={QKD}_VD={VD}"
+    if cache_dtype == "default":
+        # The default file name does not depend on the device, so the GPU
+        # is not queried on this path.
+        return f"{prefix}_default.json"
+
+    device_name = torch.cuda.get_device_name().replace(" ", "_")
+    if "K100_AI" in device_name:
+        return f"{prefix}_{cache_dtype}_K100AI.json"
+    if "BW" in device_name:
+        return f"{prefix}_{cache_dtype}_BW.json"
+    raise ValueError(f"Unsupported device name: {device_name}")
+
+
+@functools.lru_cache
+def get_attention_mla_configs(QH: int, KVH: int, QKD: int, VD: int, cache_dtype: Optional[str]) -> Optional[Dict[Any, Any]]:
+    """Load the decode-attention configuration JSON for one attention shape.
+
+    The file named by ``get_mla_config_file_name`` is looked up in the
+    ``configs`` directory next to this module. When it does not exist, the
+    default file for ``QH=16, KVH=1, QKD=576, VD=512`` is loaded instead and
+    a warning is logged. Successful results are memoized per argument tuple
+    by ``functools.lru_cache``.
+
+    Returns:
+        The parsed content of the selected JSON file.
+
+    Raises:
+        ValueError: If neither the shape-specific nor the default config
+            file exists. ``get_mla_config_file_name`` raises it as well for
+            an unsupported device, before any fallback is attempted.
+    """
+    config_dir = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)), "configs")
+
+    # First look up if an optimized configuration is available in the
+    # configs directory.
+    config_file_path = os.path.join(
+        config_dir, get_mla_config_file_name(QH, KVH, QKD, VD, cache_dtype))
+    if os.path.exists(config_file_path):
+        with open(config_file_path) as f:
+            logger.info("Using decode attention configuration from %s for attention layer.", config_file_path)
+            return json.load(f)
+
+    logger.warning("Can not find best decode attention configuration %s for attention layer, it may not have the best performance to use default json. Please tune one. ", config_file_path)
+
+    # Fall back to the default configuration shipped for 16/1/576/512.
+    default_file_path = os.path.join(
+        config_dir, get_mla_config_file_name(16, 1, 576, 512, "default"))
+    if not os.path.exists(default_file_path):
+        raise ValueError(
+            "Default decode attention configuration not found: "
+            f"{default_file_path}")
+
+    with open(default_file_path) as f:
+        logger.warning("Using default decode attention configuration from %s for attention layer. It may not have the best performance to use default json. ", default_file_path)
+        return json.load(f)
+

 class TritonMLABackend(AttentionBackend):

@@ -736,11 +790,14 @@ class TritonMLAImpl(MLACommonImpl[TritonMLAMetadata]):
        kv_c_cache = kv_c_and_k_pe_cache[..., :self.kv_lora_rank]
        PAGE_SIZE = kv_c_and_k_pe_cache.size(1)
        
+        # TODO
+        # config = get_attention_mla_configs(self.num_heads, 1, self.kv_lora_rank + self.qk_rope_head_dim, self.kv_lora_rank, "fp16")
+
        # Run MQA
        decode_attention_fwd(q, kv_c_and_k_pe_cache, kv_c_cache, o,
                             decode_meta.block_tables,
                             decode_meta.seq_lens_tensor, attn_logits,
-                             attn_metadata.num_kv_splits, self.scale,
+                             attn_metadata.num_kv_splits, self.scale,  # config, 
                             PAGE_SIZE)

        return self._v_up_proj_and_o_proj(o)
--- a/vllm/attention/ops/triton_decode_attention.py
+++ b/vllm/attention/ops/triton_decode_attention.py
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -15,6 +15,7 @@ if TYPE_CHECKING:
    VLLM_NCCL_SO_PATH: Optional[str] = None
    LD_LIBRARY_PATH: Optional[str] = None
    VLLM_USE_TRITON_FLASH_ATTN: bool = False
+    VLLM_USE_TRITON_OPT_MLA: bool = False
    VLLM_USE_OPT_OP: bool = False
    VLLM_USE_TC_PAGED_ATTN: bool = False
    VLLM_USE_PA_PRINT_PARAM: bool = False 
@@ -574,6 +575,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_MLA_DISABLE":
    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
    
+    # If set, vLLM will use the optimized Triton MLA attention.
+    "VLLM_USE_TRITON_OPT_MLA":
+    lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
+
    # Flag that can control whether or not we perform matrix-absorption for MLA
    # decode, i.e. absorb W_UK into W_Q/W_UK and W_UV into W_O, absorbing the
    # matrices reduces the runtime FLOPs needed to compute MLA but requires