Commit d9eb9358 authored by Wen-Heng (Jack) Chung, committed by GitHub

Tune paged attention parameters for AMD GPU. (#3255)

parent 959dca4f
@@ -181,6 +181,9 @@ def _decode_att_m_fwd(
     logit_cap,
 ):
     BLOCK = 64
+    # [TODO] work around SGPR limit on MI3xx
+    if is_hip_:
+        BLOCK = 8
     NUM_KV_SPLITS = num_kv_splits
     Lk = k_buffer.shape[-1]
     Lv = v_buffer.shape[-1]
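For context: `is_hip_` is the module-level ROCm flag these kernels share, and the SGPR note refers to the scalar register file CDNA GPUs allocate per wavefront; exhausting it forces spills that stall the kernel. A minimal sketch of the backend-dependent tile choice, assuming detection via `torch.version.hip` (the flag's actual definition in the tree may differ, and `pick_kv_block_size` is a hypothetical helper, not part of the diff):

```python
import torch

is_hip_ = torch.version.hip is not None  # assumption: ROCm build detection

def pick_kv_block_size() -> int:
    """KV-sequence tile processed per Triton program in the decode kernel."""
    block = 64  # default: 64 KV positions per program
    if is_hip_:
        # MI3xx workaround from the diff: large tiles pressure the per-wave
        # scalar register (SGPR) budget, so use a much smaller tile.
        block = 8
    return block
```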
@@ -194,6 +197,8 @@ def _decode_att_m_fwd(
     if kv_group_num == 1:
         num_warps = 4
     else:
         num_warps = 2
+        if is_hip_:
+            num_warps = 1
+
     BLOCK_DMODEL = triton.next_power_of_2(Lk)
     BLOCK_DV = triton.next_power_of_2(Lv)
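`num_warps` is a Triton launch meta-parameter; on the AMD backend each "warp" maps to a 64-lane wavefront, so `num_warps = 1` runs each program on a single wavefront. A self-contained toy launch showing where the parameter goes (the kernel is illustrative, not the attention kernel from this diff):

```python
import torch
import triton
import triton.language as tl

@triton.jit
def _scale_kernel(x_ptr, out_ptr, n, BLOCK: tl.constexpr):
    pid = tl.program_id(0)
    offs = pid * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    tl.store(out_ptr + offs, x * 2.0, mask=mask)

x = torch.randn(4096, device="cuda")
out = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 256),)
# num_warps is passed at launch time, next to the kernel's own arguments:
# one wavefront per program on HIP (mirroring the tuning above), two elsewhere.
is_hip_ = torch.version.hip is not None
_scale_kernel[grid](x, out, x.numel(), BLOCK=256, num_warps=1 if is_hip_ else 2)
```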
@@ -433,10 +438,12 @@ def _decode_grouped_att_m_fwd(
     )

     extra_kargs = {}
+    num_stages = 2
     if is_hip_:
         # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
         # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
-        extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
+        extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
+        num_stages = 1

     _fwd_grouped_kernel_stage1[grid](
         q,
@@ -467,7 +474,7 @@ def _decode_grouped_att_m_fwd(
         NUM_KV_SPLITS=NUM_KV_SPLITS,
         logit_cap=logit_cap,
         num_warps=4,
-        num_stages=2,
+        num_stages=num_stages,
         Lk=Lk,
         Lv=Lv,
         **extra_kargs,
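The `extra_kargs` dict is ordinary keyword arguments forwarded at launch to Triton's AMD backend: `waves_per_eu` is an occupancy hint (wavefronts per execution unit), `matrix_instr_nonkdim` selects the MFMA matrix-instruction shape, and `kpack` the K-dimension operand packing. A sketch of the pattern the diff uses, with the knobs annotated (the ROCm detection and comments are my glosses, not part of the diff):

```python
import torch

is_hip_ = torch.version.hip is not None  # assumption: ROCm build detection

extra_kargs = {}
num_stages = 2
if is_hip_:
    # AMD Triton backend knobs (see the ROCm tuning guide linked above):
    #   waves_per_eu         - occupancy hint: wavefronts per execution unit
    #   matrix_instr_nonkdim - MFMA instruction shape for the non-K dimension
    #   kpack                - K-dimension packing factor for MFMA operands
    extra_kargs = {"waves_per_eu": 1, "matrix_instr_nonkdim": 16, "kpack": 2}
    num_stages = 1  # shallower software pipeline for this kernel on CDNA

# kernel[grid](..., num_warps=4, num_stages=num_stages, **extra_kargs)
```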
@@ -273,6 +273,10 @@ class ServerArgs:
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"

+        # AMD-specific Triton attention KV splits default number
+        if is_hip():
+            self.triton_attention_num_kv_splits = 16
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and port args
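For readers new to the knob: `triton_attention_num_kv_splits` sets how many chunks the KV cache is split into during decode, with per-chunk partial attention merged through a log-sum-exp reduction (the flash-decoding pattern); raising the AMD default to 16 exposes more parallelism per decode token. A hedged PyTorch sketch of the math this controls, not sglang's actual two-stage kernels:

```python
# Split-KV decode attention for one query token; hypothetical helper.
# Shapes: q [H, D], k/v [S, H, D].
import torch

def split_kv_decode_attention(q, k, v, num_kv_splits=16):
    outs, lses = [], []
    for idx in torch.chunk(torch.arange(k.shape[0]), num_kv_splits):
        # Stage 1: partial attention over one KV chunk.
        scores = torch.einsum("hd,shd->hs", q, k[idx]) / q.shape[-1] ** 0.5
        lses.append(torch.logsumexp(scores, dim=-1))            # [H]
        p = torch.softmax(scores, dim=-1)
        outs.append(torch.einsum("hs,shd->hd", p, v[idx]))      # [H, D]
    # Stage 2: merge partials; softmax over per-chunk LSEs gives each
    # chunk's share of the global softmax denominator.
    w = torch.softmax(torch.stack(lses), dim=0).unsqueeze(-1)   # [splits, H, 1]
    return (torch.stack(outs) * w).sum(0)                       # [H, D]

# Sanity check against single-pass attention:
q, k = torch.randn(8, 64), torch.randn(1024, 8, 64)
v = torch.randn_like(k)
ref_p = torch.softmax(torch.einsum("hd,shd->hs", q, k) / 64 ** 0.5, dim=-1)
assert torch.allclose(split_kv_decode_attention(q, k, v),
                      torch.einsum("hs,shd->hd", ref_p, v), atol=1e-4)
```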