[Bugfix] Fix flops comp and softmax scale in mla (#900)

* fix flops comp and softmax scale
* format

[Bugfix] Fix flops comp and softmax scale in mla (#900)
* fix flops comp and softmax scale
* format
16561159 · Wenxuan Tan · GitHub · 54fc6ba0 · 16561159 · 16561159
Unverified commit 16561159, authored Sep 29, 2025 by Wenxuan Tan; committed by GitHub on Sep 30, 2025.
Showing 2 changed files with 25 additions and 14 deletions

examples/deepseek_mla/benchmark_mla.py +10 -10

examples/deepseek_mla/example_mla_decode_paged.py +15 -4

No files found.
--- a/examples/deepseek_mla/benchmark_mla.py
+++ b/examples/deepseek_mla/benchmark_mla.py
@@ -87,7 +87,7 @@ def run_flash_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q,
 @torch.inference_mode()
-def run_flash_infer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens,
+def run_flashinfer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens,
                   h_q, h_kv, d, dv, causal, dtype):
    # pip install flashinfer-python
    import flashinfer
@@ -128,7 +128,7 @@ def run_flash_infer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_
        blocked_k.dtype,
    )
-    def flash_infer():
+    def flashinfer():
        output, lse = mla_wrapper.run(
            q_nope.view(-1, h_q, dv),
            q_pe.view(-1, h_q, d - dv),
@@ -137,8 +137,8 @@ def run_flash_infer(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_
            return_lse=True)
        return output.view(b, -1, h_q, dv), lse.view(b, h_q, 1)
-    out_flash, lse_flash = flash_infer()
+    out_flash, lse_flash = flashinfer()
-    t = triton.testing.do_bench(flash_infer)
+    t = triton.testing.do_bench(flashinfer)
    return out_flash, lse_flash, t
@@ -459,7 +459,7 @@ FUNC_TABLE = {
    "torch": run_torch_mla,
    "tilelang": run_flash_mla_tilelang,
    "flash_mla": run_flash_mla,
-    "flash_infer": run_flash_infer,
+    "flashinfer": run_flashinfer,
    "flash_mla_triton": run_flash_mla_triton,
 }
@@ -496,9 +496,9 @@ def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal
                                       s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype)
    torch.testing.assert_close(out_b.float(), out_a.float(), atol=1e-2, rtol=1e-2), "out"
-    if target not in ["flash_infer", "flash_mla_triton", "tilelang"
+    if target not in ["flashinfer", "flash_mla_triton", "tilelang"
-                     ] and baseline not in ["flash_infer", "flash_mla_triton", "tilelang"]:
+                     ] and baseline not in ["flashinfer", "flash_mla_triton", "tilelang"]:
-        # flash_infer has a different lse return value
+        # flashinfer has a different lse return value
        # flash_mla_triton and flash_mla_tilelang doesn't return lse
        torch.testing.assert_close(lse_b.float(), lse_a.float(), atol=1e-2, rtol=1e-2), "lse"
@@ -554,7 +554,7 @@ available_targets = [
    "torch",
    "tilelang",
    "flash_mla",
-    "flash_infer",
+    "flashinfer",
    "flash_mla_triton",
 ]

--- a/examples/deepseek_mla/example_mla_decode_paged.py
+++ b/examples/deepseek_mla/example_mla_decode_paged.py
@@ -11,8 +11,19 @@ import math
    out_idx=[8], pass_configs={
        tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True,
    })
-def mla_decode_tilelang(batch, h_q, h_kv, max_seqlen_pad, dv, dpe, block_N, block_H, num_split,
+def mla_decode_tilelang(batch,
-                        block_size, softmax_scale):
+                        h_q,
+                        h_kv,
+                        max_seqlen_pad,
+                        dv,
+                        dpe,
+                        block_N,
+                        block_H,
+                        num_split,
+                        block_size,
+                        softmax_scale=None):
+    if softmax_scale is None:
+        softmax_scale = (dv + dpe)**-0.5
    scale = float(softmax_scale * 1.44269504)  # log2(e)
    dtype = "float16"
    accum_dtype = "float"
@@ -322,7 +333,7 @@ def run_tilelang_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s
    num_kv_splits = 1
    BLOCK_N = 64
    BLOCK_H = min(64, h_q // h_kv)
-    softmax_scale = (d + dv)**-0.5
+    softmax_scale = d**-0.5
    out_partial = torch.empty(b, h_q, num_kv_splits, dv, dtype=dtype, device=q.device)
    glse = torch.empty(b, h_q, num_kv_splits, dtype=dtype, device=q.device)
@@ -379,7 +390,7 @@ if __name__ == "__main__":
    max_seqlen = cache_seqlens.max().item()
    max_seqlen_pad = math.ceil(max_seqlen / 256) * 256
-    total_flops = s_q * total_seqlens * h_q * (d + dv) * 2
+    total_flops = s_q * total_seqlens * h_q * d * 2
    q = torch.randn(b, s_q, h_q, d, dtype=dtype, device=device)
    block_table = torch.arange(