Add trtllm_mla and cutlass_mla for ragged fmha for chunked prefill (#9480)

243e745d · Elfie Guo · GitHub · 61a0e600 · 243e745d
Unverified commit 243e745d, authored Aug 21, 2025 by Elfie Guo; committed by GitHub on Aug 21, 2025.
Show whitespace changes
Inline Side-by-side

Showing with 2 additions and 0 deletions

python/sglang/srt/models/deepseek_v2.py (+2 -0)

No files found.
--- a/python/sglang/srt/models/deepseek_v2.py
+++ b/python/sglang/srt/models/deepseek_v2.py
@@ -999,6 +999,8 @@ class DeepseekV2AttentionMLA(nn.Module):
            attention_backend == "flashinfer"
            or attention_backend == "fa3"
            or attention_backend == "flashmla"
+            or attention_backend == "trtllm_mla"
+            or attention_backend == "cutlass_mla"
        ):
            # Use MHA with chunked KV cache when prefilling on long sequences.
            sum_extend_prefix_lens = (