Unverified commit 243e745d, authored by Elfie Guo, committed by GitHub

Add trtllm_mla and cutlass_mla for ragged fmha for chunked prefill (#9480)

parent 61a0e600
@@ -999,6 +999,8 @@ class DeepseekV2AttentionMLA(nn.Module):
             attention_backend == "flashinfer"
             or attention_backend == "fa3"
             or attention_backend == "flashmla"
+            or attention_backend == "trtllm_mla"
+            or attention_backend == "cutlass_mla"
         ):
             # Use MHA with chunked KV cache when prefilling on long sequences.
             sum_extend_prefix_lens = (
...
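
Below is a minimal sketch of the dispatch logic this hunk extends, for readers without the full file. The names RAGGED_FMHA_BACKENDS and use_mha_chunked_prefill are hypothetical helpers introduced here for illustration; only the backend strings and the MHA-with-chunked-KV-cache intent come from the diff above, where the condition also depends on the extend-prefix lengths visible at the truncation point.

# Hypothetical sketch; not the repository's exact code.
RAGGED_FMHA_BACKENDS = {
    "flashinfer",
    "fa3",
    "flashmla",
    "trtllm_mla",   # added by this commit
    "cutlass_mla",  # added by this commit
}

def use_mha_chunked_prefill(attention_backend: str, sum_extend_prefix_lens: int) -> bool:
    # Prefill over long cached prefixes runs MHA against the chunked KV cache,
    # but only for backends whose ragged FMHA kernels support that path.
    return attention_backend in RAGGED_FMHA_BACKENDS and sum_extend_prefix_lens > 0

The change is additive: trtllm_mla and cutlass_mla simply join the set of backends allowed to take the existing chunked-prefill MHA path, leaving behavior for the other backends unchanged.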