Unverified commit 4418f599, authored by JieXin Liang, committed by GitHub
Browse files

Fix FA3 DeepSeek prefill performance regression (#5624)


Co-authored-by: ispobock <ispobaoke@gmail.com>
parent 04f2abcb
@@ -583,13 +583,17 @@ class DeepseekV2AttentionMLA(nn.Module):
                 return AttnForwardMethod.MLA
         elif self.attention_backend == "fa3":
             # Flash Attention: Use MHA with chunked KV cache when prefilling on long sequences.
+            if forward_batch.extend_prefix_lens_cpu is not None:
+                sum_extend_prefix_lens = sum(forward_batch.extend_prefix_lens_cpu)
             if (
                 forward_batch.forward_mode.is_extend()
                 and not self.disable_chunked_prefix_cache
                 and not forward_batch.forward_mode.is_target_verify()
                 and not forward_batch.forward_mode.is_draft_extend()
-                and sum(forward_batch.extend_prefix_lens_cpu)
-                >= self.chunked_prefix_cache_threshold
+                and (
+                    sum_extend_prefix_lens >= self.chunked_prefix_cache_threshold
+                    or sum_extend_prefix_lens == 0
+                )
             ):
                 return AttnForwardMethod.MHA_CHUNKED_KV
         else:
...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment