Commit df7014a8 (unverified), authored by strgrb, committed by GitHub
Browse files

avoid cudaStreamSynchronize in DeepSeekV2AttentionMLA (#4577)


Co-authored-by: Zhang Kaihong <zhangkaihong.zkh@alibaba-inc.com>
parent 49420741
......@@ -658,7 +658,7 @@ class DeepseekV2AttentionMLA(nn.Module):
and forward_batch.forward_mode.is_extend()
and not forward_batch.forward_mode.is_target_verify()
and not forward_batch.forward_mode.is_draft_extend()
- and forward_batch.extend_prefix_lens.sum() == 0
+ and sum(forward_batch.extend_prefix_lens_cpu) == 0
)
else:
# Triton: Use normal computation for prefill and use weight absorption for extend/decode
......@@ -666,7 +666,7 @@ class DeepseekV2AttentionMLA(nn.Module):
forward_batch.forward_mode.is_extend()
and not forward_batch.forward_mode.is_target_verify()
and not forward_batch.forward_mode.is_draft_extend()
- and forward_batch.extend_prefix_lens.sum() == 0
+ and sum(forward_batch.extend_prefix_lens_cpu) == 0
)
def forward(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment