Unverified Commit 59bfc17b authored by Kite0011's avatar Kite0011 Committed by GitHub
Browse files

[Pytorch] Update context parallel softmax lse correction func (#716)



[Pytorch] Update context parallel softmax lse correction func.
Signed-off-by: kitefang <kitefang@tencent.com>
Co-authored-by: kitefang <kitefang@tencent.com>
parent c38779be
...@@ -483,9 +483,10 @@ def flash_attn_fwd_out_correction(out, out_per_step, softmax_lse, softmax_lse_pe ...@@ -483,9 +483,10 @@ def flash_attn_fwd_out_correction(out, out_per_step, softmax_lse, softmax_lse_pe
@jit_fuser
def flash_attn_fwd_softmax_lse_correction(softmax_lse, softmax_lse_per_step):
    """Merge softmax stats of each step in Attention with context parallelism.

    Updates ``softmax_lse`` in place with the combined log-sum-exp of the two
    stats, i.e. ``log(exp(softmax_lse) + exp(softmax_lse_per_step))``. The
    max/min rearrangement keeps the exponent non-positive, so ``exp`` cannot
    overflow regardless of the magnitude of the inputs.

    Args:
        softmax_lse: running softmax log-sum-exp stats; overwritten in place
            with the merged result.
        softmax_lse_per_step: softmax log-sum-exp stats of the current step.
    """
    max_scale = torch.max(softmax_lse, softmax_lse_per_step)
    min_scale = torch.min(softmax_lse, softmax_lse_per_step)
    # log1p(exp(x)) is more accurate than log(1 + exp(x)) when exp(x) is tiny
    # (avoids the catastrophic cancellation in 1 + eps).
    new_scale = max_scale + torch.log1p(torch.exp(min_scale - max_scale))
    softmax_lse.copy_(new_scale)
class AttnFuncWithCP(torch.autograd.Function): class AttnFuncWithCP(torch.autograd.Function):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment