Unverified Commit 74eb7c33 authored by Xiaowei Ren, committed by GitHub

fix bwd error of context parallelism implementation with FA v2 (#498)



fix bwd error with FA v2
Signed-off-by: Xiaowei Ren <xren@nvidia.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent d20ba9fb
@@ -572,6 +572,7 @@ class FlashAttnUnpaddedFuncWithCP(torch.autograd.Function):
             # [b, np, sq] -> [b, np, 2, sq//2]
             softmax_lse_ = softmax_lse.view(*softmax_lse.shape[:-1], 2, softmax_lse.shape[-1]//2)
+            softmax_lse_ = softmax_lse_[..., 1, :].contiguous()
         # [b*sq, np, hn] -> [b, 2, sq//2, np, hn]
         out = out.view(*q.shape)
         dout = dout.view(*q.shape)
@@ -659,7 +660,7 @@ class FlashAttnUnpaddedFuncWithCP(torch.autograd.Function):
                     out_ = out[:, 1, ...].contiguous().view(-1, *out.shape[-2:])
                     dout_ = dout[:, 1, ...].contiguous().view(-1, *dout.shape[-2:])
                     _flash_attn_backward(
-                        dout_, q_, kv_[0], kv_[1], out_, softmax_lse_[..., 1, :],
+                        dout_, q_, kv_[0], kv_[1], out_, softmax_lse_,
                         dq_, dkv_[0], dkv_[1], cu_seqlens_q//2, cu_seqlens_k,
                         ctx.max_seqlen_q//2, ctx.max_seqlen_k,
                         ctx.dropout_p, ctx.softmax_scale, False,
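The change hoists the second-half slice of the softmax log-sum-exp tensor out of the backward call site and materializes it once with .contiguous(). A plausible reading of the fix: softmax_lse_[..., 1, :] is a strided view over the full LSE storage, and FA v2's _flash_attn_backward kernel misbehaves when handed a non-contiguous LSE buffer. Below is a minimal sketch of the layout issue, not taken from the commit; the tensor sizes are illustrative.

import torch

# softmax_lse has shape [b, np, sq]; context parallelism views the
# sequence dimension as two halves: [b, np, sq] -> [b, np, 2, sq//2].
b, np_, sq = 2, 8, 16  # illustrative sizes, not from the source
softmax_lse = torch.randn(b, np_, sq)
softmax_lse_ = softmax_lse.view(b, np_, 2, sq // 2)

# Indexing the half dimension yields a strided view over the original
# storage, not a contiguous tensor.
second_half = softmax_lse_[..., 1, :]
print(second_half.is_contiguous())  # False

# The fix materializes the slice up front, so the backward kernel
# always receives a contiguous LSE buffer.
second_half = second_half.contiguous()
print(second_half.is_contiguous())  # True

Doing the slice-and-copy once in the setup code, rather than at each _flash_attn_backward call, also avoids recomputing the same strided access on every iteration of the backward loop.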