Unverified Commit 6ba98d43 authored by jomitchellnv, committed by GitHub

fix: fixes multi head attention for context parallel: rotary embedding to use padded cu_seq_lens (#2077)

fix: fixes MHA to use padded cu_seq_lens during CP
Signed-off-by: Jonathan Mitchell <jomitchell@nvidia.com>
parent c654e4fe
@@ -907,12 +907,19 @@ class MultiheadAttention(torch.nn.Module):
                 q_pos_emb = q_pos_emb[sequence_start:sequence_end, ...]
                 k_pos_emb = k_pos_emb[sequence_start:sequence_end, ...]
 
+            if pad_between_seqs:
+                rotary_pos_cu_seq_lens_q = cu_seqlens_q_padded
+                rotary_pos_cu_seq_lens_kv = cu_seqlens_kv_padded
+            else:
+                rotary_pos_cu_seq_lens_q = cu_seqlens_q
+                rotary_pos_cu_seq_lens_kv = cu_seqlens_kv
+
             query_layer = apply_rotary_pos_emb(
                 query_layer,
                 q_pos_emb,
                 self.qkv_format,
                 fused=True,
-                cu_seqlens=cu_seqlens_q,
+                cu_seqlens=rotary_pos_cu_seq_lens_q,
                 cp_size=self.cp_size,
                 cp_rank=self.cp_rank,
                 interleaved=self.rotary_pos_interleaved,
@@ -922,7 +929,7 @@ class MultiheadAttention(torch.nn.Module):
                 k_pos_emb,
                 self.qkv_format,
                 fused=True,
-                cu_seqlens=cu_seqlens_kv,
+                cu_seqlens=rotary_pos_cu_seq_lens_kv,
                 cp_size=self.cp_size,
                 cp_rank=self.cp_rank,
                 interleaved=self.rotary_pos_interleaved,
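The distinction the fix hinges on: when sequences are packed with padding between them (`pad_between_seqs`), `cu_seqlens` counts only real tokens, while `cu_seqlens_padded` gives the physical offsets of sequence boundaries in the packed buffer. The fused rotary kernel locates each sequence (and each context-parallel rank's slice of it) by these offsets, so it must receive the padded variant. Below is a minimal sketch of the two offset vectors; the tensor names and the pad-to-a-multiple-of-`2 * cp_size` convention are illustrative assumptions, not TransformerEngine internals.

```python
import torch

# Hypothetical example: three packed sequences, each padded to a multiple
# of 2 * cp_size (context-parallel load balancing typically splits every
# sequence into 2 * cp_size chunks).
seq_lens = torch.tensor([5, 3, 6])   # real tokens per sequence
cp_size = 2
chunk = 2 * cp_size
padded_lens = ((seq_lens + chunk - 1) // chunk) * chunk       # [8, 4, 8]

zero = torch.zeros(1, dtype=torch.long)
cu_seqlens = torch.cat([zero, seq_lens.cumsum(0)])            # [0, 5, 8, 14]
cu_seqlens_padded = torch.cat([zero, padded_lens.cumsum(0)])  # [0, 8, 12, 20]

# In the packed buffer [seq0 | pad | seq1 | pad | seq2 | pad], sequence i
# physically starts at cu_seqlens_padded[i]. Handing the rotary kernel the
# unpadded offsets would place every boundary after the first sequence too
# early, shifting the rotary phases of all later sequences by the
# accumulated padding.
print(cu_seqlens.tolist())           # boundaries by real-token count
print(cu_seqlens_padded.tolist())    # boundaries in the physical buffer
```

Mirroring the patch, selecting between the two vectors before the `apply_rotary_pos_emb` calls keeps the non-padded path unchanged: without `pad_between_seqs` the two coincide, so the `else` branch preserves the previous behavior.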