Avoid redundant computation for cu_seqlens (#535)

Avoid redundant computation for cu_seqlens.

Signed-off-by: Hongbin Liu <hongbinl@nvidia.com>
Co-authored-by: Hongbin Liu <hongbinl@nvidia.com>
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>

Avoid redundant computation for cu_seqlens (#535)
Avoid redundant computation for cu_seqlens.

Signed-off-by: Hongbin Liu <hongbinl@nvidia.com>
Co-authored-by: Hongbin Liu <hongbinl@nvidia.com>
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
fad3044b · Hongbin Liu · GitHub · 82555b3f · fad3044b
Unverified Commit fad3044b authored Jan 02, 2024 by Hongbin Liu Committed by GitHub Jan 01, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 14 deletions

transformer_engine/pytorch/attention.py transformer_engine/pytorch/attention.py +18 -14

No files found.
--- a/transformer_engine/pytorch/attention.py
+++ b/transformer_engine/pytorch/attention.py
@@ -1398,20 +1398,24 @@ class FlashAttention(torch.nn.Module):
                    query_layer_packed, key_layer_packed, value_layer_packed)
                cu_seqlens_q, cu_seqlens_kv = _cu_seqlens_q, _cu_seqlens_kv
            else:
-                if cu_seqlens_q is None:
-                    cu_seqlens_q = torch.arange(
-                            0,
-                            (batch_size + 1) * max_seqlen_q,
-                            step=max_seqlen_q,
-                            dtype=torch.int32,
-                            device=query_layer.device)
-                if cu_seqlens_kv is None:
-                    cu_seqlens_kv = torch.arange(
-                            0,
-                            (batch_size + 1) * max_seqlen_kv,
-                            step=max_seqlen_kv,
-                            dtype=torch.int32,
-                            device=key_layer.device)
+                if self.layer_number == 1:
+                    if cu_seqlens_q is None:
+                        cu_seqlens_q = torch.arange(
+                                0,
+                                (batch_size + 1) * max_seqlen_q,
+                                step=max_seqlen_q,
+                                dtype=torch.int32,
+                                device=query_layer.device)
+                    if cu_seqlens_kv is None:
+                        cu_seqlens_kv = torch.arange(
+                                0,
+                                (batch_size + 1) * max_seqlen_kv,
+                                step=max_seqlen_kv,
+                                dtype=torch.int32,
+                                device=key_layer.device)
+                    _cu_seqlens_q, _cu_seqlens_kv = cu_seqlens_q, cu_seqlens_kv
+                else:
+                    cu_seqlens_q, cu_seqlens_kv = _cu_seqlens_q, _cu_seqlens_kv
        elif qkv_format == 'thd':
            assert not context_parallel, "thd format is not supported for context parallelism!"
            assert (_flash_attn_2_available