Unverified Commit 2574a1ca authored by Xiaowei Ren, committed by GitHub

move cp_group setting to DotProductAttention (#468)



* rename set_context_parallel_running to set_context_parallel_group
Signed-off-by: xren <xren@nvidia.com>

* bug fix
Signed-off-by: xren <xren@nvidia.com>

---------
Signed-off-by: xren <xren@nvidia.com>
parent d7511ec4
@@ -1914,6 +1914,17 @@ class DotProductAttention(torch.nn.Module):
         return hidden_states
 
+    def set_context_parallel_group(
+        self,
+        cp_group: Union[dist_group_type, None],
+        cp_global_ranks: List[int],
+        cp_stream: torch.cuda.Stream,
+    ) -> None:
+        """Set CP group"""
+        self.cp_group = cp_group
+        self.cp_global_ranks = cp_global_ranks
+        self.cp_stream = cp_stream
+
     def forward(
         self,
         query_layer: torch.Tensor,
@@ -2549,16 +2560,19 @@ class MultiheadAttention(torch.nn.Module):
         """Set TP group"""
         self.tp_group = tp_group
 
-    def set_context_parallel_running(
+    def set_context_parallel_group(
         self,
         cp_group: Union[dist_group_type, None],
         cp_global_ranks: List[int],
         cp_stream: torch.cuda.Stream,
     ) -> None:
-        """Set CP group and CP dual-stream running"""
-        self.core_attention.cp_group = cp_group
-        self.core_attention.cp_global_ranks = cp_global_ranks
-        self.core_attention.cp_stream = cp_stream
+        """Set CP group"""
+        # Deep iterate but skip self to avoid infinite recursion.
+        for index, child in enumerate(self.modules()):
+            if index == 0:
+                continue
+            if hasattr(child, "set_context_parallel_group"):
+                child.set_context_parallel_group(cp_group, cp_global_ranks, cp_stream)
 
     def forward(
         self,
...
@@ -433,19 +433,19 @@ class TransformerLayer(torch.nn.Module):
             if hasattr(child, "set_tensor_parallel_group"):
                 child.set_tensor_parallel_group(tp_group)
 
-    def set_context_parallel_running(
+    def set_context_parallel_group(
         self,
         cp_group: Union[dist_group_type, None],
         cp_global_ranks: List[int],
         cp_stream: torch.cuda.Stream,
     ) -> None:
-        """Set CP group and CP dual-stream running"""
+        """Set CP group"""
         # Deep iterate but skip self to avoid infinite recursion.
         for index, child in enumerate(self.modules()):
             if index == 0:
                 continue
-            if hasattr(child, "set_context_parallel_running"):
-                child.set_context_parallel_running(cp_group, cp_global_ranks, cp_stream)
+            if hasattr(child, "set_context_parallel_group"):
+                child.set_context_parallel_group(cp_group, cp_global_ranks, cp_stream)
 
     def forward(
         self,
...
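
For reference, a minimal standalone sketch of the propagation pattern this commit adopts: a parent module iterates over its submodules and forwards the CP settings to any child that defines set_context_parallel_group, skipping itself to avoid infinite recursion. The module names (CoreAttention, AttentionBlock) are illustrative only and are not part of Transformer Engine.

from typing import List, Optional

import torch


class CoreAttention(torch.nn.Module):
    """Leaf module that actually stores the context-parallel (CP) settings."""

    def set_context_parallel_group(
        self,
        cp_group: Optional[object],           # a torch.distributed ProcessGroup in practice
        cp_global_ranks: List[int],
        cp_stream: Optional["torch.cuda.Stream"],
    ) -> None:
        self.cp_group = cp_group
        self.cp_global_ranks = cp_global_ranks
        self.cp_stream = cp_stream


class AttentionBlock(torch.nn.Module):
    """Parent that forwards CP settings to every child exposing the hook."""

    def __init__(self) -> None:
        super().__init__()
        self.core_attention = CoreAttention()

    def set_context_parallel_group(self, cp_group, cp_global_ranks, cp_stream) -> None:
        # Deep iterate but skip self (index 0) to avoid infinite recursion.
        for index, child in enumerate(self.modules()):
            if index == 0:
                continue
            if hasattr(child, "set_context_parallel_group"):
                child.set_context_parallel_group(cp_group, cp_global_ranks, cp_stream)


# Hypothetical usage: no CUDA or process group is required for this sketch.
block = AttentionBlock()
block.set_context_parallel_group(cp_group=None, cp_global_ranks=[0, 1], cp_stream=None)
assert block.core_attention.cp_global_ranks == [0, 1]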