Unverified commit 387397a2 authored by Deepak Narayanan, committed by GitHub

`wgrad` should be zeroed out if a weight parameter is shared among multiple layers (#545)



Signed-off-by: Deepak Narayanan <dnarayanan@nvidia.com>
parent 753eed31
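
Before the diff itself, a minimal self-contained sketch of why the placeholder matters. With fused wgrad accumulation, backward writes the real weight gradient into `weight.main_grad` and hands autograd only a dummy `wgrad` tensor. If the same parameter feeds several layers, autograd sums those dummies into `weight.grad`, so an uninitialized `torch.empty` buffer injects garbage; the new `zero_out_wgrad` branch returns zeros instead. The toy `FusedWgradLinear` below is illustrative only, not Transformer Engine's actual `_Linear`, though the attribute names `main_grad`, `grad_added_to_main_grad`, and `zero_out_wgrad` match the diff:

import torch

class FusedWgradLinear(torch.autograd.Function):
    """Toy stand-in for the pattern this commit touches: the real weight
    gradient is accumulated into weight.main_grad, and backward only
    returns a placeholder tensor to satisfy autograd."""

    @staticmethod
    def forward(ctx, inp, weight):
        ctx.save_for_backward(inp, weight)
        return inp @ weight.t()

    @staticmethod
    def backward(ctx, grad_output):
        inp, weight = ctx.saved_tensors
        # Fused wgrad accumulation: the true gradient goes straight
        # into the main_grad buffer owned by the optimizer/DDP.
        weight.main_grad += grad_output.t() @ inp
        weight.grad_added_to_main_grad = True
        if getattr(weight, 'zero_out_wgrad', False):
            # Shared weight: autograd sums the placeholders returned by
            # every use of the parameter into weight.grad, so they must
            # be zeros rather than uninitialized memory.
            wgrad = torch.zeros_like(weight, requires_grad=False)
        else:
            # Unshared weight: the placeholder is never read, so the
            # cheaper uninitialized buffer is fine.
            wgrad = torch.empty_like(weight, requires_grad=False)
        return grad_output @ weight, wgrad

weight = torch.nn.Parameter(torch.randn(4, 4))
weight.main_grad = torch.zeros_like(weight)
weight.zero_out_wgrad = True  # the same parameter feeds both layers below

x = torch.randn(2, 4)
out = FusedWgradLinear.apply(FusedWgradLinear.apply(x, weight), weight)
out.sum().backward()

# The two returned placeholders were summed into weight.grad; with
# zero_out_wgrad they cancel to exact zeros, while the real gradient
# lives in weight.main_grad.
assert torch.all(weight.grad == 0)

Flipping `zero_out_wgrad` to `False` in this sketch can make the assert fail, since `torch.empty` may hand back nonzero memory, which is exactly the failure mode this commit closes.
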
@@ -532,6 +532,13 @@ class _LayerNormLinear(torch.autograd.Function):
# Handle custom DDP from mcore.
if ctx.fuse_wgrad_accumulation and hasattr(weight, 'grad_added_to_main_grad'):
weight.grad_added_to_main_grad = True
if getattr(weight, 'zero_out_wgrad', False):
wgrad = torch.zeros(weight.main_grad.shape,
dtype=weight.dtype,
device=torch.cuda.current_device(),
requires_grad=False
)
else:
wgrad = torch.empty(weight.main_grad.shape,
dtype=weight.dtype,
device=torch.cuda.current_device(),
......
@@ -910,6 +910,13 @@ class _LayerNormMLP(torch.autograd.Function):
# Handle custom DDP from mcore.
if ctx.fuse_wgrad_accumulation and hasattr(fc1_weight, 'grad_added_to_main_grad'):
fc1_weight.grad_added_to_main_grad = True
if getattr(fc1_weight, 'zero_out_wgrad', False):
fc1_wgrad = torch.zeros(fc1_weight.main_grad.shape,
dtype=fc1_weight.dtype,
device=torch.cuda.current_device(),
requires_grad=False
)
else:
fc1_wgrad = torch.empty(fc1_weight.main_grad.shape,
dtype=fc1_weight.dtype,
device=torch.cuda.current_device(),
@@ -924,6 +931,13 @@ class _LayerNormMLP(torch.autograd.Function):
# Handle custom DDP from mcore.
if ctx.fuse_wgrad_accumulation and hasattr(fc2_weight, 'grad_added_to_main_grad'):
fc2_weight.grad_added_to_main_grad = True
if getattr(fc2_weight, 'zero_out_wgrad', False):
fc2_wgrad = torch.zeros(fc2_weight.main_grad.shape,
dtype=fc2_weight.dtype,
device=torch.cuda.current_device(),
requires_grad=False
)
else:
fc2_wgrad = torch.empty(fc2_weight.main_grad.shape,
dtype=fc2_weight.dtype,
device=torch.cuda.current_device(),
......
@@ -473,6 +473,13 @@ class _Linear(torch.autograd.Function):
# Handle custom DDP from mcore.
if ctx.fuse_wgrad_accumulation and hasattr(weight, 'grad_added_to_main_grad'):
weight.grad_added_to_main_grad = True
if getattr(weight, 'zero_out_wgrad', False):
wgrad = torch.zeros(weight.main_grad.shape,
dtype=weight.dtype,
device=torch.cuda.current_device(),
requires_grad=False
)
else:
wgrad = torch.empty(weight.main_grad.shape,
dtype=weight.dtype,
device=torch.cuda.current_device(),
......
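
On the producer side, `zero_out_wgrad` has to be set by whichever framework ties the parameter across layers. A hedged sketch of that setup, assuming mcore-style attribute conventions as seen in this diff; the helper name and its body are illustrative, not an actual Megatron-Core API:

import torch

def mark_shared_weight(weight: torch.nn.Parameter) -> None:
    """Illustrative helper: prepare a tied parameter for fused wgrad
    accumulation with a custom DDP that reads these attributes."""
    # fp32 buffer that the fused backward accumulates the real gradient into.
    weight.main_grad = torch.zeros(weight.shape, dtype=torch.float32,
                                   device=weight.device)
    # Reset each step; backward flips it to True once it has written
    # the gradient into main_grad.
    weight.grad_added_to_main_grad = False
    # The parameter is shared among multiple layers, so backward must
    # return a zeroed (not uninitialized) dummy wgrad.
    weight.zero_out_wgrad = True
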