[PyTorch] Fix wgrads for GroupedLinear when weights don't require grad (#1258)

Fix wgrad for GroupedLinear when weights don't require grad.

Signed-off-by: Xin Yao <xiny@nvidia.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

[PyTorch] Fix wgrads for GroupedLinear when weights don't require grad (#1258)
Fix wgrad for GroupedLinear when weights don't require grad.

Signed-off-by: Xin Yao <xiny@nvidia.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
2d7020e2 · Xin Yao · GitHub · 9001081d · 2d7020e2
Unverified Commit 2d7020e2 authored Oct 17, 2024 by Xin Yao Committed by GitHub Oct 17, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 29 additions and 27 deletions

transformer_engine/pytorch/module/grouped_linear.py transformer_engine/pytorch/module/grouped_linear.py +29 -27

No files found.
--- a/transformer_engine/pytorch/module/grouped_linear.py
+++ b/transformer_engine/pytorch/module/grouped_linear.py
@@ -443,36 +443,38 @@ class _GroupedLinear(torch.autograd.Function):
                clear_tensor_data(*inputmats)
                clear_tensor_data(*inputmats_t)

-            if not ctx.use_bias:
-                grad_biases = [None] * ctx.num_gemms
-
-        def handle_custom_ddp_from_mcore(w, wgrad):
-            if w.requires_grad:
-                if ctx.fuse_wgrad_accumulation and hasattr(w, "grad_added_to_main_grad"):
-                    w.grad_added_to_main_grad = True
-                    if getattr(w, "zero_out_wgrad", False):
-                        wgrad = torch.zeros(
-                            w.main_grad.shape,
-                            dtype=w.dtype,
-                            device=torch.cuda.current_device(),
-                            requires_grad=False,
-                        )
+                def handle_custom_ddp_from_mcore(w, wgrad):
+                    if w.requires_grad:
+                        if ctx.fuse_wgrad_accumulation and hasattr(w, "grad_added_to_main_grad"):
+                            w.grad_added_to_main_grad = True
+                            if getattr(w, "zero_out_wgrad", False):
+                                wgrad = torch.zeros(
+                                    w.main_grad.shape,
+                                    dtype=w.dtype,
+                                    device=torch.cuda.current_device(),
+                                    requires_grad=False,
+                                )
+                            else:
+                                wgrad = torch.empty(
+                                    w.main_grad.shape,
+                                    dtype=w.dtype,
+                                    device=torch.cuda.current_device(),
+                                    requires_grad=False,
+                                )
+                        elif ctx.fuse_wgrad_accumulation:
+                            wgrad = None
                    else:
-                        wgrad = torch.empty(
-                            w.main_grad.shape,
-                            dtype=w.dtype,
-                            device=torch.cuda.current_device(),
-                            requires_grad=False,
-                        )
-                elif ctx.fuse_wgrad_accumulation:
-                    wgrad = None
+                        wgrad = None
+                    return wgrad
+
+                wgrad_list = [
+                    handle_custom_ddp_from_mcore(w, wgrad) for w, wgrad in zip(weights, wgrad_list)
+                ]
            else:
-                wgrad = None
-            return wgrad
+                wgrad_list = [None] * ctx.num_gemms

-        wgrad_list = [
-            handle_custom_ddp_from_mcore(w, wgrad) for w, wgrad in zip(weights, wgrad_list)
-        ]
+            if not ctx.use_bias:
+                grad_biases = [None] * ctx.num_gemms

        if ctx.reduce_and_update_bwd_fp8_tensors and not is_graph_capturing():
            FP8GlobalStateManager.reduce_and_update_fp8_tensors(forward=False)