Commit d19a5a44 authored by evt_fugx1's avatar evt_fugx1
Browse files

add swap env

parent c4bb6049
......@@ -427,7 +427,7 @@ class _LayerNormLinear(torch.autograd.Function):
)
nvtx_range_pop(f"{nvtx_label}.fsdp_scatter")
if cpu_offloading:
if cpu_offloading or int(os.getenv("NVTE_SWAP_OVERLAP_GRAD", "0")):
ctx.grad_added_to_main_grad = hasattr(weight, "grad_added_to_main_grad")
if ctx.grad_added_to_main_grad:
......@@ -556,7 +556,7 @@ class _LayerNormLinear(torch.autograd.Function):
# For CPU offloading, we offloaded weight and weight.main_grad to different tensors,
# we need to connect them into one.
if ctx.cpu_offloading:
if ctx.cpu_offloading or int(os.getenv("NVTE_SWAP_OVERLAP_GRAD", "0")):
if ctx.grad_added_to_main_grad:
origin_weight = ctx.weight_object
if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation:
......
......@@ -368,7 +368,7 @@ class _Linear(torch.autograd.Function):
)
nvtx_range_pop(f"{nvtx_label}.fsdp_scatter")
if cpu_offloading:
if cpu_offloading or int(os.getenv("NVTE_SWAP_OVERLAP_GRAD", "0")):
ctx.grad_added_to_main_grad = hasattr(weight, "grad_added_to_main_grad")
if ctx.grad_added_to_main_grad:
......@@ -459,7 +459,7 @@ class _Linear(torch.autograd.Function):
else None
)
if ctx.cpu_offloading:
if ctx.cpu_offloading or int(os.getenv("NVTE_SWAP_OVERLAP_GRAD", "0")):
if ctx.grad_added_to_main_grad:
weight = ctx.weight_object
if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment