Commit 686e93cd authored by yuguo's avatar yuguo
Browse files

Merge branch 'develop_v2.5_swap' into 'develop_v2.5'

add swap env

See merge request dcutoolkit/deeplearing/TransformerEngine!40
parents c4bb6049 d19a5a44
@@ -427,7 +427,7 @@ class _LayerNormLinear(torch.autograd.Function):
         )
         nvtx_range_pop(f"{nvtx_label}.fsdp_scatter")
-        if cpu_offloading:
+        if cpu_offloading or int(os.getenv("NVTE_SWAP_OVERLAP_GRAD", "0")):
             ctx.grad_added_to_main_grad = hasattr(weight, "grad_added_to_main_grad")
             if ctx.grad_added_to_main_grad:
@@ -556,7 +556,7 @@ class _LayerNormLinear(torch.autograd.Function):
         # For CPU offloading, we offloaded weight and weight.main_grad to different tensors,
         # we need to connect them into one.
-        if ctx.cpu_offloading:
+        if ctx.cpu_offloading or int(os.getenv("NVTE_SWAP_OVERLAP_GRAD", "0")):
             if ctx.grad_added_to_main_grad:
                 origin_weight = ctx.weight_object
                 if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation:
...
@@ -368,7 +368,7 @@ class _Linear(torch.autograd.Function):
         )
         nvtx_range_pop(f"{nvtx_label}.fsdp_scatter")
-        if cpu_offloading:
+        if cpu_offloading or int(os.getenv("NVTE_SWAP_OVERLAP_GRAD", "0")):
             ctx.grad_added_to_main_grad = hasattr(weight, "grad_added_to_main_grad")
             if ctx.grad_added_to_main_grad:
@@ -459,7 +459,7 @@ class _Linear(torch.autograd.Function):
             else None
         )
-        if ctx.cpu_offloading:
+        if ctx.cpu_offloading or int(os.getenv("NVTE_SWAP_OVERLAP_GRAD", "0")):
             if ctx.grad_added_to_main_grad:
                 weight = ctx.weight_object
                 if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation:
...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment