Commit 13123839 authored by dongchl, committed by wenjh

Develop v2.10



Roll back the activation offloading implementation.

See merge request dcutoolkit/deeplearing/TransformerEngine!70
Co-authored-by: dongcl <791582849@qq.com>
parent e6f2caf5
@@ -95,7 +95,6 @@ class _BatchLinear(torch.autograd.Function):
activation_dtype: torch.dtype,
parallel_mode: Union[str, None],
is_grad_enabled: bool,
fine_grained_activation_offloading,
*weights_and_biases: Union[Float8Tensor, torch.Tensor, None],
) -> torch.Tensor:
batch_num = int(os.getenv("NVTE_MOE_BATCHCOUNT", "2"))
@@ -160,33 +159,6 @@ class _BatchLinear(torch.autograd.Function):
if t is not None:
t.activation_offloading = True
for i in range(num_gemms):
weights[i].offloading_activation = False
weights[i].main_grad.offloading_activation = False
if weights_fp8[i] is not None:
weights_fp8[i].offloading_activation = False
ctx.fine_grained_activation_offloading = fine_grained_activation_offloading
if fine_grained_activation_offloading and cpu_offloading:
raise ValueError(
f"Do not use fine_grained_activation_offloading and cpu_offloading at the same time."
)
if (
fine_grained_activation_offloading
and weights[0].requires_grad
and fuse_wgrad_accumulation
):
grad_added_to_main_grad_list = []
for weight in weights:
if weight.requires_grad and hasattr(weight, "grad_added_to_main_grad"):
grad_added_to_main_grad_list.append(weight.grad_added_to_main_grad)
weight.grad_added_to_main_grad = True
else:
grad_added_to_main_grad_list.append(None)
ctx.grad_added_to_main_grad_list = grad_added_to_main_grad_list
ctx.save_for_backward(
None,
*saved_inputmats,
@@ -194,7 +166,7 @@ class _BatchLinear(torch.autograd.Function):
*weights,
*weights_fp8,
*[
w.main_grad if (cpu_offloading or fine_grained_activation_offloading) and fuse_wgrad_accumulation else None
w.main_grad if cpu_offloading and fuse_wgrad_accumulation else None
for w in weights
],
)
@@ -233,13 +205,11 @@ class _BatchLinear(torch.autograd.Function):
weights = saved_tensors[2 * ctx.num_gemms : 3 * ctx.num_gemms]
weights_fp8 = saved_tensors[3 * ctx.num_gemms : 4 * ctx.num_gemms]
main_grads = saved_tensors[4 * ctx.num_gemms :]
if (ctx.cpu_offloading or ctx.fine_grained_activation_offloading) and ctx.fuse_wgrad_accumulation:
if ctx.cpu_offloading and ctx.fuse_wgrad_accumulation:
for i in range(ctx.num_gemms):
w = torch.nn.Parameter(weights[i], weights[i].requires_grad)
w.main_grad = main_grads[i]
weights[i] = w
if ctx.fine_grained_activation_offloading and weights[i].requires_grad:
weights[i].grad_added_to_main_grad = ctx.grad_added_to_main_grad_list[i]
global _GEMM_INPUT, _GEMM_WEIGHT, _GRAD_OUTPUT
grad_output = grad_output.contiguous()
@@ -371,7 +341,6 @@ class _BatchLinear(torch.autograd.Function):
None, # activation_dtype
None, # parallel_mode
None, # is_grad_enabled
None, # fine_grained_activation_offloading
*wgrad_list,
*([None] * ctx.num_gemms), # weights_fp8
*grad_biases,
@@ -462,7 +431,6 @@ class BatchedLinear(TransformerEngineBaseModule):
device: Union[torch.device, str] = "cuda",
ub_overlap_rs: bool = False,
ub_overlap_ag: bool = False,
fine_grained_activation_offloading: bool = False,
ub_name: Optional[str] = None,
delay_wgrad_compute: bool = False,
) -> None:
@@ -486,8 +454,6 @@ class BatchedLinear(TransformerEngineBaseModule):
self.get_rng_state_tracker = get_rng_state_tracker
self.rng_tracker_name = rng_tracker_name
self.fine_grained_activation_offloading = fine_grained_activation_offloading
self.wgrad_store = WeightGradStore(delay_wgrad_compute)
global _GEMM_INPUT, _GEMM_WEIGHT, _GEMM_OUTPUT
@@ -665,7 +631,6 @@ class BatchedLinear(TransformerEngineBaseModule):
self.activation_dtype,
self.parallel_mode,
torch.is_grad_enabled(),
self.fine_grained_activation_offloading,
*weight_tensors,
*weight_tensors_fp8,
*bias_tensors,
@@ -84,7 +84,6 @@ class _GroupedLinear(torch.autograd.Function):
module,
skip_fp8_weight_update,
save_original_input,
fine_grained_activation_offloading,
*weights_and_biases,
) -> torch.Tensor:
# pylint: disable=missing-function-docstring
@@ -222,16 +221,6 @@ class _GroupedLinear(torch.autograd.Function):
else:
inputmats = [None] * num_gemms
for i in range(num_gemms):
weights[i].offloading_activation = False
weights_fp8[i].offloading_activation = False
biases[i].offloading_activation = False
ctx.fine_grained_activation_offloading = fine_grained_activation_offloading
if fine_grained_activation_offloading and cpu_offloading:
raise ValueError(
f"Do not use fine_grained_activation_offloading and cpu_offloading at the same time."
)
if cpu_offloading:
ctx.grad_added_to_main_grad = hasattr(weights[0], "grad_added_to_main_grad")
@@ -244,21 +233,6 @@ class _GroupedLinear(torch.autograd.Function):
ctx.weight_objects = []
for weight in weights:
ctx.weight_objects.append(weight)
if (
fine_grained_activation_offloading
and weights[0].requires_grad
and fuse_wgrad_accumulation
):
grad_added_to_main_grad_list = []
ctx.grad_added_to_main_grad = hasattr(weights[0], "grad_added_to_main_grad")
for weight in weights:
if ctx.grad_added_to_main_grad:
grad_added_to_main_grad_list.append(weight.grad_added_to_main_grad)
weight.grad_added_to_main_grad = True
ctx.weight_objects.append(weight)
else:
grad_added_to_main_grad_list.append(None)
ctx.grad_added_to_main_grad_list = grad_added_to_main_grad_list
tensors_to_save, tensor_objects = prepare_for_saving(
*inputmats,
@@ -322,15 +296,12 @@ class _GroupedLinear(torch.autograd.Function):
biases = saved_tensors[3 * N : 4 * N]
main_grads = [main_grad_func() for main_grad_func in ctx.main_grad_funcs]
if ctx.cpu_offloading or ctx.fine_grained_activation_offloading:
if ctx.cpu_offloading:
if ctx.grad_added_to_main_grad:
for i, weight in enumerate(ctx.weight_objects):
origin_weights[i] = ctx.weight_objects[i]
ctx.weight_objects[i] = None
if ctx.fine_grained_activation_offloading:
origin_weights[i].grad_added_to_main_grad = ctx.grad_added_to_main_grad_list[i]
if ctx.fuse_wgrad_accumulation:
for i in range(N):
origin_weights[i].main_grad = main_grads[i]
@@ -545,7 +516,6 @@ class _GroupedLinear(torch.autograd.Function):
None,
None,
None,
None,
*wgrad_list,
*grad_biases,
)
@@ -629,7 +599,6 @@ class GroupedLinear(TransformerEngineBaseModule):
ub_overlap_rs: bool = False,
ub_overlap_ag: bool = False,
ub_name: Optional[str] = None,
fine_grained_activation_offloading: bool = False,
delay_wgrad_compute: bool = False,
save_original_input: bool = False,
) -> None:
@@ -652,7 +621,6 @@ class GroupedLinear(TransformerEngineBaseModule):
), "GroupedLinear doesn't support Userbuffer overlap."
self.get_rng_state_tracker = get_rng_state_tracker
self.rng_tracker_name = rng_tracker_name
self.fine_grained_activation_offloading = fine_grained_activation_offloading
self.wgrad_store = WeightGradStore(delay_wgrad_compute)
@@ -872,7 +840,6 @@ class GroupedLinear(TransformerEngineBaseModule):
self,
skip_fp8_weight_update,
self.save_original_input,
self.fine_grained_activation_offloading,
*weight_tensors,
*bias_tensors,
)
@@ -40,7 +40,6 @@ from ..utils import (
nvtx_range_push,
requires_grad,
needs_quantized_gemm,
get_activation_offloading,
)
from ..distributed import (
set_tensor_model_parallel_attributes,
@@ -137,7 +136,6 @@ class _LayerNormLinear(torch.autograd.Function):
ub_bulk_wgrad: bool,
ub_bulk_dgrad: bool,
ub_name: str,
fine_grained_activation_offloading: bool,
fsdp_group: Union[dist_group_type, None],
module: torch.nn.Module,
skip_fp8_weight_update: bool,
@@ -593,11 +591,10 @@ class _LayerNormLinear(torch.autograd.Function):
# For CPU offloading, we offloaded weight and weight.main_grad to different tensors,
# we need to connect them into one.
if ctx.cpu_offloading or ctx.fine_grained_activation_offloading or int(os.getenv("NVTE_SWAP_OVERLAP_GRAD", "0")):
if ctx.has_grad_added_to_main_grad:
if ctx.cpu_offloading or int(os.getenv("NVTE_SWAP_OVERLAP_GRAD", "0")):
if ctx.grad_added_to_main_grad:
origin_weight = ctx.weight_object
if ctx.fine_grained_activation_offloading:
origin_weight.grad_added_to_main_grad = ctx.grad_added_to_main_grad
if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation:
origin_weight.main_grad = main_grad
@@ -1077,7 +1074,6 @@ class _LayerNormLinear(torch.autograd.Function):
None, # ub_bulk_dgrad
None, # ub_bulk_wgrad
None, # ub_name
None, # fine_grained_activation_offloading
None, # fsdp_group
None, # debug
None, # module
@@ -1215,7 +1211,6 @@ class LayerNormLinear(TransformerEngineBaseModule):
delay_wgrad_compute: bool = False,
symmetric_ar_type: Optional[str] = None,
name: str = None,
fine_grained_activation_offloading: bool = False,
) -> None:
super().__init__()
@@ -1234,7 +1229,6 @@ class LayerNormLinear(TransformerEngineBaseModule):
)
self.zero_centered_gamma = zero_centered_gamma
self.symmetric_ar_type = symmetric_ar_type
self.fine_grained_activation_offloading = fine_grained_activation_offloading
self.wgrad_store = WeightGradStore(delay_wgrad_compute, ub_bulk_wgrad)
self.name = name
@@ -1640,7 +1634,6 @@ class LayerNormLinear(TransformerEngineBaseModule):
self.ub_bulk_wgrad,
self.ub_bulk_dgrad,
self.ub_name,
self.fine_grained_activation_offloading,
self.fsdp_group,
self,
skip_fp8_weight_update,
@@ -39,7 +39,6 @@ from ..utils import (
assert_dim_for_all_gather,
nvtx_range_pop,
nvtx_range_push,
get_activation_offloading,
)
from ..distributed import (
set_tensor_model_parallel_attributes,
@@ -417,30 +416,10 @@ class _Linear(torch.autograd.Function):
)
nvtx_range_pop(f"{nvtx_label}.fsdp_scatter")
ctx.fine_grained_activation_offloading = fine_grained_activation_offloading
if fine_grained_activation_offloading and cpu_offloading:
raise ValueError(
f"Do not use fine_grained_activation_offloading and cpu_offloading at the same time."
)
if (
fine_grained_activation_offloading
and weight.requires_grad
and fuse_wgrad_accumulation
):
if hasattr(weight, "grad_added_to_main_grad"):
ctx.has_grad_added_to_main_grad = True
ctx.grad_added_to_main_grad = weight.grad_added_to_main_grad
weight.grad_added_to_main_grad = True
ctx.weight_object = weight
else:
ctx.has_grad_added_to_main_grad = False
if cpu_offloading or int(os.getenv("NVTE_SWAP_OVERLAP_GRAD", "0")):
ctx.has_grad_added_to_main_grad = hasattr(weight, "grad_added_to_main_grad")
ctx.grad_added_to_main_grad = hasattr(weight, "grad_added_to_main_grad")
if ctx.has_grad_added_to_main_grad:
if ctx.grad_added_to_main_grad:
# If you are passing torch.nn.Parameter through the Torch hooks, you will
# get back torch.Tensor. Torch rips off the Parameter wrapper.
# You need to preserve the weight object to have all the attributes user
@@ -537,11 +516,10 @@ class _Linear(torch.autograd.Function):
else None
)
if ctx.cpu_offloading or ctx.fine_grained_activation_offloading or int(os.getenv("NVTE_SWAP_OVERLAP_GRAD", "0")):
if ctx.has_grad_added_to_main_grad:
if ctx.cpu_offloading or int(os.getenv("NVTE_SWAP_OVERLAP_GRAD", "0")):
if ctx.grad_added_to_main_grad:
weight = ctx.weight_object
if ctx.fine_grained_activation_offloading:
weight.grad_added_to_main_grad = ctx.grad_added_to_main_grad
if ctx.requires_wgrad and ctx.fuse_wgrad_accumulation:
weight.main_grad = main_grad
@@ -1031,7 +1009,6 @@ class _Linear(torch.autograd.Function):
None, # ub_bulk_dgrad
None, # ub_bulk_wgrad
None, # ub_name
None, # fine_grained_activation_offloading
None, # fp8_output
None, # fsdp_group
None, # module
@@ -1156,7 +1133,6 @@ class Linear(TransformerEngineBaseModule):
symmetric_ar_type: Optional[str] = None,
save_original_input: bool = False,
name: Optional[str] = None,
fine_grained_activation_offloading: bool = False,
) -> None:
super().__init__()
@@ -1172,7 +1148,6 @@ class Linear(TransformerEngineBaseModule):
self.symmetric_ar_type = symmetric_ar_type
self.save_original_input = save_original_input
self.name = name
self.fine_grained_activation_offloading = fine_grained_activation_offloading
self.wgrad_store = WeightGradStore(delay_wgrad_compute, ub_bulk_wgrad)
@@ -1521,7 +1496,6 @@ class Linear(TransformerEngineBaseModule):
self.ub_bulk_dgrad,
self.ub_bulk_wgrad,
self.ub_name,
self.fine_grained_activation_offloading,
fp8_output,
self.fsdp_group,
self,
@@ -804,30 +804,3 @@ def make_weak_ref(x):
if x is None:
return None
raise TypeError(f"Invalid type {type(x)} to make weak ref")
ActivationOffloadEnabled = False


def get_activation_offloading():
    global ActivationOffloadEnabled
    return ActivationOffloadEnabled


def set_activation_offloading(activation_offloading):
    global ActivationOffloadEnabled
    ActivationOffloadEnabled = activation_offloading


class ActivationOffloadContextManager:
    """A reusable context manager for switch ActivationOffloadEnabled"""

    def __init__(self, activation_offloading):
        self.activation_offloading = activation_offloading

    def __enter__(self):
        self.origin_cpu_offloading = get_activation_offloading()
        set_activation_offloading(self.activation_offloading)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        set_activation_offloading(self.origin_cpu_offloading)
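The removed block above is a module-level flag with a getter/setter and a context manager that saves and restores it. Below is a minimal usage sketch, not taken from the commit, assuming only the helpers defined above; the asserts simply illustrate the flag's lifecycle.

    # Hypothetical usage of the removed helpers: enable the global flag for a
    # region of code, then restore the previous value on exit.
    assert get_activation_offloading() is False      # module-level default

    with ActivationOffloadContextManager(activation_offloading=True):
        # Code running here sees the flag enabled and could mark tensors saved
        # for backward as candidates for offloading to host memory.
        assert get_activation_offloading() is True

    assert get_activation_offloading() is False      # previous value restored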