Commit 177291ac authored by wenjh

Merge branch 'develop_v2.9' into 'develop_v2.9'

grouped_linear supports activation offloading

See merge request dcutoolkit/deeplearing/TransformerEngine!62
parents 99e60246 db0ad945
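
For context on what the change enables: activation offloading keeps tensors saved for backward in host memory instead of device memory, and brings them back when gradients are computed. The snippet below is a minimal, generic sketch of that idea for a single linear layer; it is purely illustrative (the class name and the `inp_cpu` attribute are hypothetical) and is not how TransformerEngine implements it.

```python
import torch

class OffloadedLinear(torch.autograd.Function):
    """Minimal sketch: the activation saved for the weight gradient
    lives on the CPU between forward and backward."""

    @staticmethod
    def forward(ctx, inp, weight):
        out = inp @ weight.t()
        # Offload the saved activation to host memory instead of keeping it on the GPU.
        ctx.inp_cpu = inp.detach().to("cpu", non_blocking=True)
        ctx.save_for_backward(weight)
        return out

    @staticmethod
    def backward(ctx, grad_out):
        (weight,) = ctx.saved_tensors
        # Reload the activation onto the device before computing gradients.
        inp = ctx.inp_cpu.to(grad_out.device, non_blocking=True)
        grad_inp = grad_out @ weight
        grad_weight = grad_out.t() @ inp
        return grad_inp, grad_weight
```

Usage would be `y = OffloadedLinear.apply(x, w)`. The hunks below wire the analogous bookkeeping into `_GroupedLinear`, once per GEMM, and coordinate it with fused weight-gradient accumulation.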
@@ -213,6 +213,16 @@ class _GroupedLinear(torch.autograd.Function):
                 if isinstance(weight, QuantizedTensorStorage):
                     weight.update_usage(columnwise_usage=True)
+            for i in range(num_gemms):
+                weights[i].offloading_activation = False
+                weights_fp8[i].offloading_activation = False
+                biases[i].offloading_activation = False
+            ctx.fine_grained_activation_offloading = fine_grained_activation_offloading
+            if fine_grained_activation_offloading and cpu_offloading:
+                raise ValueError(
+                    f"Do not use fine_grained_activation_offloading and cpu_offloading at the same time."
+                )
             if cpu_offloading:
                 ctx.grad_added_to_main_grad = hasattr(weights[0], "grad_added_to_main_grad")
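
The hunk above tags weights and biases with `offloading_activation = False` so that only true activations are offloaded, and it rejects enabling both offloading paths at once. A hypothetical saved-tensor hook consuming that flag might look like the sketch below, under the assumption that saved tensors are offloaded unless explicitly opted out; `maybe_offload` is not a real TransformerEngine function.

```python
import torch

def maybe_offload(saved_tensor: torch.Tensor) -> torch.Tensor:
    # Offload saved tensors to host memory unless they were tagged with
    # offloading_activation = False (as the weights and biases above are).
    if getattr(saved_tensor, "offloading_activation", True):
        return saved_tensor.to("cpu", non_blocking=True)
    return saved_tensor
```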
@@ -225,6 +235,21 @@ class _GroupedLinear(torch.autograd.Function):
                 ctx.weight_objects = []
                 for weight in weights:
                     ctx.weight_objects.append(weight)
+            if (
+                fine_grained_activation_offloading
+                and weights[0].requires_grad
+                and fuse_wgrad_accumulation
+            ):
+                grad_added_to_main_grad_list = []
+                ctx.grad_added_to_main_grad = hasattr(weights[0], "grad_added_to_main_grad")
+                for weight in weights:
+                    if ctx.grad_added_to_main_grad:
+                        grad_added_to_main_grad_list.append(weight.grad_added_to_main_grad)
+                        weight.grad_added_to_main_grad = True
+                        ctx.weight_objects.append(weight)
+                    else:
+                        grad_added_to_main_grad_list.append(None)
+                ctx.grad_added_to_main_grad_list = grad_added_to_main_grad_list
             tensors_to_save, tensor_objects = prepare_for_saving(
                 *inputmats,
@@ -288,12 +313,15 @@ class _GroupedLinear(torch.autograd.Function):
             biases = saved_tensors[3 * N : 4 * N]
             main_grads = [main_grad_func() for main_grad_func in ctx.main_grad_funcs]
-            if ctx.cpu_offloading:
+            if ctx.cpu_offloading or ctx.fine_grained_activation_offloading:
                 if ctx.grad_added_to_main_grad:
                     for i, weight in enumerate(ctx.weight_objects):
                         origin_weights[i] = ctx.weight_objects[i]
                         ctx.weight_objects[i] = None
+                        if ctx.fine_grained_activation_offloading:
+                            origin_weights[i].grad_added_to_main_grad = ctx.grad_added_to_main_grad_list[i]
                 if ctx.fuse_wgrad_accumulation:
                     for i in range(N):
                         origin_weights[i].main_grad = main_grads[i]
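
Taken together, this hunk and the forward-pass hunk above implement a save/restore of the `grad_added_to_main_grad` flag used with fused weight-gradient accumulation: the forward pass stashes each weight's original value on `ctx` and sets the flag to True while the weight object is carried across the autograd boundary, and the backward pass restores the original value once the weight is recovered. A condensed sketch of that pattern, with hypothetical helper names:

```python
def stash_grad_flags(ctx, weights):
    # Forward side: remember each weight's original flag and mark it True while
    # the weight object is kept alive on ctx for the offloaded backward pass.
    ctx.grad_added_to_main_grad_list = []
    for w in weights:
        if hasattr(w, "grad_added_to_main_grad"):
            ctx.grad_added_to_main_grad_list.append(w.grad_added_to_main_grad)
            w.grad_added_to_main_grad = True
        else:
            ctx.grad_added_to_main_grad_list.append(None)


def restore_grad_flags(ctx, weights):
    # Backward side: put the original flag back once the weight gradient
    # has been handled.
    for w, prev in zip(weights, ctx.grad_added_to_main_grad_list):
        if prev is not None:
            w.grad_added_to_main_grad = prev
```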