Unverified commit d3efaebb authored by Sudhakar Singh, committed by GitHub
Browse files

Delete extra tensor objects after restoring float8 tensors (#1500)



* delete extra tensor objects after restoring float8 tensors
Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* nit fix
Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com>

* fix the leak in float8tensor and mxfloat8tensor classes
Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* uncomment the fix
Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix lint
Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com>

---------
Signed-off-by: Sudhakar Singh <sudhakars@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
parent 303c6d16
......@@ -448,6 +448,9 @@ class _LayerNormLinear(torch.autograd.Function):
mu,
rsigma,
) = restore_from_saved(ctx.tensor_objects, saved_tensors)
# Delete the references to tensor objects once they've been consumed
# by the `restore_from_saved` method to construct back the actual tensors.
ctx.tensor_objects = None
# Since main_grad can be modified inplace, it should not be a part of saved_tensors
main_grad = (
......
......@@ -567,6 +567,10 @@ class _LayerNormMLP(torch.autograd.Function):
mu,
rsigma,
) = restore_from_saved(ctx.tensor_objects, saved_tensors)
# Delete the references to tensor objects once they've been consumed
# by the `restore_from_saved` method to construct back the actual tensors.
ctx.tensor_objects = None
# Since main_grad can be modified inplace, it should not be a part of saved_tensors
fc1_weight_main_grad = (
ctx.fc1_main_grad
......
......@@ -354,6 +354,9 @@ class _Linear(torch.autograd.Function):
inputmat, weight_fp8, weight, bias = ( # pylint: disable=unbalanced-tuple-unpacking
restore_from_saved(ctx.tensor_objects, saved_tensors)
)
# Delete the references to tensor objects once they've been consumed
# by the `restore_from_saved` method to construct back the actual tensors.
ctx.tensor_objects = None
# Since main_grad can be modified inplace, it should not be a part of saved_tensors
main_grad = (
......
......@@ -105,8 +105,8 @@ class Float8TensorBase:
"""
tensors = [self._data, self._transpose]
# self._data = None
# self._transpose = None
self._data = None
self._transpose = None
return tensors, self
def restore_from_saved(
......
......@@ -100,8 +100,8 @@ class MXFP8TensorBase:
"""
tensors = [self._rowwise_data, self._columnwise_data]
# self._rowwise_data = None
# self._columnwise_data = None
self._rowwise_data = None
self._columnwise_data = None
return tensors, self
def restore_from_saved(
......
......@@ -348,6 +348,15 @@ class Float8Tensor(Float8TensorBase, QuantizedTensor):
self._transpose = torch.Tensor() if self._transpose is not None else None
self._transpose_invalid = True
def prepare_for_saving(self) -> Tuple[list[Optional[torch.Tensor]], Float8TensorBase]:
    """Prepare this tensor for being saved for backward.

    Hands the tensor object itself over as the single entry of the
    saved-tensors list, with ``None`` as the retained tensor-base
    placeholder, so no separate base object keeps references alive.
    """
    saved: list[Optional[torch.Tensor]] = [self]
    return saved, None
@classmethod
def __torch_dispatch__(cls, func, types, args, kwargs=None):
......
......@@ -285,6 +285,15 @@ class MXFP8Tensor(MXFP8TensorBase, QuantizedTensor):
self._rowwise_data = torch.Tensor() if self._rowwise_data is not None else None
self._columnwise_data = torch.Tensor() if self._columnwise_data is not None else None
def prepare_for_saving(self) -> Tuple[list[Optional[torch.Tensor]], MXFP8TensorBase]:
    """Prepare this tensor for being saved for backward.

    Hands the tensor object itself over as the single entry of the
    saved-tensors list, with ``None`` as the retained tensor-base
    placeholder, so no separate base object keeps references alive.
    """
    saved: list[Optional[torch.Tensor]] = [self]
    return saved, None
@classmethod
def __torch_dispatch__(cls, func, types, args, kwargs=None):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment