Unverified Commit 58d2ebab authored by Kirthi Shankar Sivamani's avatar Kirthi Shankar Sivamani Committed by GitHub
Browse files

Deprecate unused APIs (#321)



* Deprecate unused APIs
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* review comments
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Review
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

---------
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent b172bad8
......@@ -4,6 +4,7 @@
"""LayerNormLinear API"""
import os
import warnings
from typing import Union, Optional, Callable, Tuple, List, Dict, Any
......@@ -538,6 +539,11 @@ class LayerNormLinear(TransformerEngineBaseModule):
r"""
Applies layer normalization followed by linear transformation to the incoming data.
.. warning::
Argument :attr:`skip_weight_param_allocation` is deprecated and will
be fully removed in future releases.
Parameters
----------
in_features : int
......@@ -585,9 +591,6 @@ class LayerNormLinear(TransformerEngineBaseModule):
used to decide whether this Linear layer is Column Parallel Linear or Row
Parallel Linear as described `here <https://arxiv.org/pdf/1909.08053.pdf>`_.
When set to `None`, no communication is performed.
skip_weight_param_allocation: bool, default = `False`
if set to `True`, weight parameter is not allocated and must be
passed as a keyword argument `weight` during the forward pass.
Optimization parameters
-----------------------
......@@ -633,6 +636,14 @@ class LayerNormLinear(TransformerEngineBaseModule):
) -> None:
super().__init__()
if skip_weight_param_allocation:
warnings.warn(
"Argument `skip_weight_param_allocation` is deprecated and "
"will be fully removed in future releases. It is ignored "
"starting from v0.11.",
category=DeprecationWarning,
)
params_dtype = torch.get_default_dtype() if params_dtype is None else params_dtype
self.in_features = in_features
self.out_features = out_features
......@@ -695,7 +706,6 @@ class LayerNormLinear(TransformerEngineBaseModule):
setattr(self.layer_norm_bias, "sequence_parallel", self.sequence_parallel)
self.reset_layer_norm_parameters()
if not skip_weight_param_allocation:
self.weight_tensor = torch.empty(
self.out_features, self.in_features,
device=torch.cuda.current_device(),
......@@ -821,17 +831,15 @@ class LayerNormLinear(TransformerEngineBaseModule):
"""
Apply layer normalization to the input followed by a linear transformation.
.. warning::
Arguments :attr:`weight` and :attr:`bias` are deprecated and will
be fully removed in future releases.
Parameters
----------
inp : torch.Tensor
Input tensor.
weight : torch.Tensor, default = None
An optional weight tensor for the module. This argument is compulsory if module
is initialized with `skip_weight_param_allocation=True`
bias : torch.Tensor, default = None
An optional bias tensor for the module. This argument is compulsory if module
is initialized with `skip_weight_param_allocation=True` and one of `use_bias`
or `return_bias`
is_first_microbatch : {True, False, None}, default = None
During training using either gradient accumulation or
pipeline parallelism a minibatch of data is further split
......@@ -847,16 +855,20 @@ class LayerNormLinear(TransformerEngineBaseModule):
produced)
"""
if weight is not None or bias is not None:
raise RuntimeError(
"Arguments `weight` and `bias` are deprecated and "
"will be fully removed in future releases."
)
with self.prepare_forward(inp, is_first_microbatch) as inp:
bias_tensor = (
bias if bias is not None
else self.bias if self.parameters_split is None
self.bias if self.parameters_split is None
else self.bias_tensor if not torch.is_grad_enabled()
else self.noop_cat("bias_tensor", self.bias_names)
)
weight_tensor = (
weight if weight is not None
else self.weight if self.parameters_split is None
self.weight if self.parameters_split is None
else self.weight_tensor if not torch.is_grad_enabled()
else self.noop_cat("weight_tensor", self.weight_names)
)
......
......@@ -3,6 +3,7 @@
# See LICENSE for license information.
"""Linear API"""
import warnings
from typing import Union, Optional, Callable, Tuple, List, Dict, Any
import torch
......@@ -441,6 +442,11 @@ class Linear(TransformerEngineBaseModule):
On NVIDIA GPUs it is a drop-in replacement for `torch.nn.Linear`.
.. warning::
Argument :attr:`skip_weight_param_allocation` is deprecated and will
be fully removed in future releases.
Parameters
----------
in_features : int
......@@ -474,9 +480,6 @@ class Linear(TransformerEngineBaseModule):
used to decide whether this Linear layer is Column Parallel Linear or Row
Parallel Linear as described `here <https://arxiv.org/pdf/1909.08053.pdf>`_.
When set to `None`, no communication is performed.
skip_weight_param_allocation: bool, default = `False`
if set to `True`, weight parameter is not allocated and must be
passed as a keyword argument `weight` during the forward pass.
Optimization parameters
-----------------------
......@@ -518,6 +521,14 @@ class Linear(TransformerEngineBaseModule):
) -> None:
super().__init__()
if skip_weight_param_allocation:
warnings.warn(
"Argument `skip_weight_param_allocation` is deprecated and "
"will be fully removed in future releases. It is ignored "
"starting from v0.11.",
category=DeprecationWarning,
)
params_dtype = torch.get_default_dtype() if params_dtype is None else params_dtype
self.in_features = in_features
self.out_features = out_features
......@@ -558,7 +569,6 @@ class Linear(TransformerEngineBaseModule):
self.sequence_parallel = (self.tp_size > 1) and sequence_parallel
if not skip_weight_param_allocation:
self.weight_tensor = torch.empty(
self.out_features, self.in_features,
device=torch.cuda.current_device(),
......@@ -668,17 +678,15 @@ class Linear(TransformerEngineBaseModule):
"""
Apply the linear transformation to the input.
.. warning::
Arguments :attr:`weight` and :attr:`bias` are deprecated and will
be fully removed in future releases.
Parameters
----------
inp : torch.Tensor
Input tensor.
weight : torch.Tensor, default = None
An optional weight tensor for the module. This argument is compulsory if module
is initialized with `skip_weight_param_allocation=True`
bias : torch.Tensor, default = None
An optional bias tensor for the module. This argument is compulsory if module
is initialized with `skip_weight_param_allocation=True` and one of `use_bias`
or `return_bias`
is_first_microbatch : {True, False, None}, default = None
During training using either gradient accumulation or
pipeline parallelism a minibatch of data is further split
......@@ -694,16 +702,20 @@ class Linear(TransformerEngineBaseModule):
produced)
"""
if weight is not None or bias is not None:
raise RuntimeError(
"Arguments `weight` and `bias` are deprecated and "
"will be fully removed in future releases."
)
with self.prepare_forward(inp, is_first_microbatch) as inp:
bias_tensor = (
bias if bias is not None
else self.bias if self.parameters_split is None
self.bias if self.parameters_split is None
else self.bias_tensor if not torch.is_grad_enabled()
else self.noop_cat("bias_tensor", self.bias_names)
)
weight_tensor = (
weight if weight is not None
else self.weight if self.parameters_split is None
self.weight if self.parameters_split is None
else self.weight_tensor if not torch.is_grad_enabled()
else self.noop_cat("weight_tensor", self.weight_names)
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.