Unverified Commit 2b7d280b authored by Masaki Kozuki, committed by GitHub

[transformer][pipeline parallel] warn if deallocation is enabled (#1365)

This is cherry-picked for easier comparison with megatron-lm.
parent 77f9d73c
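For context on what the new warning guards: in Megatron-style pipeline schedules, the deallocation option frees each stage's output activation once it has been sent downstream, keeping only a tiny placeholder tensor so the Python object (and the autograd bookkeeping that references it) stays alive. The sketch below illustrates that general idea only; it is an assumption about the technique, not apex's implementation, and `free_output_storage` is a hypothetical helper name.

```python
import torch

def free_output_storage(out: torch.Tensor) -> None:
    # Rough sketch of the deallocation idea (an assumption, not apex's code):
    # point `.data` at a 1-element placeholder so the large activation buffer
    # behind `out` can be reclaimed while the Python object stays alive.
    out.data = torch.empty((1,), device=out.device, dtype=out.dtype)

stage_output = torch.randn(16, 1024)   # stand-in for a pipeline-stage output
free_output_storage(stage_output)      # the 16x1024 buffer is now collectible
print(stage_output.shape)              # torch.Size([1])
```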
@@ -335,6 +335,9 @@ def backward_step(
        input_tensor:
        output_tensor:
        output_tensor_grad:
    Keyword Arguments:
        grad_scaler:
        deallocate_pipeline_outputs: Experimental.
    Returns:
        input_tensor_grad
    """
...
from typing import List, Union, Optional, Sequence
import warnings
import torch
@@ -70,6 +71,12 @@ def _forward_backward_pipelining_with_interleaving(
    if not isinstance(model, list):
        raise RuntimeError("`model` must be a list of `nn.Module`'s'")
    if deallocate_pipeline_outputs:
        warnings.warn(
            "`deallocate_pipeline_outputs` is experimental and subject to change. "
            "This option is not recommended."
        )
    num_model_chunks: int = len(model)
    input_tensors: List[List[Union[None, torch.Tensor]]] = [
        [] for _ in range(num_model_chunks)
...
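Both schedules now emit the same message when the flag is enabled. A caller who has deliberately opted into the experimental deallocation path can silence it with the standard library's warning filters; nothing apex-specific is required. A minimal sketch follows (the actual schedule invocation is omitted, since it needs a model, a data iterator, and an initialized parallel state):

```python
import warnings

# Silence only this specific message, leaving other warnings visible.
warnings.filterwarnings(
    "ignore",
    message=r"`deallocate_pipeline_outputs` is experimental",
)

# Or scope the suppression to a single call site:
with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)
    ...  # run the pipeline schedule with deallocate_pipeline_outputs=True here
```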
from typing import Union, List, Optional, Sequence
import warnings
import torch
@@ -196,6 +197,12 @@ def forward_backward_pipelining_without_interleaving(
    """
    # timers = get_timers()
    if deallocate_pipeline_outputs:
        warnings.warn(
            "`deallocate_pipeline_outputs` is experimental and subject to change. "
            "This option is not recommended."
        )
    model: List[torch.nn.Module] = listify_model(model)
    if len(model) != 1:
        msg = f"`model` is expected be a `nn.Module`, but {type(model)}"
...
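To check that the guard behaves as intended, the warning can be recorded in a test. The sketch below uses a hypothetical stand-in function (`run_schedule`) rather than apex's real entry points, which require a model, data iterators, and distributed setup; it only mirrors the conditional `warnings.warn` added by this commit.

```python
import warnings

def run_schedule(deallocate_pipeline_outputs: bool = False) -> None:
    # Hypothetical stand-in for the two schedule entry points touched here;
    # it only reproduces the warning behaviour added by this commit.
    if deallocate_pipeline_outputs:
        warnings.warn(
            "`deallocate_pipeline_outputs` is experimental and subject to change. "
            "This option is not recommended."
        )

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    run_schedule(deallocate_pipeline_outputs=True)

assert len(caught) == 1
assert issubclass(caught[0].category, UserWarning)
assert "experimental" in str(caught[0].message)
```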