Commit 65e6bc32 authored by Lawrence McAfee

fixed.

parent 4c598f9d
@@ -23,6 +23,8 @@ from torch.nn.parameter import Parameter
 from torch.nn import init
 import importlib
 
+from megatron.mpu import make_viewless_tensor
+
 try:
     from apex.contrib.layer_norm.layer_norm import FastLayerNormFN
     HAVE_PERSIST_LAYER_NORM = True
@@ -100,35 +102,21 @@ class MixedFusedLayerNorm(torch.nn.Module):
         init.zeros_(self.bias)
 
-    # def forward(self, input):
-    #     if self.no_persist_layer_norm:
-    #         return FusedLayerNormAffineFunction.apply(
-    #             input, self.weight, self.bias, self.normalized_shape, self.eps)
-    #     else:
-    #         return FastLayerNormFN.apply(
-    #             input, self.weight, self.bias, self.eps)
     def forward(self, input):
         if self.no_persist_layer_norm:
-            result = FusedLayerNormAffineFunction.apply(
+            return FusedLayerNormAffineFunction.apply(
                 input, self.weight, self.bias, self.normalized_shape, self.eps)
         else:
-            result = FastLayerNormFN.apply(
+            output = FastLayerNormFN.apply(
                 input, self.weight, self.bias, self.eps)
-            result = make_viewless_tensor(inp = input, requires_grad = input.requires_grad, keep_grad = True)
-        # >>>
-        # if torch.distributed.get_rank() == 3:
-        #     # from lutil import pax
-        #     # pax({"result": result})
-        #     from megatron import get_args
-        #     args = get_args()
-        #     raise Exception("r %d ... hid %d, persist %d, view %d." % (
-        #         torch.distributed.get_rank(),
-        #         args.hidden_size,
-        #         not args.no_persist_layer_norm,
-        #         result._base is not None,
-        #     ))
-        # <<<
-        return result
+            # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
+            # a populated '_base' field). This will result in schedule.py's
+            # deallocate_output_tensor() throwing an error, so a viewless tensor is
+            # created to prevent this.
+            output = make_viewless_tensor(inp = output,
+                                          requires_grad = input.requires_grad,
+                                          keep_graph = True)
+
+            return output
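The hunk above replaces an earlier, broken wrapping attempt (it passed `inp = input` and the misspelled keyword `keep_grad = True`) with a call that wraps the FastLayerNormFN output and passes `keep_graph = True`. For readers unfamiliar with the helper, below is a minimal sketch of how a make_viewless_tensor-style utility can drop a tensor's '_base' link while keeping its autograd history; the names and structure here are illustrative and may differ from the actual megatron.mpu implementation.

import torch

def _make_viewless(inp, requires_grad):
    # Allocate a fresh tensor object, then point its '.data' at the view's
    # storage. The result shares memory with 'inp' but has no '_base', so
    # later code can replace its '.data' without touching a view chain.
    out = torch.empty((1,), dtype=inp.dtype, device=inp.device,
                      requires_grad=requires_grad)
    out.data = inp.data
    return out

class MakeViewlessTensor(torch.autograd.Function):
    # Identity in autograd terms; only the forward output loses its '_base'.
    @staticmethod
    def forward(ctx, inp, requires_grad):
        return _make_viewless(inp, requires_grad)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output, None

def make_viewless_tensor(inp, requires_grad, keep_graph):
    # Nothing to do if the tensor is not a view.
    if inp._base is None:
        return inp
    # keep_graph=True preserves the autograd graph, which forward() needs so
    # that gradients still flow back through the layer norm kernel.
    if keep_graph:
        return MakeViewlessTensor.apply(inp, requires_grad)
    return _make_viewless(inp, requires_grad)

Only the FastLayerNormFN branch needs the wrapper, since (per the new comment) that kernel is the one returning a tensor with a populated '_base'.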
@@ -337,12 +337,6 @@ class TransformerLanguageModel(MegatronModule):
         else:
             self.encoder = None
 
-        # >>>
-        # if torch.distributed.get_rank() == 3:
-        #     print(self.encoder)
-        #     raise Exception("bye.")
-        # <<<
-
         # Decoder (usually set to False, True if part of an encoder-decoder
         # architecture and in decoder-only stage).
         if self.add_decoder:
@@ -651,14 +651,6 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
         if not forward_only:
             input_tensors.append(input_tensor)
             output_tensors.append(output_tensor)
-
-            # >>>
-            if output_tensor[0]._base is not None:
-                # from lutil import pax
-                # pax({
-                #     "output tensor / 0" : output_tensor[0],
-                # })
-                raise Exception(">>>>>> r %d, output / 0 == view." % torch.distributed.get_rank())
-            # <<<
         deallocate_output_tensor(output_tensor[0])
 
         # Before running 1F1B, need to receive first forward tensor.
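The removed debug block was guarding the same condition that deallocate_output_tensor() itself enforces, so it becomes redundant once the layer-norm output is made viewless. As a rough sketch of why that routine rejects views (illustrative; the exact schedules.py code may differ), the idea is to free an output's activation memory once it has been sent downstream by shrinking its '.data' to a scalar, which only makes sense when the tensor owns its storage.

import torch

def deallocate_output_tensor(out):
    # After the output has been sent to the next pipeline stage, only its
    # '.grad_fn' is still needed, so '.data' is replaced with a scalar to
    # release the activation memory early.
    if out is None:
        return
    assert isinstance(out, torch.Tensor), \
        "expected Tensor, found %s." % type(out).__name__
    # A view does not own its storage ('out._base' does), so shrinking
    # '.data' here would free nothing; such tensors are rejected outright.
    assert out._base is None, \
        "counter-productive to free a view of another tensor."
    out.data = torch.empty((1,), device=out.device, dtype=out.dtype)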