found root source of t5 issue (fast layer norm)

bea16fa3 · Lawrence McAfee · 3af6725d · bea16fa3 · bea16fa3
Commit bea16fa3 authored Feb 01, 2022 by Lawrence McAfee
Show whitespace changes
Inline Side-by-side

Showing with 29 additions and 2 deletions

megatron/model/transformer.py +22 -2

megatron/schedules.py +7 -0

No files found.
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -546,7 +546,7 @@ class ParallelTransformerLayer(MegatronModule):
 class NoopTransformerLayer(MegatronModule):
    """A single 'no-op' transformer layer.

-    The sole purpose of this layer is for when args.standalone_embedding_stage
+    The sole purpose of this layer is for when args.standalone_embed_stage
    == True. ?????
    """

@@ -804,6 +804,26 @@ class ParallelTransformer(MegatronModule):
            # Reverting data format change [s b h] --> [b s h].
            hidden_states = hidden_states.transpose(0, 1).contiguous()
            output = self.final_layernorm(hidden_states)
+            # >>>
+            # if True or output._base is not None:
+            #     # from lutil import pax, tp
+            #     # pax({
+            #     #     "hidden_states" : tp(hidden_states),
+            #     #     "output" : tp(output),
+            #     # })
+            #     # raise Exception(">>> rank %d, view %d, hid '%s', out '%s'. <<<" %(
+            #     #     torch.distributed.get_rank(),
+            #     #     output._base is not None,
+            #     #     str(hidden_states.shape),
+            #     #     str(output.shape),
+            #     # ))
+            #     args = get_args()
+            #     raise Exception(">>> rank %d, hid %d, view %d. <<<" %(
+            #         torch.distributed.get_rank(),
+            #         args.hidden_size,
+            #         output._base is not None,
+            #     ))
+            # <<<
        else:
            output = hidden_states


--- a/megatron/schedules.py
+++ b/megatron/schedules.py
@@ -143,6 +143,9 @@ def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_r

    unwrapped_model.set_input_tensor(input_tensor)
    output_tensor, loss_func = forward_step_func(data_iterator, model)
+    # >>>
+    mpu.assert_viewless_tensor(output_tensor)
+    # <<<
    if mpu.is_pipeline_last_stage():
        output_tensor = loss_func(output_tensor)
        loss, loss_reduced = output_tensor
@@ -150,6 +153,10 @@ def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_r
        losses_reduced.append(loss_reduced)
    timers('forward-compute').stop()

+    # >>>
+    mpu.assert_viewless_tensor(output_tensor)
+    # <<<
+
    # If T5 model (or other model with encoder and decoder)
    # and in decoder stack, then send encoder_hidden_state
    # downstream as well.