Commit 8fc5e323 authored by Lawrence McAfee

more cleanup

parent c1e4526b
@@ -28,10 +28,6 @@ from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model import Float16Module
from megatron.model import ModelType
# >>>
from lutil import pax, tp, KEY_RANK
# <<<
def get_forward_backward_func():
args = get_args()
if mpu.get_pipeline_model_parallel_world_size() > 1:
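The hunks above and below show only the top and bottom of get_forward_backward_func(). As a hedged sketch of the dispatch it performs (an assumption based on Megatron-LM's schedules.py around this commit; the interleaved branch is elided by the diff view, and the function name and parameters here are illustrative only):

def get_forward_backward_func_sketch(args, pipeline_world_size):
    # Pipeline parallelism enabled: pick the interleaved schedule when
    # virtual pipeline stages are configured, else the plain schedule.
    if pipeline_world_size > 1:
        if args.virtual_pipeline_model_parallel_size is not None:
            return forward_backward_pipelining_with_interleaving
        return forward_backward_pipelining_without_interleaving
    # No pipeline parallelism: run forward/backward without pipelining.
    return forward_backward_no_pipelining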
@@ -46,36 +42,6 @@ def get_forward_backward_func():
forward_backward_func = forward_backward_no_pipelining
return forward_backward_func
# >>>
# def free_output_tensor(output_tensors, deallocate_pipeline_outputs):
# '''Pseudo-free (i.e., set to scalar) the output tensor's '.data' field.
# This method should be called right after the output tensor has been
# sent to the next pipeline stage. At this point, the output tensor is
# only useful for its '.grad_fn' field, and not its '.data'.
# '''
# # >>>
# # raise Exception("hi.")
# # <<<
# if not deallocate_pipeline_outputs or output_tensors is None:
# return
# if isinstance(output_tensors, torch.Tensor):
# output_tensors = [output_tensors]
# for output_tensor in output_tensors:
# # >>>
# # if output_tensor.nelement() < 10:
# # # raise Exception("interesting.")
# # continue
# # <<<
# # >>>
# # output_tensor.data = torch.cuda.FloatTensor([0])
# output_tensor.data = torch.empty(
# (1,),
# device = torch.cuda.current_device(),
# dtype = output_tensor.dtype,
# )
# # <<<
# <<<
def deallocate_output_tensor(out):
'''Pseudo-deallocate (i.e., set to scalar) the output tensor's '.data' field.
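The body of deallocate_output_tensor() is truncated by the diff view, but the commented-out block removed above shows the pattern it implements. A minimal standalone sketch of that pattern (assumptions: plain PyTorch, a hypothetical function name, and the device taken from the tensor rather than torch.cuda.current_device() so the snippet also runs on CPU):

import torch

def pseudo_deallocate(out):
    '''Replace out.data with a 1-element placeholder so the activation's
    memory can be freed, while out.grad_fn stays alive for backward.'''
    if out is None or not isinstance(out, torch.Tensor):
        return
    out.data = torch.empty(
        (1,),
        device=out.device,
        dtype=out.dtype,
    )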
@@ -118,21 +84,15 @@ def custom_backward(output, grad_output):
)
# Call c++ engine [ see torch/csrc/autograd/python_engine.cpp ]
# >>>
try:
Variable._execution_engine.run_backward(
tensors = (output,),
grad_tensors = (grad_output,),
keep_graph = False,
create_graph = False,
inputs = tuple(),
allow_unreachable=True,
accumulate_grad=True,
)
except Exception as e:
print(">>>> rank = %d. <<<<" % torch.distributed.get_rank())
raise e
# <<<
Variable._execution_engine.run_backward(
tensors = (output,),
grad_tensors = (grad_output,),
keep_graph = False,
create_graph = False,
inputs = tuple(),
allow_unreachable=True,
accumulate_grad=True,
)
def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_reduced):
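To illustrate why the direct engine call above still works after the output's '.data' has been replaced, here is a hedged, self-contained toy check (assumptions: illustrative shapes and a CPU-friendly setup; the autograd engine resolves gradient shapes through grad_fn metadata recorded at forward time, not through the tensor's current '.data'):

import torch
from torch.autograd import Variable

x = torch.randn(4, 4, requires_grad=True)
out = x * 2                              # non-leaf tensor with a grad_fn
grad_out = torch.ones_like(out)          # capture the grad shape up front

# Pseudo-deallocate: .data shrinks to one element, .grad_fn is untouched.
out.data = torch.empty((1,), device=out.device, dtype=out.dtype)

# torch.autograd.backward() would now reject the output/grad shape mismatch,
# so drive the C++ engine directly, mirroring custom_backward() above.
Variable._execution_engine.run_backward(
    tensors=(out,),
    grad_tensors=(grad_out,),
    keep_graph=False,
    create_graph=False,
    inputs=tuple(),
    allow_unreachable=True,
    accumulate_grad=True,
)
assert x.grad.shape == x.shape           # gradients still reach the leaf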
@@ -163,14 +123,6 @@ def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_r
losses_reduced.append(loss_reduced)
timers('forward-compute').stop()
# >>>
# if torch.distributed.get_rank() == 4:
# pax(4, {
# "output_tensor" : tp(output_tensor),
# "input_tensor[-1]" : tp(input_tensor[-1]),
# })
# <<<
# If T5 model (or other model with encoder and decoder)
# and in decoder stack, then send encoder_hidden_state
# downstream as well.
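A hedged sketch of the encoder/decoder branch described in the comment above, written as a standalone helper (the helper name is hypothetical; it assumes the ModelType enum imported at the top of this file and the mpu.is_pipeline_stage_after_split() helper):

def _select_forward_output(output_tensor, input_tensor, model_type):
    # For encoder-and-decoder models (e.g. T5), decoder-side stages also
    # forward the encoder's hidden state (the last received input tensor)
    # so downstream decoder stages can attend over it.
    if model_type == ModelType.encoder_and_decoder and \
            mpu.is_pipeline_stage_after_split():
        return [output_tensor, input_tensor[-1]]
    return output_tensor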
@@ -425,9 +377,6 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
tensor_shape=tensor_shape,
timers=timers)
input_tensors[next_forward_model_chunk_id].append(input_tensor)
# >>>
pax({"output_tensor": output_tensor})
# <<<
deallocate_output_tensor(output_tensor)
# Run 1F1B in steady state.