"src/graph/sampling/vscode:/vscode.git/clone" did not exist on "bcd37684268a919f25aa5b9eb88f4e59aca1e7b4"
Commit aa9cae27 authored by Deepak Narayanan

Small notes in comments in response to Jared's comments

parent dd079406
@@ -205,6 +205,10 @@ class DynamicLossScaler:
         return grad_in
 
     def backward(self, output_tensor, retain_graph=False, output_tensor_grad=None):
+        # If output_tensor_grad is None, this is the last stage, and
+        # output_tensor is actually the loss and needs to be scaled.
+        # Otherwise, output_tensor does not need to be scaled again since
+        # output_tensor_grad is already scaled.
         if output_tensor_grad is None:
             scaled_output_tensor = output_tensor * self.loss_scale
         else:
...
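The added comment documents the scaling convention used across pipeline stages: only the last stage holds the actual loss and scales it; earlier stages receive a gradient that was already scaled downstream. A minimal standalone sketch of that logic follows, for illustration only; the function name scaled_backward and its signature are assumptions, not the repository's API.

import torch

def scaled_backward(output_tensor, loss_scale, output_tensor_grad=None,
                    retain_graph=False):
    if output_tensor_grad is None:
        # Last pipeline stage: output_tensor is the loss itself, so scale it
        # before backpropagating to keep fp16 gradients representable.
        scaled_output_tensor = output_tensor * loss_scale
        torch.autograd.backward(scaled_output_tensor, retain_graph=retain_graph)
    else:
        # Earlier stages: the gradient arriving from the next stage was
        # already scaled at the last stage, so no further scaling is applied.
        torch.autograd.backward(output_tensor, grad_tensors=output_tensor_grad,
                                retain_graph=retain_graph)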
@@ -361,14 +361,17 @@ def train_step(forward_step_func, data_iterator,
     # Compute number of microbatches in a minibatch.
     num_microbatches_in_minibatch = args.num_microbatches_in_minibatch
-    # TODO: Switch to the following schedule when async communication is supported
-    # so that we can facilitate mroe memory-efficient training.
+    # TODO: Switch to the following schedule to facilitate more
+    # memory-efficient training.
     # num_warmup_microbatches = \
     #     (torch.distributed.get_world_size(group=mpu.get_pipeline_model_parallel_group()) -
     #      torch.distributed.get_rank(group=mpu.get_pipeline_model_parallel_group()) - 1)
     # num_warmup_microbatches = min(
     #     num_warmup_microbatches,
     #     num_microbatches_in_minibatch)
+    # For now, perform training without warmup. Perform forward
+    # passes for all microbatches, then backward passes for all
+    # microbatches.
     num_warmup_microbatches = num_microbatches_in_minibatch
 
     input_tensors = []
...
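The new note and the commented-out code describe two ways of choosing the number of warmup microbatches. A sketch of both options is below, assuming hypothetical pipeline_world_size and pipeline_rank arguments in place of the mpu and torch.distributed calls; it is an illustration of the scheduling idea, not code from the commit.

def get_num_warmup_microbatches(num_microbatches_in_minibatch,
                                pipeline_world_size, pipeline_rank,
                                memory_efficient=False):
    if memory_efficient:
        # Schedule from the commented-out code: each stage runs only enough
        # warmup forward passes to fill the pipeline ahead of it before it
        # starts interleaving backward passes, which caps activation memory.
        num_warmup = pipeline_world_size - pipeline_rank - 1
        return min(num_warmup, num_microbatches_in_minibatch)
    # Current schedule: forward passes for all microbatches, then backward
    # passes for all microbatches, so every microbatch counts as warmup.
    return num_microbatches_in_minibatch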