Commit 9b558566 authored by Deepak Narayanan

Bugfix in main training loop: Update master_grads only after grads are correctly accumulated

parent 767e6e92
@@ -400,12 +400,6 @@ def train_step(forward_step_func, data_iterator,
                                fp32_allreduce=args.fp32_allreduce)
     timers('allreduce').stop()
 
-    # Update master gradients.
-    timers('backward-master-grad').start()
-    if args.fp16:
-        optimizer.update_master_grads()
-    timers('backward-master-grad').stop()
-
     # All-reduce across first and last stages.
     timers('backward-embedding-all-reduce').start()
     if (mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()) and \
@@ -419,6 +413,12 @@ def train_step(forward_step_func, data_iterator,
                                 group=mpu.get_embedding_group())
     timers('backward-embedding-all-reduce').stop()
 
+    # Update master gradients.
+    timers('backward-master-grad').start()
+    if args.fp16:
+        optimizer.update_master_grads()
+    timers('backward-master-grad').stop()
+
     # Clipping gradients helps prevent the exploding gradient.
     timers('backward-clip-grad').start()
     if args.clip_grad > 0.:
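In short, the change moves the fp16-to-fp32 master-gradient copy so that it runs only after all gradient accumulation has finished, including the all-reduce of the tied embedding gradients between the first and last pipeline stages. Below is a minimal sketch of the corrected ordering; `finish_backward`, `all_reduce_data_parallel_grads`, and `all_reduce_embedding_grads` are hypothetical stand-ins for code elided by the diff, while `timers`, `args.fp16`, `args.fp32_allreduce`, and `optimizer.update_master_grads()` follow the names in the diff above.

```python
# Sketch of the corrected ordering in the training step. The two
# all_reduce_* callables are hypothetical stand-ins for the all-reduce
# calls that the diff truncates; the timer names match the diff.
def finish_backward(optimizer, args, timers,
                    all_reduce_data_parallel_grads,
                    all_reduce_embedding_grads):
    # 1) All-reduce gradients across data-parallel replicas.
    timers('allreduce').start()
    all_reduce_data_parallel_grads(fp32_allreduce=args.fp32_allreduce)
    timers('allreduce').stop()

    # 2) All-reduce the shared embedding gradients across the first and
    #    last pipeline stages, so the fp16 gradients are fully accumulated.
    timers('backward-embedding-all-reduce').start()
    all_reduce_embedding_grads()
    timers('backward-embedding-all-reduce').stop()

    # 3) Only now copy the accumulated fp16 gradients into the fp32 master
    #    copies. Doing this before step 2, as the old code did, left the
    #    embedding all-reduce contribution out of the optimizer step.
    timers('backward-master-grad').start()
    if args.fp16:
        optimizer.update_master_grads()
    timers('backward-master-grad').stop()
```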