Commit 81ad628e authored by Vijay Korthikanti

comments

parent f2bf5a56
@@ -428,12 +428,17 @@ def train_step(forward_step_func, data_iterator,
                grad = word_embeddings_weight.grad
            torch.distributed.all_reduce(grad, group=mpu.get_embedding_group())
    # All-reduce position_embeddings grad across first (encoder) and split (decoder)
    # stages to ensure that position embeddings parameters stay in sync.
    # This should only run for T5 models with pipeline parallelism
    if mpu.is_rank_in_position_embedding_group() and \
            mpu.get_pipeline_model_parallel_world_size() > 1 and \
            args.pipeline_model_parallel_split_rank is not None:
        unwrapped_model = model[0]
        unwrapped_model = unwrap_model(
            unwrapped_model, (torchDDP, LocalDDP, Float16Module))
        assert args.DDP_impl == 'local', \
            'T5 model is only supported with local DDP mode'
        grad = unwrapped_model.language_model.embedding.position_embeddings.weight.main_grad
        torch.distributed.all_reduce(grad, group=mpu.get_position_embedding_group())
    timers('backward-embedding-all-reduce').stop()
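
The change works because the first (encoder) stage and the split (decoder) stage of the pipeline each hold their own replica of the position-embedding table; all-reducing the gradients across the position-embedding process group before the optimizer step keeps those replicas identical. Below is a minimal, self-contained sketch of that pattern, not the Megatron-LM implementation: the helper name sync_position_embedding_grads and the toy two-rank setup are assumptions for illustration, while the torch.distributed calls are the real PyTorch API. Launch with, for example, torchrun --nproc_per_node=2 sync_sketch.py.

import torch
import torch.distributed as dist


def sync_position_embedding_grads(embedding, group=None):
    # All-reduce (sum) the embedding gradient so every rank in `group`
    # holds the same gradient before the optimizer step.
    if embedding.weight.grad is not None:
        dist.all_reduce(embedding.weight.grad, group=group)


def main():
    dist.init_process_group(backend="gloo")  # env:// rendezvous provided by torchrun
    rank = dist.get_rank()

    # Every rank starts from an identical replica of the position-embedding table.
    torch.manual_seed(0)
    pos_emb = torch.nn.Embedding(16, 8)

    # Each rank processes different positions, so the local gradients differ.
    gen = torch.Generator().manual_seed(rank)
    positions = torch.randint(0, 16, (4,), generator=gen)
    pos_emb(positions).sum().backward()

    # Without this all-reduce the replicas would drift apart after the optimizer step.
    sync_position_embedding_grads(pos_emb, group=dist.group.WORLD)

    print(f"rank {rank}: grad norm after all-reduce = {pos_emb.weight.grad.norm():.4f}")
    dist.destroy_process_group()


if __name__ == "__main__":
    main()

In the committed code the gradient is read from main_grad rather than .grad because, with the local DDP implementation, Megatron accumulates gradients into a contiguous buffer; that is presumably why the assert restricts T5 pipeline parallelism to --DDP-impl local.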