Set grads to None for the contiguous buffers

df6e3cd7 · mshoeybi · 68797d90 · df6e3cd7
Commit df6e3cd7 authored Aug 18, 2021 by mshoeybi
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 1 addition and 2 deletions

megatron/training.py megatron/training.py +1 -2

No files found.
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -354,8 +354,7 @@ def train_step(forward_step_func, data_iterator,
    if args.DDP_impl == 'local' and args.use_contiguous_buffers_in_ddp:
        for partition in model:
            partition.zero_grad_buffer()
-    else:
-        optimizer.zero_grad()
+    optimizer.zero_grad()

    forward_backward_func = get_forward_backward_func()
    losses_reduced = forward_backward_func(