Move gradient division to before the allreduce

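This is consistent with upstream, and safer against overflow: each rank now divides its flattened gradients by the world size before the allreduce, so the summed result is already the average and stays closer to the magnitude of a single rank's gradients, instead of accumulating the full sum and dividing afterwards.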

Move gradient division to before the allreduce
This is consistent with upstream, and safer against overflow.
e4af2d90 · mcarilli · GitHub · 2f204bca · e4af2d90
Unverified commit e4af2d90, authored Oct 03, 2018 by mcarilli; committed by GitHub on Oct 03, 2018.
Show whitespace changes
Inline Side-by-side

Showing with 3 additions and 3 deletions

apex/parallel/distributed.py apex/parallel/distributed.py +3 -3

No files found.
--- a/apex/parallel/distributed.py
+++ b/apex/parallel/distributed.py
@@ -11,14 +11,14 @@ import copy
 def apply_flat_dist_call(bucket, call, extra_args=None):
    coalesced = _flatten_dense_tensors(bucket)

+    if call is dist.all_reduce:
+        coalesced /= dist.get_world_size()
+    
    if extra_args is not None:
        call(coalesced, *extra_args)
    else:
        call(coalesced)
        
-    if call is dist.all_reduce:
-        coalesced /= dist.get_world_size()
-        
    for buf, synced in zip(bucket, _unflatten_dense_tensors(coalesced, bucket)):
        buf.copy_(synced)