[minor] ShardedGradScaler - only wait for the last handle (#382)

* Super minor, opportunistic micro-optimization

[minor] ShardedGradScaler - only wait for the last handle (#382)
* Super minor, opportunistic micro-optimization
1a636557 · Benjamin Lefaudeux · GitHub · ce9e7e48 · 1a636557
Unverified Commit 1a636557 authored Feb 11, 2021 by Benjamin Lefaudeux Committed by GitHub Feb 11, 2021
Show whitespace changes
Inline Side-by-side

Showing with 8 additions and 7 deletions

fairscale/optim/grad_scaler.py fairscale/optim/grad_scaler.py +8 -7

No files found.
--- a/fairscale/optim/grad_scaler.py
+++ b/fairscale/optim/grad_scaler.py
@@ -49,10 +49,11 @@ class ShardedGradScaler(TorchGradScaler):

        # Synchronize the detected inf across the ranks
        optimizer_state = self._per_optimizer_states[id(optimizer)]
-        handles = [
-            dist.all_reduce(v, async_op=True, group=self.group)
-            for v in optimizer_state["found_inf_per_device"].values()
-        ]
-
-        # Make sure that the calls are done before moving out
-        _ = list(map(lambda x: x.wait(), handles))
+        last_handle = None
+        for v in optimizer_state["found_inf_per_device"].values():
+            last_handle = dist.all_reduce(v, async_op=True, group=self.group)
+
+        # Make sure that the calls are done before moving out.
+        # The calls are executed in sequence, so waiting for the last one is enough
+        if last_handle is not None:
+            last_handle.wait()