Unverified Commit 6fb16100 authored by Sean Naren, committed by GitHub

Replace timer print rank 0 with logging (#732)



* Use log dist function instead of print

* Expose ranks
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
parent 248f6383
@@ -4,6 +4,7 @@ Copyright 2019 The Microsoft DeepSpeed Team
 import time
 import torch
+from deepspeed.utils.logging import log_dist
 from deepspeed.utils import logger
@@ -15,14 +16,6 @@ except ImportError:
     pass
 
 
-def print_rank_0(message):
-    if torch.distributed.is_initialized():
-        if torch.distributed.get_rank() == 0:
-            print(message)
-    else:
-        print(message)
-
-
 class SynchronizedWallClockTimer:
     """Group of timers. Borrowed from Nvidia Megatron code"""
     class Timer:
@@ -88,7 +81,7 @@
             torch.cuda.max_memory_cached() / (1024 * 1024 * 1024))
         return " | {} | {} | {} | {}".format(alloc, max_alloc, cache, max_cache)
 
-    def log(self, names, normalizer=1.0, reset=True, memory_breakdown=False):
+    def log(self, names, normalizer=1.0, reset=True, memory_breakdown=False, ranks=None):
         """Log a group of timers."""
         assert normalizer > 0.0
         string = f'rank={torch.distributed.get_rank()} time (ms)'
@@ -98,9 +91,7 @@
                 reset=reset) * 1000.0 / normalizer
             string += ' | {}: {:.2f}'.format(name, elapsed_time)
 
-        # TODO: use our logging utilitied to selectively print. Useful for model
-        # parallelism because rank=0 is too restrictive.
-        print_rank_0(string)
+        log_dist(string, ranks=ranks or [0])
 
 
 class ThroughputTimer():
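For context, a minimal usage sketch of the changed log() method (not part of this commit; the timer name, the two-rank setup, and the exact import path are assumptions, and it presumes torch.distributed is initialized with CUDA available, since these timers synchronize the device):

    # Usage sketch (assumed setup, not from the commit).
    from deepspeed.utils.timer import SynchronizedWallClockTimer

    timers = SynchronizedWallClockTimer()

    timers('forward').start()
    # ... run the forward pass ...
    timers('forward').stop()

    # Default keeps the previous behavior: ranks=None falls back to [0],
    # so only rank 0 logs the timing string via log_dist.
    timers.log(['forward'])

    # With model parallelism, rank 0 alone can be too restrictive; an explicit
    # list is forwarded to log_dist so those ranks log as well.
    timers.log(['forward'], ranks=[0, 1])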