Commit 3574b8e6 authored by Deepak Narayanan

Better memory tracking across pipeline-parallel ranks

parent 00ac56ab
...
@@ -716,8 +716,9 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         total_loss_dict[skipped_iters_key] = 0
         total_loss_dict[got_nan_key] = 0
         print_rank_last(log_string)
-        if report_memory_flag:
-            report_memory('after {} iterations'.format(iteration))
+        if report_memory_flag and learning_rate > 0.:
+            # Report memory after optimizer state has been initialized.
+            report_memory('(after {} iterations)'.format(iteration))
             report_memory_flag = False
     timers.log(timers_to_log, normalizer=args.log_interval)
...
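The new guard ties the one-time memory report to the learning rate: the rate is only non-zero after at least one optimizer step has gone through, so by the time the report fires the optimizer state (e.g. Adam moments, fp32 master weights) has been allocated and is included in the numbers. A minimal, self-contained sketch of the same pattern outside Megatron-LM follows; the toy model, optimizer, warmup schedule, and the simplified report_memory below are illustrative assumptions, not the repository's code.

import torch

def report_memory(name):
    # Print this process's current and peak CUDA memory, in MB.
    mb = 1024.0 * 1024.0
    print('memory (MB) {} | allocated: {:.1f} | max allocated: {:.1f}'.format(
        name, torch.cuda.memory_allocated() / mb,
        torch.cuda.max_memory_allocated() / mb), flush=True)

# Toy model, optimizer, and linear-warmup schedule, purely for illustration.
model = torch.nn.Linear(1024, 1024).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lambda step: min(step / 100.0, 1.0))  # lr == 0.0 before any step

report_memory_flag = True
for iteration in range(1, 11):
    loss = model(torch.randn(32, 1024, device='cuda')).sum()
    loss.backward()
    optimizer.step()      # first step allocates the Adam moment tensors
    scheduler.step()
    optimizer.zero_grad()
    learning_rate = scheduler.get_last_lr()[0]
    if report_memory_flag and learning_rate > 0.:
        # Same guard as the diff: the lr is only non-zero after at least one
        # optimizer step, so the optimizer state shows up in the measurement.
        report_memory('(after {} iterations)'.format(iteration))
        report_memory_flag = False

Without the guard, the one-shot report would land before the optimizer state exists and understate steady-state memory use.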
...
@@ -50,7 +50,8 @@ def report_memory(name):
     string += ' | reserved: {}'.format(torch.cuda.memory_reserved() / mega_bytes)
     string += ' | max reserved: {}'.format(
         torch.cuda.max_memory_reserved() / mega_bytes)
-    print_rank_0(string)
+    if mpu.get_data_parallel_rank() == 0:
+        print("[Rank {}] {}".format(torch.distributed.get_rank(), string), flush=True)


 def print_params_min_max_norm(optimizer, iteration):
...
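This hunk changes who prints: instead of only global rank 0 (print_rank_0), every process whose data-parallel rank is 0 prints, tagged with its global rank, so each pipeline-parallel (and tensor-parallel) rank reports its own memory rather than just the first one. A rough, self-contained sketch of the same pattern using plain torch.distributed follows; the helper name and the data_parallel_group argument are stand-ins for Megatron's mpu helpers, not the actual API.

import torch
import torch.distributed as dist

def report_memory_per_rank(name, data_parallel_group=None):
    # Build a memory summary string for this process, in MB.
    mb = 1024.0 * 1024.0
    string = 'memory (MB) {}'.format(name)
    string += ' | allocated: {:.1f}'.format(torch.cuda.memory_allocated() / mb)
    string += ' | max allocated: {:.1f}'.format(torch.cuda.max_memory_allocated() / mb)
    string += ' | reserved: {:.1f}'.format(torch.cuda.memory_reserved() / mb)
    string += ' | max reserved: {:.1f}'.format(torch.cuda.max_memory_reserved() / mb)
    # Print only from data-parallel rank 0, labeled with the global rank,
    # so each model-parallel shard reports exactly once.
    if dist.get_rank(group=data_parallel_group) == 0:
        print('[Rank {}] {}'.format(dist.get_rank(), string), flush=True)

With a real data-parallel group, exactly one replica of each pipeline stage prints, and the global-rank tag lets the resulting log lines be attributed to individual stages, which is what makes per-stage memory imbalance visible.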