Merge branch 'lmcafee/tbmem-fix' into 'main'

Added memory stats (allocated/reserved) to tensorboard logging. See merge request ADLR/megatron-lm!285.

Merge branch 'lmcafee/tbmem-fix' into 'main'
Added memory stats (allocated/reserved) to tensorboard logging. See merge request ADLR/megatron-lm!285.
f3be8ead · Jared Casper · 3202f237 · bc5a8e20 · f3be8ead · f3be8ead
Commit f3be8ead authored Jul 14, 2021 by Jared Casper
Hide whitespace changes
Inline Side-by-side

Showing with 20 additions and 0 deletions

megatron/arguments.py megatron/arguments.py +3 -0

megatron/training.py megatron/training.py +17 -0

No files found.
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -328,6 +328,9 @@ def _add_logging_args(parser):
                       action='store_true',
                       help='If set, write validation perplexity to '
                       'tensorboard.')
+    group.add_argument('--log-memory-to-tensorboard',
+                       action='store_true',
+                       help='Enable memory logging to tensorboard.')

    return parser


--- a/megatron/training.py
+++ b/megatron/training.py
@@ -531,6 +531,23 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
        if args.log_timers_to_tensorboard:
            timers.write(timers_to_log, writer, iteration,
                         normalizer=total_iterations)
+        if args.log_memory_to_tensorboard:
+            mem_stats = torch.cuda.memory_stats()
+            writer.add_scalar(
+                "mem-reserved-bytes",
+                mem_stats["reserved_bytes.all.current"],
+                iteration,
+            )
+            writer.add_scalar(
+                "mem-allocated-bytes",
+                mem_stats["allocated_bytes.all.current"],
+                iteration,
+            )
+            writer.add_scalar(
+                "mem-allocated-count",
+                mem_stats["allocation.all.current"],
+                iteration,
+            )

    if iteration % args.log_interval == 0:
        elapsed_time = timers('interval-time').elapsed()