Commit bd315c35 authored by Jared Casper

Merge branch 'lmcafee/empty-cache' into 'main'

Flag to call empty_cache() each iteration, to reduce fragmentation

See merge request ADLR/megatron-lm!306
parents 5ca20cdd 52b2296b
megatron/arguments.py
@@ -601,6 +601,11 @@ def _add_distributed_args(parser):
     group.add_argument('--use-cpu-initialization', action='store_true',
                        default=None, help='If set, affine parallel weights '
                        'initialization uses CPU' )
+    group.add_argument('--empty-unused-memory-level', default=0, type=int,
+                       choices=[0, 1, 2],
+                       help='Call torch.cuda.empty_cache() each iteration '
+                       '(training and eval), to reduce fragmentation. '
+                       '0=off, 1=moderate, 2=aggressive.')
     return parser
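The three levels gate where torch.cuda.empty_cache() gets called: level 0 never calls it, level 1 calls it after each training and eval step, and level 2 additionally calls it after the optimizer step. A minimal standalone sketch of the same gating, outside Megatron-LM (the maybe_empty_cache helper is hypothetical, for illustration only):

import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument('--empty-unused-memory-level', default=0, type=int,
                    choices=[0, 1, 2])
args = parser.parse_args()

def maybe_empty_cache(required_level):
    # empty_cache() returns cached, unoccupied blocks to the CUDA driver;
    # it never frees tensors that are still referenced, so it is safe to
    # call anywhere, at the cost of re-allocating from the driver later.
    if args.empty_unused_memory_level >= required_level:
        torch.cuda.empty_cache()

# level >= 1: after each training/eval step
maybe_empty_cache(1)
# level >= 2: also after the optimizer step
maybe_empty_cache(2)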
megatron/training.py
@@ -362,6 +362,10 @@ def train_step(forward_step_func, data_iterator,
         forward_step_func, data_iterator, model,
         optimizer, timers, forward_only=False)
 
+    # Empty unused memory
+    if args.empty_unused_memory_level >= 1:
+        torch.cuda.empty_cache()
+
     # All-reduce if needed.
     if args.DDP_impl == 'local':
         timers('backward-params-all-reduce').start()
@@ -408,6 +412,10 @@ def train_step(forward_step_func, data_iterator,
     else:
         skipped_iter = 1
 
+    # Empty unused memory
+    if args.empty_unused_memory_level >= 2:
+        torch.cuda.empty_cache()
+
     if mpu.is_pipeline_last_stage(ignore_virtual=True):
         # Average loss across microbatches.
         loss_reduced = {}
@@ -716,6 +724,10 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False):
                 forward_step_func, data_iterator, model, optimizer=None,
                 timers=None, forward_only=True)
 
+            # Empty unused memory
+            if args.empty_unused_memory_level >= 1:
+                torch.cuda.empty_cache()
+
             if mpu.is_pipeline_last_stage(ignore_virtual=True):
                 # Reduce across processes.
                 for loss_dict in loss_dicts:
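Why this reduces fragmentation: PyTorch's caching allocator keeps freed blocks reserved for reuse, and over many iterations with varying tensor shapes those cached blocks can fragment until a large allocation fails even though enough total memory is nominally free. A quick standalone demonstration (requires a CUDA device; sizes are arbitrary):

import torch

# Allocate then free ~1 GiB of tensors (4 x 256 MiB of float32).
blocks = [torch.empty(64 * 1024 * 1024, device='cuda') for _ in range(4)]
del blocks

# Allocated memory drops to ~0, but the blocks stay cached in the allocator.
print(torch.cuda.memory_allocated(), torch.cuda.memory_reserved())

# empty_cache() hands the cached, unoccupied blocks back to the driver.
torch.cuda.empty_cache()
print(torch.cuda.memory_reserved())

The trade-off is that empty_cache() is not free: subsequent allocations have to go back to the driver, which is slower than reusing cached blocks. That is why the flag defaults to 0 and the per-optimizer-step call is gated behind the aggressive level 2.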