Commit 68797d90 authored by Jared Casper

Merge branch 'lmcafee/empty-cache' into 'main'

Bug fix for previous MR ("Flag to call empty_cache() each iteration, to reduce fragmentation"): update the stale args.empty_unused_memory_each_iter references to args.empty_unused_memory_level.

See merge request ADLR/megatron-lm!309
parents 8e8d7603 db08a636
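For context, the renamed flag acts as a simple threshold around torch.cuda.empty_cache(). Below is a minimal sketch of how such a flag might be registered and consumed; the argparse registration (flag spelling, choices, help text) and the maybe_empty_cache helper are illustrative assumptions, while the attribute args.empty_unused_memory_level and the >= 1 / >= 2 checks come straight from the diff below.

import argparse

import torch


def add_memory_args(parser):
    # Hypothetical registration; Megatron-LM's real definition lives in its
    # argument-parsing module and may differ in wording and defaults.
    group = parser.add_argument_group('memory')
    group.add_argument('--empty-unused-memory-level', type=int, default=0,
                       choices=[0, 1, 2],
                       help='0: never call torch.cuda.empty_cache(); '
                            '1: call it after each forward/backward pass; '
                            '2: also call it after the optimizer step.')
    return parser


def maybe_empty_cache(args, level):
    # Release cached but unused CUDA blocks back to the allocator when the
    # configured level is at least `level`, trading a small per-call cost
    # for reduced memory fragmentation.
    if torch.cuda.is_available() and args.empty_unused_memory_level >= level:
        torch.cuda.empty_cache()

In the diff itself, train_step guards its two call sites with >= 1 (after forward/backward) and >= 2 (after the optimizer step), and evaluate uses >= 1.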
@@ -363,7 +363,7 @@ def train_step(forward_step_func, data_iterator,
                                        optimizer, timers, forward_only=False)
 
     # Empty unused memory
-    if args.empty_unused_memory_each_iter >= 1:
+    if args.empty_unused_memory_level >= 1:
         torch.cuda.empty_cache()
 
     # All-reduce if needed.
@@ -413,7 +413,7 @@ def train_step(forward_step_func, data_iterator,
         skipped_iter = 1
 
     # Empty unused memory
-    if args.empty_unused_memory_each_iter >= 2:
+    if args.empty_unused_memory_level >= 2:
         torch.cuda.empty_cache()
 
     if mpu.is_pipeline_last_stage(ignore_virtual=True):
@@ -725,7 +725,7 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False):
                 timers=None, forward_only=True)
 
             # Empty unused memory
-            if args.empty_unused_memory_each_iter >= 1:
+            if args.empty_unused_memory_level >= 1:
                 torch.cuda.empty_cache()
 
             if mpu.is_pipeline_last_stage(ignore_virtual=True):