Commit 3574b8e6 authored by Deepak Narayanan

Better memory tracking across pipeline-parallel ranks

parent 00ac56ab
...
@@ -716,8 +716,9 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         total_loss_dict[skipped_iters_key] = 0
         total_loss_dict[got_nan_key] = 0
         print_rank_last(log_string)
-        if report_memory_flag:
-            report_memory('after {} iterations'.format(iteration))
+        if report_memory_flag and learning_rate > 0.:
+            # Report memory after optimizer state has been initialized.
+            report_memory('(after {} iterations)'.format(iteration))
             report_memory_flag = False
     timers.log(timers_to_log, normalizer=args.log_interval)
...
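The new guard ties the one-time memory report to the learning rate: the rate is only non-zero after at least one optimizer step has gone through, so by the time the report fires the optimizer state (e.g. Adam moments, fp32 master weights) has been allocated and is included in the numbers. A minimal, self-contained sketch of the same pattern outside Megatron-LM follows; the toy model, optimizer, warmup schedule, and the simplified report_memory below are illustrative assumptions, not the repository's code.

import torch

def report_memory(name):
    # Print this process's current and peak CUDA memory, in MB.
    mb = 1024.0 * 1024.0
    print('memory (MB) {} | allocated: {:.1f} | max allocated: {:.1f}'.format(
        name, torch.cuda.memory_allocated() / mb,
        torch.cuda.max_memory_allocated() / mb), flush=True)

# Toy model, optimizer, and linear-warmup schedule, purely for illustration.
model = torch.nn.Linear(1024, 1024).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lambda step: min(step / 100.0, 1.0))  # lr == 0.0 before any step

report_memory_flag = True
for iteration in range(1, 11):
    loss = model(torch.randn(32, 1024, device='cuda')).sum()
    loss.backward()
    optimizer.step()      # first step allocates the Adam moment tensors
    scheduler.step()
    optimizer.zero_grad()
    learning_rate = scheduler.get_last_lr()[0]
    if report_memory_flag and learning_rate > 0.:
        # Same guard as the diff: the lr is only non-zero after at least one
        # optimizer step, so the optimizer state shows up in the measurement.
        report_memory('(after {} iterations)'.format(iteration))
        report_memory_flag = False

Without the guard, the one-shot report would land before the optimizer state exists and understate steady-state memory use.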
...
@@ -50,7 +50,8 @@ def report_memory(name):
     string += ' | reserved: {}'.format(torch.cuda.memory_reserved() / mega_bytes)
     string += ' | max reserved: {}'.format(
         torch.cuda.max_memory_reserved() / mega_bytes)
-    print_rank_0(string)
+    if mpu.get_data_parallel_rank() == 0:
+        print("[Rank {}] {}".format(torch.distributed.get_rank(), string), flush=True)


 def print_params_min_max_norm(optimizer, iteration):
...
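This hunk changes who prints: instead of only global rank 0 (print_rank_0), every process whose data-parallel rank is 0 prints, tagged with its global rank, so each pipeline-parallel (and tensor-parallel) rank reports its own memory rather than just the first one. A rough, self-contained sketch of the same pattern using plain torch.distributed follows; the helper name and the data_parallel_group argument are stand-ins for Megatron's mpu helpers, not the actual API.

import torch
import torch.distributed as dist

def report_memory_per_rank(name, data_parallel_group=None):
    # Build a memory summary string for this process, in MB.
    mb = 1024.0 * 1024.0
    string = 'memory (MB) {}'.format(name)
    string += ' | allocated: {:.1f}'.format(torch.cuda.memory_allocated() / mb)
    string += ' | max allocated: {:.1f}'.format(torch.cuda.max_memory_allocated() / mb)
    string += ' | reserved: {:.1f}'.format(torch.cuda.memory_reserved() / mb)
    string += ' | max reserved: {:.1f}'.format(torch.cuda.max_memory_reserved() / mb)
    # Print only from data-parallel rank 0, labeled with the global rank,
    # so each model-parallel shard reports exactly once.
    if dist.get_rank(group=data_parallel_group) == 0:
        print('[Rank {}] {}'.format(dist.get_rank(), string), flush=True)

With a real data-parallel group, exactly one replica of each pipeline stage prints, and the global-rank tag lets the resulting log lines be attributed to individual stages, which is what makes per-stage memory imbalance visible.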