Commit 0e5b64af authored by Jared Casper

Merge branch 'logging_refactor' into 'main'

added options for tensorboard logging

See merge request ADLR/megatron-lm!222
parents 16db4a2c 792a468d
@@ -160,7 +160,8 @@ def parse_args(extra_args_provider=None, defaults={},
             'expected sample-based learnig rate warmup'
     if args.lr_warmup_fraction is not None:
         assert args.lr_warmup_samples == 0, \
-            'can only specify one of lr-warmup-fraction and lr-warmup-samples'
+            'can only specify one of lr-warmup-fraction ' \
+            'and lr-warmup-samples'
 
     # Check required arguments.
     required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
@@ -242,13 +243,15 @@ def _add_network_size_args(parser):
     group.add_argument('--hidden-size', type=int, default=None,
                        help='Tansformer hidden size.')
     group.add_argument('--ffn-hidden-size', type=int, default=None,
-                       help='Transformer Feed-Forward Network hidden size. This is set to 4*hidden-size if not '
-                       'provided')
+                       help='Transformer Feed-Forward Network hidden size. '
+                       'This is set to 4*hidden-size if not provided')
     group.add_argument('--num-attention-heads', type=int, default=None,
                        help='Number of transformer attention heads.')
     group.add_argument('--kv-channels', type=int, default=None,
-                       help='Projection weights dimension in multi-head attention. '
-                       'This is set to args.hidden_size // args.num_attention_heads if not provided.')
+                       help='Projection weights dimension in multi-head '
+                       'attention. This is set to '
+                       ' args.hidden_size // args.num_attention_heads '
+                       'if not provided.')
     group.add_argument('--max-position-embeddings', type=int, default=None,
                        help='Maximum number of position embeddings to use. '
                        'This is the size of position embedding.')
@@ -266,7 +269,8 @@ def _add_network_size_args(parser):
                        'should not be used unless for backward compatibility'
                        'reasons.')
     group.add_argument('--onnx-safe', type=bool, required=False,
-                       help='Use workarounds for known problems with Torch ONNX exporter')
+                       help='Use workarounds for known problems with '
+                       'Torch ONNX exporter')
     group.add_argument('--bert-no-binary-head', action='store_false',
                        help='Disable BERT binary head.',
                        dest='bert_binary_head')
@@ -279,6 +283,24 @@ def _add_logging_args(parser):
     group.add_argument('--log-params-norm', action='store_true',
                        help='If set, calculate and log parameters norm.')
+    group.add_argument('--tensorboard-log-interval', type=int, default=1,
+                       help='Report to tensorboard interval.')
+    group.add_argument('--log-timers-to-tensorboard', action='store_true',
+                       help='If set, write timers to tensorboard.')
+    group.add_argument('--log-batch-size-to-tensorboard', action='store_true',
+                       help='If set, write batch-size to tensorboard.')
+    group.add_argument('--no-log-learnig-rate-to-tensorboard',
+                       action='store_false',
+                       help='Disable learning rate logging to tensorboard.',
+                       dest='log_learning_rate_to_tensorboard')
+    group.add_argument('--no-log-loss-scale-to-tensorboard',
+                       action='store_false',
+                       help='Disable loss-scale logging to tensorboard.',
+                       dest='log_loss_scale_to_tensorboard')
+    group.add_argument('--log-validation-ppl-to-tensorboard',
+                       action='store_true',
+                       help='If set, write validation perplexity to '
+                       'tensorboard.')
 
     return parser
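
Note on the new flags above: the --no-log-learnig-rate-to-tensorboard and --no-log-loss-scale-to-tensorboard switches use argparse's store_false with an explicit dest, so the corresponding settings default to True and the flags only turn logging off. Below is a minimal, self-contained sketch of that pattern using plain argparse (an illustration, not the Megatron parser itself):

# Sketch: a "--no-..." switch with action='store_false' and an explicit dest
# leaves the destination True by default and flips it to False when passed.
import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='logging')
group.add_argument('--no-log-loss-scale-to-tensorboard',
                   action='store_false',
                   help='Disable loss-scale logging to tensorboard.',
                   dest='log_loss_scale_to_tensorboard')

args = parser.parse_args([])
assert args.log_loss_scale_to_tensorboard is True   # enabled by default

args = parser.parse_args(['--no-log-loss-scale-to-tensorboard'])
assert args.log_loss_scale_to_tensorboard is False  # explicitly disabled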
@@ -295,11 +317,11 @@ def _add_regularization_args(parser):
     group.add_argument('--clip-grad', type=float, default=1.0,
                        help='Gradient clipping based on global L2 norm.')
     group.add_argument('--adam-beta1', type=float, default=0.9,
-                       help='First coefficient for computing running averages of'
-                       'gradient and its square')
+                       help='First coefficient for computing running averages '
+                       'of gradient and its square')
     group.add_argument('--adam-beta2', type=float, default=0.999,
-                       help='Second coefficient for computing running averages of'
-                       'gradient and its square')
+                       help='Second coefficient for computing running averages '
+                       'of gradient and its square')
     group.add_argument('--adam-eps', type=float, default=1e-08,
                        help='Term added to the denominator to improve'
                        'numerical stability')
@@ -425,7 +447,7 @@ def _add_learning_rate_args(parser):
                        help='number of samples to linearly warmup '
                        'learning rate over.')
     group.add_argument('--warmup', type=int, default=None,
-                       help='Old lr warmup argument, do not use. Use one of the '
+                       help='Old lr warmup argument, do not use. Use one of the'
                        '--lr-warmup-* arguments above')
     group.add_argument('--min-lr', type=float, default=0.0,
                        help='Minumum value for learning rate. The scheduler'
@@ -525,12 +547,14 @@ def _add_distributed_args(parser):
     group.add_argument('--local_rank', type=int, default=None,
                        help='local rank passed from distributed launcher.')
     group.add_argument('--lazy-mpu-init', type=bool, required=False,
-                       help='If set to True, initialize_megatron() skips DDP initialization'
-                       ' and returns function to complete it instead.'
-                       'Also turns on --use-cpu-initialization flag.'
-                       'This is for external DDP manager.' )
-    group.add_argument('--use-cpu-initialization', action='store_true', default=None,
-                       help='If set, affine parallel weights initialization uses CPU' )
+                       help='If set to True, initialize_megatron() '
+                       'skips DDP initialization and returns function to '
+                       'complete it instead.Also turns on '
+                       '--use-cpu-initialization flag. This is for '
+                       'external DDP manager.' )
+    group.add_argument('--use-cpu-initialization', action='store_true',
+                       default=None, help='If set, affine parallel weights '
+                       'initialization uses CPU' )
 
     return parser
@@ -616,19 +640,22 @@ def _add_realm_args(parser):
     # network size
     group.add_argument('--ict-head-size', type=int, default=None,
-                       help='Size of block embeddings to be used in ICT and REALM (paper default: 128)')
+                       help='Size of block embeddings to be used in ICT and '
+                       'REALM (paper default: 128)')
 
     # checkpointing
     group.add_argument('--ict-load', type=str, default=None,
                        help='Directory containing an ICTBertModel checkpoint')
     group.add_argument('--bert-load', type=str, default=None,
-                       help='Directory containing an BertModel checkpoint (needed to start ICT and REALM)')
+                       help='Directory containing an BertModel checkpoint '
+                       '(needed to start ICT and REALM)')
 
     # data
     group.add_argument('--titles-data-path', type=str, default=None,
                        help='Path to titles dataset used for ICT')
     group.add_argument('--query-in-block-prob', type=float, default=0.1,
-                       help='Probability of keeping query in block for ICT dataset')
+                       help='Probability of keeping query in block for '
+                       'ICT dataset')
     group.add_argument('--use-one-sent-docs', action='store_true',
                        help='Whether to use one sentence documents in ICT')
@@ -644,9 +671,11 @@ def _add_realm_args(parser):
     # indexer
     group.add_argument('--indexer-batch-size', type=int, default=128,
-                       help='How large of batches to use when doing indexing jobs')
+                       help='How large of batches to use when doing indexing '
+                       'jobs')
     group.add_argument('--indexer-log-interval', type=int, default=1000,
-                       help='After how many batches should the indexer report progress')
+                       help='After how many batches should the indexer '
+                       'report progress')
 
     return parser
@@ -712,20 +712,24 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                        total_loss_dict[skipped_iters_key]
 
     # Tensorboard values.
-    if writer and is_last_rank():
-        writer.add_scalar('learning-rate', learning_rate, iteration)
-        writer.add_scalar('learning-rate vs samples', learning_rate,
-                          args.consumed_train_samples)
-        writer.add_scalar('batch-size', batch_size, iteration)
-        writer.add_scalar('batch-size vs samples', batch_size,
-                          args.consumed_train_samples)
+    if writer and (iteration % args.tensorboard_log_interval == 0 ) and \
+       is_last_rank():
+        if args.log_learning_rate_to_tensorboard:
+            writer.add_scalar('learning-rate', learning_rate, iteration)
+            writer.add_scalar('learning-rate vs samples', learning_rate,
+                              args.consumed_train_samples)
+        if args.log_batch_size_to_tensorboard:
+            writer.add_scalar('batch-size', batch_size, iteration)
+            writer.add_scalar('batch-size vs samples', batch_size,
+                              args.consumed_train_samples)
         for key in loss_dict:
             writer.add_scalar(key , loss_dict[key], iteration)
             writer.add_scalar(key + ' vs samples', loss_dict[key],
                               args.consumed_train_samples)
-        writer.add_scalar('loss-scale', loss_scale, iteration)
-        writer.add_scalar('loss-scale vs samples', loss_scale,
-                          args.consumed_train_samples)
+        if args.log_loss_scale_to_tensorboard:
+            writer.add_scalar('loss-scale', loss_scale, iteration)
+            writer.add_scalar('loss-scale vs samples', loss_scale,
+                              args.consumed_train_samples)
         if grad_norm is not None:
             writer.add_scalar('grad-norm', grad_norm, iteration)
             writer.add_scalar('grad-norm vs samples', grad_norm,
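
The hunk above changes the tensorboard block in training_log so scalars are written only every --tensorboard-log-interval iterations, only from the last rank, and only when the matching --log-*-to-tensorboard setting is enabled. Below is a small standalone sketch of that gating pattern, assuming a torch.utils.tensorboard SummaryWriter, a SimpleNamespace standing in for the parsed args, and a stub is_last_rank(); it is an illustration, not the actual training_log code:

# Sketch of the gated tensorboard logging pattern (assumed names noted above).
from types import SimpleNamespace
from torch.utils.tensorboard import SummaryWriter

args = SimpleNamespace(tensorboard_log_interval=10,
                       log_learning_rate_to_tensorboard=True,
                       consumed_train_samples=0)
writer = SummaryWriter(log_dir='./tb_sketch')

def is_last_rank():
    # Single-process stand-in; Megatron checks the torch.distributed rank.
    return True

def log_step(learning_rate, iteration):
    # Write scalars only every tensorboard_log_interval iterations, only from
    # the last rank, and only if the corresponding flag is enabled.
    if writer and (iteration % args.tensorboard_log_interval == 0) and \
       is_last_rank():
        if args.log_learning_rate_to_tensorboard:
            writer.add_scalar('learning-rate', learning_rate, iteration)
            writer.add_scalar('learning-rate vs samples', learning_rate,
                              args.consumed_train_samples)

log_step(learning_rate=1e-4, iteration=10)
writer.close()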
@@ -734,15 +738,17 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
             writer.add_scalar('params-norm', params_norm, iteration)
             writer.add_scalar('params-norm vs samples', params_norm,
                               args.consumed_train_samples)
-        timers.write(timers_to_log, writer, iteration,
-                     normalizer=total_iterations)
+        if args.log_timers_to_tensorboard:
+            timers.write(timers_to_log, writer, iteration,
+                         normalizer=total_iterations)
 
     if iteration % args.log_interval == 0:
         elapsed_time = timers('interval time').elapsed()
         elapsed_time_per_iteration = elapsed_time / total_iterations
         if writer and torch.distributed.get_rank() == 0:
-            writer.add_scalar('iteration-time',
-                              elapsed_time_per_iteration, iteration)
+            if args.log_timers_to_tensorboard:
+                writer.add_scalar('iteration-time',
+                                  elapsed_time_per_iteration, iteration)
         log_string = ' iteration {:8d}/{:8d} |'.format(
             iteration, args.train_iters)
         log_string += ' consumed samples: {:12d} |'.format(
@@ -955,15 +961,17 @@ def evaluate_and_print_results(prefix, forward_step_func,
         ppl = math.exp(min(20, total_loss_dict[key].item()))
         string += '{} PPL: {:.6E} | '.format(key, ppl)
         if writer and is_last_rank():
-            writer.add_scalar('{} value-validation'.format(key),
+            writer.add_scalar('{} validation'.format(key),
                               total_loss_dict[key].item(),
                               iteration)
-            writer.add_scalar('{} ppl-validation'.format(key), ppl, iteration)
-            writer.add_scalar('{} value-validation vs samples'.format(key),
+            writer.add_scalar('{} validation vs samples'.format(key),
                               total_loss_dict[key].item(),
                               args.consumed_train_samples)
-            writer.add_scalar('{} ppl-validation vs samples'.format(key), ppl,
-                              args.consumed_train_samples)
+            if args.log_validation_ppl_to_tensorboard:
+                writer.add_scalar('{} validation ppl'.format(key), ppl,
+                                  iteration)
+                writer.add_scalar('{} validation ppl vs samples'.format(key),
+                                  ppl, args.consumed_train_samples)
 
     length = len(string) + 1
     print_rank_last('-' * length)
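
For reference, the validation perplexity logged above is derived from the averaged validation loss as exp(loss), with the loss clamped to 20 before exponentiation so the result cannot overflow, and writing it to tensorboard is now opt-in via --log-validation-ppl-to-tensorboard. A tiny numeric check of that formula:

# ppl = exp(min(20, loss)); the clamp keeps exp() finite for very large losses.
import math

for loss in (2.3, 7.0, 42.0):
    ppl = math.exp(min(20, loss))
    print('loss {:5.1f} -> PPL {:.6E}'.format(loss, ppl))
# loss 2.3 -> PPL ~9.97, loss 7.0 -> PPL ~1.10E+03,
# loss 42.0 -> PPL capped at exp(20) ~ 4.85E+08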