OpenDAS / Megatron-LM

Commit 3a26a168
Authored Jan 28, 2021 by mohammad
Parent: 16db4a2c

    added options for tensorboard logging
Showing 2 changed files with 76 additions and 39 deletions:

    megatron/arguments.py    +51 -22
    megatron/training.py     +25 -17
megatron/arguments.py
@@ -160,7 +160,8 @@ def parse_args(extra_args_provider=None, defaults={},
             'expected sample-based learnig rate warmup'
     if args.lr_warmup_fraction is not None:
         assert args.lr_warmup_samples == 0, \
-            'can only specify one of lr-warmup-fraction and lr-warmup-samples'
+            'can only specify one of lr-warmup-fraction ' \
+            'and lr-warmup-samples'
 
     # Check required arguments.
     required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
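The check above makes lr-warmup-fraction and lr-warmup-samples mutually exclusive. A minimal sketch of that validation, with illustrative names and values rather than code from the commit:

# Hypothetical helper mirroring the assertion above: lr-warmup-fraction
# scales the total sample count, lr-warmup-samples gives the warmup length
# directly, and only one of the two may be set.
def warmup_length(train_samples, lr_warmup_fraction=None, lr_warmup_samples=0):
    if lr_warmup_fraction is not None:
        assert lr_warmup_samples == 0, \
            'can only specify one of lr-warmup-fraction and lr-warmup-samples'
        return int(lr_warmup_fraction * train_samples)
    return lr_warmup_samples

print(warmup_length(1000000, lr_warmup_fraction=0.01))  # 10000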
@@ -242,13 +243,15 @@ def _add_network_size_args(parser):
     group.add_argument('--hidden-size', type=int, default=None,
                        help='Tansformer hidden size.')
     group.add_argument('--ffn-hidden-size', type=int, default=None,
-                       help='Transformer Feed-Forward Network hidden size. This is set to 4*hidden-size if not '
-                       'provided')
+                       help='Transformer Feed-Forward Network hidden size. '
+                       'This is set to 4*hidden-size if not provided')
     group.add_argument('--num-attention-heads', type=int, default=None,
                        help='Number of transformer attention heads.')
     group.add_argument('--kv-channels', type=int, default=None,
-                       help='Projection weights dimension in multi-head attention. '
-                       'This is set to args.hidden_size // args.num_attention_heads if not provided.')
+                       help='Projection weights dimension in multi-head '
+                       'attention. This is set to '
+                       ' args.hidden_size // args.num_attention_heads '
+                       'if not provided.')
     group.add_argument('--max-position-embeddings', type=int, default=None,
                        help='Maximum number of position embeddings to use. '
                        'This is the size of position embedding.')
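For reference, a worked example of the defaults these help strings describe, with illustrative values:

# Defaults described above (values are illustrative, not from the commit):
hidden_size = 1024
num_attention_heads = 16

ffn_hidden_size = 4 * hidden_size                 # 4096 when --ffn-hidden-size is omitted
kv_channels = hidden_size // num_attention_heads  # 64 when --kv-channels is omitted
print(ffn_hidden_size, kv_channels)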
@@ -266,7 +269,8 @@ def _add_network_size_args(parser):
                        'should not be used unless for backward compatibility'
                        'reasons.')
     group.add_argument('--onnx-safe', type=bool, required=False,
-                       help='Use workarounds for known problems with Torch ONNX exporter')
+                       help='Use workarounds for known problems with '
+                       'Torch ONNX exporter')
     group.add_argument('--bert-no-binary-head', action='store_false',
                        help='Disable BERT binary head.',
                        dest='bert_binary_head')
@@ -279,6 +283,24 @@ def _add_logging_args(parser):
     group.add_argument('--log-params-norm', action='store_true',
                        help='If set, calculate and log parameters norm.')
+    group.add_argument('--tensorboard-log-interval', type=int, default=1,
+                       help='Report to tensorboard interval.')
+    group.add_argument('--log-timers-to-tensorboard', action='store_true',
+                       help='If set, write timers to tensorboard.')
+    group.add_argument('--log-batch-size-to-tensorboard', action='store_true',
+                       help='If set, write batch-size to tensorboard.')
+    group.add_argument('--no-log-learnig-rate-to-tensorboard',
+                       action='store_false',
+                       help='Disable learning rate logging to tensorboard.',
+                       dest='log_learning_rate_to_tensorboard')
+    group.add_argument('--no-log-loss-scale-to-tensorboard',
+                       action='store_false',
+                       help='Disable loss-scale logging to tensorboard.',
+                       dest='log_loss_scale_to_tensorboard')
+    group.add_argument('--log-validation-ppl-to-tensorboard',
+                       action='store_true',
+                       help='If set, write validation perplexity to '
+                       'tensorboard.')
 
     return parser
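The new --no-log-*-to-tensorboard options use argparse's store_false action with an explicit dest, so each kind of logging stays on by default and the flag switches it off. A self-contained sketch of that pattern:

import argparse

# With store_false, the destination defaults to True and flips to False
# when the flag is passed; dest maps the negated flag name onto the
# positive attribute the rest of the code reads.
parser = argparse.ArgumentParser()
parser.add_argument('--no-log-loss-scale-to-tensorboard',
                    action='store_false',
                    dest='log_loss_scale_to_tensorboard')

print(parser.parse_args([]).log_loss_scale_to_tensorboard)
# True
print(parser.parse_args(
    ['--no-log-loss-scale-to-tensorboard']).log_loss_scale_to_tensorboard)
# False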
@@ -295,11 +317,11 @@ def _add_regularization_args(parser):
     group.add_argument('--clip-grad', type=float, default=1.0,
                        help='Gradient clipping based on global L2 norm.')
     group.add_argument('--adam-beta1', type=float, default=0.9,
-                       help='First coefficient for computing running averages of '
-                       'gradient and its square')
+                       help='First coefficient for computing running averages '
+                       'of gradient and its square')
     group.add_argument('--adam-beta2', type=float, default=0.999,
-                       help='Second coefficient for computing running averages of '
-                       'gradient and its square')
+                       help='Second coefficient for computing running averages '
+                       'of gradient and its square')
     group.add_argument('--adam-eps', type=float, default=1e-08,
                        help='Term added to the denominator to improve'
                        'numerical stability')
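As background for the adam-beta1/adam-beta2/adam-eps help strings, a sketch of the textbook Adam moment recurrences they parameterize (bias correction omitted for brevity; this is not Megatron-LM's optimizer code):

# beta1 and beta2 are the decay rates of exponential running averages of the
# gradient and of its square (Adam's first and second moments); eps keeps
# the division numerically stable.
beta1, beta2, eps = 0.9, 0.999, 1e-08
m = v = 0.0
for step, g in enumerate([0.5, -0.2, 0.1], start=1):  # dummy gradients
    m = beta1 * m + (1 - beta1) * g        # running average of gradient
    v = beta2 * v + (1 - beta2) * g * g    # running average of its square
    update = m / (v ** 0.5 + eps)          # direction of the weight update
    print(step, round(update, 4))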
@@ -425,7 +447,7 @@ def _add_learning_rate_args(parser):
                        help='number of samples to linearly warmup '
                        'learning rate over.')
     group.add_argument('--warmup', type=int, default=None,
-                       help='Old lr warmup argument, do not use. Use one of the '
+                       help='Old lr warmup argument, do not use. Use one of the'
                        '--lr-warmup-* arguments above')
     group.add_argument('--min-lr', type=float, default=0.0,
                        help='Minumum value for learning rate. The scheduler'
@@ -525,12 +547,14 @@ def _add_distributed_args(parser):
     group.add_argument('--local_rank', type=int, default=None,
                        help='local rank passed from distributed launcher.')
     group.add_argument('--lazy-mpu-init', type=bool, required=False,
-                       help='If set to True, initialize_megatron() skips DDP initialization'
-                       ' and returns function to complete it instead.'
-                       'Also turns on --use-cpu-initialization flag.'
-                       'This is for external DDP manager.')
+                       help='If set to True, initialize_megatron() '
+                       'skips DDP initialization and returns function to '
+                       'complete it instead.Also turns on '
+                       '--use-cpu-initialization flag. This is for '
+                       'external DDP manager.')
     group.add_argument('--use-cpu-initialization', action='store_true',
                        default=None,
-                       help='If set, affine parallel weights initialization uses CPU')
+                       help='If set, affine parallel weights '
+                       'initialization uses CPU')
 
     return parser
@@ -616,19 +640,22 @@ def _add_realm_args(parser):
     # network size
     group.add_argument('--ict-head-size', type=int, default=None,
-                       help='Size of block embeddings to be used in ICT and REALM (paper default: 128)')
+                       help='Size of block embeddings to be used in ICT and '
+                       'REALM (paper default: 128)')
 
     # checkpointing
     group.add_argument('--ict-load', type=str, default=None,
                        help='Directory containing an ICTBertModel checkpoint')
     group.add_argument('--bert-load', type=str, default=None,
-                       help='Directory containing an BertModel checkpoint (needed to start ICT and REALM)')
+                       help='Directory containing an BertModel checkpoint '
+                       '(needed to start ICT and REALM)')
 
     # data
     group.add_argument('--titles-data-path', type=str, default=None,
                        help='Path to titles dataset used for ICT')
     group.add_argument('--query-in-block-prob', type=float, default=0.1,
-                       help='Probability of keeping query in block for ICT dataset')
+                       help='Probability of keeping query in block for '
+                       'ICT dataset')
     group.add_argument('--use-one-sent-docs', action='store_true',
                        help='Whether to use one sentence documents in ICT')
@@ -644,9 +671,11 @@ def _add_realm_args(parser):
     # indexer
     group.add_argument('--indexer-batch-size', type=int, default=128,
-                       help='How large of batches to use when doing indexing jobs')
+                       help='How large of batches to use when doing indexing '
+                       'jobs')
     group.add_argument('--indexer-log-interval', type=int, default=1000,
-                       help='After how many batches should the indexer report progress')
+                       help='After how many batches should the indexer '
+                       'report progress')
 
     return parser
megatron/training.py
@@ -712,20 +712,24 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                           total_loss_dict[skipped_iters_key]
 
     # Tensorboard values.
-    if writer and is_last_rank():
-        writer.add_scalar('learning-rate', learning_rate, iteration)
-        writer.add_scalar('learning-rate vs samples', learning_rate,
-                          args.consumed_train_samples)
-        writer.add_scalar('batch-size', batch_size, iteration)
-        writer.add_scalar('batch-size vs samples', batch_size,
-                          args.consumed_train_samples)
+    if writer and (iteration % args.tensorboard_log_interval == 0) and \
+       is_last_rank():
+        if args.log_learning_rate_to_tensorboard:
+            writer.add_scalar('learning-rate', learning_rate, iteration)
+            writer.add_scalar('learning-rate vs samples', learning_rate,
+                              args.consumed_train_samples)
+        if args.log_batch_size_to_tensorboard:
+            writer.add_scalar('batch-size', batch_size, iteration)
+            writer.add_scalar('batch-size vs samples', batch_size,
+                              args.consumed_train_samples)
         for key in loss_dict:
             writer.add_scalar(key, loss_dict[key], iteration)
             writer.add_scalar(key + ' vs samples', loss_dict[key],
                               args.consumed_train_samples)
-        writer.add_scalar('loss-scale', loss_scale, iteration)
-        writer.add_scalar('loss-scale vs samples', loss_scale,
-                          args.consumed_train_samples)
+        if args.log_loss_scale_to_tensorboard:
+            writer.add_scalar('loss-scale', loss_scale, iteration)
+            writer.add_scalar('loss-scale vs samples', loss_scale,
+                              args.consumed_train_samples)
         if grad_norm is not None:
             writer.add_scalar('grad-norm', grad_norm, iteration)
             writer.add_scalar('grad-norm vs samples', grad_norm,
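This hunk gates every TensorBoard write on the new tensorboard-log-interval and on a per-scalar flag. A minimal, runnable sketch of the same pattern, assuming torch's SummaryWriter and illustrative names and values:

from torch.utils.tensorboard import SummaryWriter

# Scalars are written only every `tensorboard_log_interval` iterations, and
# each scalar family sits behind its own opt-in/opt-out flag, mirroring the
# gating above. Values here are dummies.
writer = SummaryWriter(log_dir='./tb_sketch')
tensorboard_log_interval = 10
log_learning_rate_to_tensorboard = True

for iteration in range(1, 101):
    learning_rate = 1.0e-4 * iteration  # dummy schedule
    if iteration % tensorboard_log_interval == 0:
        if log_learning_rate_to_tensorboard:
            writer.add_scalar('learning-rate', learning_rate, iteration)
writer.close()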
@@ -734,15 +738,17 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
             writer.add_scalar('params-norm', params_norm, iteration)
             writer.add_scalar('params-norm vs samples', params_norm,
                               args.consumed_train_samples)
-        timers.write(timers_to_log, writer, iteration,
-                     normalizer=total_iterations)
+        if args.log_timers_to_tensorboard:
+            timers.write(timers_to_log, writer, iteration,
+                         normalizer=total_iterations)
 
     if iteration % args.log_interval == 0:
         elapsed_time = timers('interval time').elapsed()
         elapsed_time_per_iteration = elapsed_time / total_iterations
         if writer and torch.distributed.get_rank() == 0:
-            writer.add_scalar('iteration-time',
-                              elapsed_time_per_iteration, iteration)
+            if args.log_timers_to_tensorboard:
+                writer.add_scalar('iteration-time',
+                                  elapsed_time_per_iteration, iteration)
         log_string = ' iteration {:8d}/{:8d} |'.format(
             iteration, args.train_iters)
         log_string += ' consumed samples: {:12d} |'.format(
@@ -958,12 +964,14 @@ def evaluate_and_print_results(prefix, forward_step_func,
             writer.add_scalar('{} value-validation'.format(key),
                               total_loss_dict[key].item(),
                               iteration)
-            writer.add_scalar('{} ppl-validation'.format(key), ppl, iteration)
             writer.add_scalar('{} value-validation vs samples'.format(key),
                               total_loss_dict[key].item(),
                               args.consumed_train_samples)
-            writer.add_scalar('{} ppl-validation vs samples'.format(key), ppl,
-                              args.consumed_train_samples)
+            if args.log_validation_ppl_to_tensorboard:
+                writer.add_scalar('{} ppl-validation'.format(key), ppl,
+                                  iteration)
+                writer.add_scalar('{} ppl-validation vs samples'.format(key),
+                                  ppl, args.consumed_train_samples)
 
     length = len(string) + 1
     print_rank_last('-' * length)
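The validation perplexity logged above is derived from the mean validation cross-entropy loss. A sketch of that relationship; the clamp constant is an assumption for illustration, not taken from this diff:

import math

# Perplexity is the exponential of the mean cross-entropy loss; clamping the
# loss keeps early-training values finite. The 20.0 bound is illustrative.
def perplexity(mean_loss: float) -> float:
    return math.exp(min(20.0, mean_loss))

print(perplexity(2.3))  # ~9.97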