OpenDAS / Megatron-LM / Commits / 0aff3629

Commit 0aff3629, authored Mar 04, 2021 by Rewon Child (parent 41a64613)

    Update argument names and fix merge error
Changes: 4 changed files with 27 additions and 30 deletions

  megatron/arguments.py             +1   -1
  megatron/optimizer/__init__.py    +2   -2
  megatron/optimizer/optimizer.py   +8   -8
  megatron/training.py              +16  -19
megatron/arguments.py

@@ -308,7 +308,7 @@ def _add_logging_args(parser):
     group.add_argument('--log-params-norm', action='store_true',
                        help='If set, calculate and log parameters norm.')
-    group.add_argument('--log-zeros', action='store_true',
+    group.add_argument('--log-num-zeros-in-grad', action='store_true',
                        help='If set, calculate and log the number of zeros in gradient.')
     group.add_argument('--tensorboard-log-interval', type=int, default=1,
                        help='Report to tensorboard interval.')
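Note on the rename above: argparse converts dashes in an option string to underscores when it stores the parsed value, so the new flag --log-num-zeros-in-grad surfaces as args.log_num_zeros_in_grad, which is why the attribute name changes in lockstep in the files below. A minimal, self-contained illustration (not repo code):

import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='logging')
group.add_argument('--log-num-zeros-in-grad', action='store_true',
                   help='If set, calculate and log the number of zeros in gradient.')

# Dashes become underscores on the parsed namespace.
args = parser.parse_args(['--log-num-zeros-in-grad'])
assert args.log_num_zeros_in_grad is True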
megatron/optimizer/__init__.py

@@ -84,7 +84,7 @@ def get_megatron_optimizer(model):
                                      hysteresis=args.hysteresis)
         # Megatron optimizer.
         return FP16OptimizerWithFP16Params(optimizer, grad_scaler,
-                                           args.clip_grad, args.log_zeros)
+                                           args.clip_grad, args.log_num_zeros_in_grad)
 
     # FP32.
-    return FP32Optimizer(optimizer, args.clip_grad, args.log_zeros)
+    return FP32Optimizer(optimizer, args.clip_grad, args.log_num_zeros_in_grad)
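For context, the two return statements changed in this hunk are alternative exits from get_megatron_optimizer(): the mixed-precision wrapper and the plain FP32 wrapper. A hedged sketch of how they are typically selected; the gating condition (args.fp16 here) sits outside the displayed context lines and is an assumption, as is the importability of the package:

# Sketch only; assumes the Megatron-LM package is on PYTHONPATH and that an
# args.fp16 flag selects the mixed-precision path (not shown in the hunk).
from megatron.optimizer.optimizer import FP16OptimizerWithFP16Params, FP32Optimizer

def build_wrapped_optimizer(optimizer, grad_scaler, args):
    if args.fp16:  # assumed precision switch
        # Mixed-precision wrapper: needs the grad scaler.
        return FP16OptimizerWithFP16Params(optimizer, grad_scaler,
                                           args.clip_grad,
                                           args.log_num_zeros_in_grad)
    # Plain FP32 wrapper.
    return FP32Optimizer(optimizer, args.clip_grad,
                         args.log_num_zeros_in_grad)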
megatron/optimizer/optimizer.py

@@ -139,12 +139,12 @@ class MegatronOptimizer(ABC):
 class FP16OptimizerWithFP16Params(MegatronOptimizer):
 
-    def __init__(self, optimizer, grad_scaler, clip_grad, log_zeros):
+    def __init__(self, optimizer, grad_scaler, clip_grad, log_num_zeros_in_grad):
         super(FP16OptimizerWithFP16Params, self).__init__(optimizer)
         self.grad_scaler = grad_scaler
         self.clip_grad = clip_grad
-        self.log_zeros = log_zeros
+        self.log_num_zeros_in_grad = log_num_zeros_in_grad
 
         # Tensor used to determine if a nan/if has happend.
         # Any non-zero value indicates inf/nan.

@@ -329,7 +329,7 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
         timers('optimizer-clip-main-grad').stop()
 
         # count the zeros in the grads
-        num_zeros = self.count_zeros() if self.log_zeros else None
+        num_zeros_in_grad = self.count_zeros() if self.log_num_zeros_in_grad else None
 
         # Step the optimizer.
         self.optimizer.step()

@@ -340,7 +340,7 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
         timers('optimizer-copy-main-to-model-params').stop()
 
         # Successful update.
-        return True, grad_norm, num_zeros
+        return True, grad_norm, num_zeros_in_grad
 
     def state_dict(self):

@@ -381,11 +381,11 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
 class FP32Optimizer(MegatronOptimizer):
 
-    def __init__(self, optimizer, clip_grad, log_zeros):
+    def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad):
 
         super(FP32Optimizer, self).__init__(optimizer)
         self.clip_grad = clip_grad
-        self.log_zeros = log_zeros
+        self.log_num_zeros_in_grad = log_num_zeros_in_grad
         self._scale = torch.cuda.FloatTensor([1.0])

@@ -411,13 +411,13 @@ class FP32Optimizer(MegatronOptimizer):
         grad_norm = self.clip_grad_norm(self.clip_grad)
 
         # count the zeros in the grads
-        num_zeros = self.count_zeros() if self.log_zeros else None
+        num_zeros_in_grad = self.count_zeros() if self.log_num_zeros_in_grad else None
 
         # Update parameters.
         self.optimizer.step()
 
         # No overflow for FP32 optimizer.
-        return True, grad_norm, num_zeros
+        return True, grad_norm, num_zeros_in_grad
 
     def reload_model_params(self):
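count_zeros() is called in the hunks above but defined elsewhere in this file. A hedged, single-process sketch of what such a helper can look like; the real method presumably also reduces the count across the data-parallel group, so treat everything here as an assumption rather than the repo's implementation:

import torch

def count_zeros_in_grads(params):
    """Count gradient elements that are exactly zero across a list of parameters.

    Illustrative only: the FP16/FP32 optimizers above call self.count_zeros()
    on their own parameter groups and likely all-reduce the result.
    """
    total = 0
    for p in params:
        if p.grad is not None:
            total += int((p.grad == 0).sum().item())
    return total

# Example: a gradient with two zero entries.
w = torch.nn.Parameter(torch.ones(4))
w.grad = torch.tensor([0.0, 1.0, 0.0, 2.0])
print(count_zeros_in_grads([w]))  # -> 2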
megatron/training.py

@@ -378,11 +378,7 @@ def train_step(forward_step_func, data_iterator,
     # Update parameters.
     timers('optimizer').start()
-<<<<<<< HEAD
-    update_successfull, grad_norm, num_zeros = optimizer.step()
-=======
-    update_successful, grad_norm = optimizer.step()
->>>>>>> main
+    update_successful, grad_norm, num_zeros_in_grad = optimizer.step()
     timers('optimizer').stop()
 
     # Update learning rate.

@@ -401,13 +397,13 @@ def train_step(forward_step_func, data_iterator,
         for key in losses_reduced[0]:
             losses_reduced_for_key = [x[key] for x in losses_reduced]
             loss_reduced[key] = sum(losses_reduced_for_key) / len(losses_reduced_for_key)
-        return loss_reduced, skipped_iter, grad_norm, num_zeros
-    return {}, skipped_iter, grad_norm, num_zeros
+        return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad
+    return {}, skipped_iter, grad_norm, num_zeros_in_grad
 
 
 def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                  loss_scale, report_memory_flag, skipped_iter,
-                 grad_norm, params_norm, num_zeros):
+                 grad_norm, params_norm, num_zeros_in_grad):
     """Log training information such as losses, timing, ...."""
     args = get_args()
     timers = get_timers()

@@ -496,9 +492,9 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
             writer.add_scalar('grad-norm', grad_norm, iteration)
             writer.add_scalar('grad-norm vs samples', grad_norm,
                               args.consumed_train_samples)
-        if num_zeros is not None:
-            writer.add_scalar('num-zeros', num_zeros, iteration)
-            writer.add_scalar('num-zeros vs samples', num_zeros,
+        if num_zeros_in_grad is not None:
+            writer.add_scalar('num-zeros', num_zeros_in_grad, iteration)
+            writer.add_scalar('num-zeros vs samples', num_zeros_in_grad,
                               args.consumed_train_samples)
         if params_norm is not None:
             writer.add_scalar('params-norm', params_norm, iteration)

@@ -534,8 +530,8 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
     log_string += ' loss scale: {:.1f} |'.format(loss_scale)
     if grad_norm is not None:
         log_string += ' grad norm: {:.3f} |'.format(grad_norm)
-    if num_zeros is not None:
-        log_string += ' num zeros: {:.1f} |'.format(num_zeros)
+    if num_zeros_in_grad is not None:
+        log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad)
     if params_norm is not None:
         log_string += ' params norm: {:.3f} |'.format(params_norm)
     log_string += ' number of skipped iterations: {:3d} |'.format(

@@ -591,11 +587,12 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
     report_memory_flag = True
     while iteration < args.train_iters:
         update_num_microbatches(args.consumed_train_samples)
-        loss_dict, skipped_iter, grad_norm, num_zeros = train_step(forward_step_func,
-                                                                   train_data_iterator,
-                                                                   model,
-                                                                   optimizer,
-                                                                   lr_scheduler)
+        loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
+            train_step(forward_step_func,
+                       train_data_iterator,
+                       model,
+                       optimizer,
+                       lr_scheduler)
         iteration += 1
         args.consumed_train_samples += mpu.get_data_parallel_world_size() * \
                                        args.micro_batch_size * \

@@ -610,7 +607,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
                                                optimizer.param_groups[0]['lr'],
                                                iteration, loss_scale,
                                                report_memory_flag, skipped_iter,
-                                               grad_norm, params_norm, num_zeros)
+                                               grad_norm, params_norm, num_zeros_in_grad)
 
         # Autoresume
         if args.adlr_autoresume and \
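Taken together, the training.py hunks settle one calling convention: train_step() returns a 4-tuple whose last element is num_zeros_in_grad (resolving the leftover merge conflict around optimizer.step()), and training_log() takes it as its final parameter. A self-contained stub mirroring that flow; the bodies are placeholders, not the repo's implementations, and only the names and format strings come from the diff above:

def train_step_stub():
    # Placeholder values standing in for a real training step.
    loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = {'lm loss': 1.0}, 0, 0.5, 12
    return loss_dict, skipped_iter, grad_norm, num_zeros_in_grad

def training_log_stub(grad_norm, params_norm, num_zeros_in_grad):
    # Mirrors the log-string construction changed in the last hunks.
    log_string = ''
    if grad_norm is not None:
        log_string += ' grad norm: {:.3f} |'.format(grad_norm)
    if num_zeros_in_grad is not None:
        log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad)
    if params_norm is not None:
        log_string += ' params norm: {:.3f} |'.format(params_norm)
    return log_string

loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = train_step_stub()
print(training_log_stub(grad_norm, None, num_zeros_in_grad))
# -> ' grad norm: 0.500 | num zeros: 12.0 |'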