OpenDAS / Megatron-LM

Commit 87b8b9dc authored Mar 04, 2021 by Jared Casper
Merge branch 'rc-debug-underflow' into 'main'

Rc debug underflow

See merge request ADLR/megatron-lm!246

parents 83d26f03 57437cb1
Showing 5 changed files with 72 additions and 20 deletions
megatron/arguments.py             +2   -0
megatron/optimizer/__init__.py    +2   -2
megatron/optimizer/clip_grads.py  +28  -0
megatron/optimizer/optimizer.py   +23  -8
megatron/training.py              +17  -10
megatron/arguments.py

@@ -308,6 +308,8 @@ def _add_logging_args(parser):
     group.add_argument('--log-params-norm', action='store_true',
                        help='If set, calculate and log parameters norm.')
+    group.add_argument('--log-num-zeros-in-grad', action='store_true',
+                       help='If set, calculate and log the number of zeros in gradient.')
     group.add_argument('--tensorboard-log-interval', type=int, default=1,
                        help='Report to tensorboard interval.')
     group.add_argument('--tensorboard-queue-size', type=int, default=1000,
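Since '--log-num-zeros-in-grad' is a plain argparse store_true switch, the new logging stays off unless the flag appears on the launch command. A minimal, self-contained sketch of that behavior (plain argparse, not Megatron's _add_logging_args/argument-parsing machinery):

# Minimal sketch of how the new store_true flag parses; this is plain
# argparse, not Megatron code.
import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group(title='logging')
group.add_argument('--log-num-zeros-in-grad', action='store_true',
                   help='If set, calculate and log the number of zeros in gradient.')

print(parser.parse_args([]).log_num_zeros_in_grad)                           # False
print(parser.parse_args(['--log-num-zeros-in-grad']).log_num_zeros_in_grad)  # True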
megatron/optimizer/__init__.py

@@ -84,7 +84,7 @@ def get_megatron_optimizer(model):
                 hysteresis=args.hysteresis)

         # Megatron optimizer.
         return FP16OptimizerWithFP16Params(optimizer, grad_scaler,
-                                           args.clip_grad)
+                                           args.clip_grad, args.log_num_zeros_in_grad)

     # FP32.
-    return FP32Optimizer(optimizer, args.clip_grad)
+    return FP32Optimizer(optimizer, args.clip_grad, args.log_num_zeros_in_grad)
megatron/optimizer/clip_grads.py

@@ -118,3 +118,31 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
                          clip_coeff)

     return total_norm
+
+
+def count_zeros_fp32(parameters):
+
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+
+    # Filter parameters based on:
+    #   - grad should not be none
+    #   - parameter should not be shared
+    #   - should not be a replica due to tensor model parallelism
+    total_num_zeros = 0.0
+    for param in parameters:
+        grad_not_none = param.grad is not None
+        is_not_shared = param_is_not_shared(param)
+        is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
+        if grad_not_none and is_not_shared and is_not_tp_duplicate:
+            grad = param.grad.detach()
+            num_zeros = grad.numel() - torch.count_nonzero(grad)
+            total_num_zeros = num_zeros + total_num_zeros
+
+    # Sum across all model-parallel GPUs.
+    torch.distributed.all_reduce(total_num_zeros,
+                                 op=torch.distributed.ReduceOp.SUM,
+                                 group=mpu.get_model_parallel_group())
+    total_num_zeros = total_num_zeros.item()
+
+    return total_num_zeros
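To make the intent of count_zeros_fp32 concrete, here is a minimal single-process sketch of the same per-gradient zero count. It drops the shared-parameter and tensor-parallel filtering and the model-parallel all_reduce, and the toy model is purely illustrative:

# Single-process sketch of the zero-counting logic above (illustrative only;
# no param_is_not_shared / tensor-parallel filtering, no all_reduce).
import torch

model = torch.nn.Linear(4, 2)              # toy model, not Megatron
model(torch.randn(8, 4)).sum().backward()  # populate .grad on the parameters

total_num_zeros = 0.0
for param in model.parameters():
    if param.grad is not None:
        grad = param.grad.detach()
        total_num_zeros += (grad.numel() - torch.count_nonzero(grad)).item()

print('zeros in grad:', total_num_zeros)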
megatron/optimizer/optimizer.py

@@ -27,7 +27,7 @@ from megatron import get_timers
 from megatron import mpu
 from megatron import print_rank_0
-from .clip_grads import clip_grad_norm_fp32
+from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32


 def _zero_grad_group_helper(group, set_to_none):

@@ -65,13 +65,21 @@ class MegatronOptimizer(ABC):
         self.optimizer = optimizer
         assert self.optimizer, 'no optimizer is provided.'

-    def clip_grad_norm(self, clip_grad):
+    def get_parameters(self):
         params = []
         for param_group in self.optimizer.param_groups:
             for param in param_group['params']:
                 params.append(param)
+        return params
+
+    def clip_grad_norm(self, clip_grad):
+        params = self.get_parameters()
         return clip_grad_norm_fp32(params, clip_grad)

+    def count_zeros(self):
+        params = self.get_parameters()
+        return count_zeros_fp32(params)
+
     @abstractmethod
     def zero_grad(self, set_to_none=True):
         pass

@@ -131,11 +139,12 @@ class MegatronOptimizer(ABC):
 class FP16OptimizerWithFP16Params(MegatronOptimizer):

-    def __init__(self, optimizer, grad_scaler, clip_grad):
+    def __init__(self, optimizer, grad_scaler, clip_grad, log_num_zeros_in_grad):

         super(FP16OptimizerWithFP16Params, self).__init__(optimizer)

         self.grad_scaler = grad_scaler
         self.clip_grad = clip_grad
+        self.log_num_zeros_in_grad = log_num_zeros_in_grad

         # Tensor used to determine if a nan/if has happend.
         # Any non-zero value indicates inf/nan.

@@ -289,7 +298,6 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
     def reload_model_params(self):
         self._copy_model_params_to_main_params()

-
     @torch.no_grad()
     def step(self):

@@ -311,7 +319,7 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
         # If we found inf/nan, skip the update.
         if found_inf_flag:
-            return False, None
+            return False, None, None

         # Clip the main gradients.
         timers('optimizer-clip-main-grad').start()

@@ -320,6 +328,9 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
             grad_norm = self.clip_grad_norm(self.clip_grad)
         timers('optimizer-clip-main-grad').stop()

+        # count the zeros in the grads
+        num_zeros_in_grad = self.count_zeros() if \
+                            self.log_num_zeros_in_grad else None
+
         # Step the optimizer.
         self.optimizer.step()

@@ -329,7 +340,7 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
         timers('optimizer-copy-main-to-model-params').stop()

         # Successful update.
-        return True, grad_norm
+        return True, grad_norm, num_zeros_in_grad

     def state_dict(self):

@@ -370,10 +381,11 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
 class FP32Optimizer(MegatronOptimizer):

-    def __init__(self, optimizer, clip_grad):
+    def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad):

         super(FP32Optimizer, self).__init__(optimizer)
         self.clip_grad = clip_grad
+        self.log_num_zeros_in_grad = log_num_zeros_in_grad
         self._scale = torch.cuda.FloatTensor([1.0])

@@ -398,11 +410,14 @@ class FP32Optimizer(MegatronOptimizer):
         if self.clip_grad > 0.0:
             grad_norm = self.clip_grad_norm(self.clip_grad)

+        # count the zeros in the grads
+        num_zeros_in_grad = self.count_zeros() if \
+                            self.log_num_zeros_in_grad else None
+
         # Update parameters.
         self.optimizer.step()

         # No overflow for FP32 optimizer.
-        return True, grad_norm
+        return True, grad_norm, num_zeros_in_grad

     def reload_model_params(self):
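The net effect of these changes is that step() on every Megatron optimizer wrapper now returns a three-element tuple instead of two, with None placeholders when the update is skipped or zero counting is disabled. A small sketch of the new contract, using a stand-in object rather than a real MegatronOptimizer:

# Sketch of the updated step() return shape. MockOptimizer is a stand-in;
# only the (update_successful, grad_norm, num_zeros_in_grad) tuple matters.
class MockOptimizer:
    def step(self):
        # grad_norm and num_zeros_in_grad are both None when the fp16 path
        # hits inf/nan; num_zeros_in_grad is None when the flag is not set.
        return True, 1.732, 52480.0

update_successful, grad_norm, num_zeros_in_grad = MockOptimizer().step()
if update_successful and num_zeros_in_grad is not None:
    print('grad norm: {:.3f} | num zeros: {:.1f}'.format(grad_norm,
                                                         num_zeros_in_grad))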
megatron/training.py

@@ -378,7 +378,7 @@ def train_step(forward_step_func, data_iterator,
     # Update parameters.
     timers('optimizer').start()
-    update_successful, grad_norm = optimizer.step()
+    update_successful, grad_norm, num_zeros_in_grad = optimizer.step()
     timers('optimizer').stop()

     # Update learning rate.

@@ -397,13 +397,13 @@ def train_step(forward_step_func, data_iterator,
         for key in losses_reduced[0]:
             losses_reduced_for_key = [x[key] for x in losses_reduced]
             loss_reduced[key] = sum(losses_reduced_for_key) / len(losses_reduced_for_key)
-        return loss_reduced, skipped_iter, grad_norm
-    return {}, skipped_iter, grad_norm
+        return loss_reduced, skipped_iter, grad_norm, num_zeros_in_grad
+    return {}, skipped_iter, grad_norm, num_zeros_in_grad


 def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                  loss_scale, report_memory_flag, skipped_iter,
-                 grad_norm, params_norm):
+                 grad_norm, params_norm, num_zeros_in_grad):
     """Log training information such as losses, timing, ...."""
     args = get_args()
     timers = get_timers()

@@ -492,6 +492,10 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
             writer.add_scalar('grad-norm', grad_norm, iteration)
             writer.add_scalar('grad-norm vs samples', grad_norm,
                               args.consumed_train_samples)
+        if num_zeros_in_grad is not None:
+            writer.add_scalar('num-zeros', num_zeros_in_grad, iteration)
+            writer.add_scalar('num-zeros vs samples', num_zeros_in_grad,
+                              args.consumed_train_samples)
         if params_norm is not None:
             writer.add_scalar('params-norm', params_norm, iteration)
             writer.add_scalar('params-norm vs samples', params_norm,

@@ -526,6 +530,8 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         log_string += ' loss scale: {:.1f} |'.format(loss_scale)
         if grad_norm is not None:
             log_string += ' grad norm: {:.3f} |'.format(grad_norm)
+        if num_zeros_in_grad is not None:
+            log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad)
         if params_norm is not None:
             log_string += ' params norm: {:.3f} |'.format(params_norm)
         log_string += ' number of skipped iterations: {:3d} |'.format(

@@ -581,7 +587,8 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
     report_memory_flag = True
     while iteration < args.train_iters:
         update_num_microbatches(args.consumed_train_samples)
-        loss_dict, skipped_iter, grad_norm = train_step(forward_step_func,
+        loss_dict, skipped_iter, grad_norm, num_zeros_in_grad = \
+            train_step(forward_step_func,
                        train_data_iterator,
                        model,
                        optimizer,

@@ -600,7 +607,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
                                               optimizer.param_groups[0]['lr'],
                                               iteration, loss_scale,
                                               report_memory_flag, skipped_iter,
-                                              grad_norm, params_norm)
+                                              grad_norm, params_norm, num_zeros_in_grad)

         # Autoresume
         if args.adlr_autoresume and \
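For reference, the new ' num zeros: {:.1f} |' fragment slots into the existing pipe-delimited training log line right after the grad-norm entry. A standalone rendering with made-up values:

# Standalone rendering of the extended log-line fragment (made-up values).
log_string = ''
grad_norm, num_zeros_in_grad, params_norm = 1.732, 52480.0, 98.415
if grad_norm is not None:
    log_string += ' grad norm: {:.3f} |'.format(grad_norm)
if num_zeros_in_grad is not None:
    log_string += ' num zeros: {:.1f} |'.format(num_zeros_in_grad)
if params_norm is not None:
    log_string += ' params norm: {:.3f} |'.format(params_norm)
print(log_string)
# -> ' grad norm: 1.732 | num zeros: 52480.0 | params norm: 98.415 |'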