wxj / Megatron-LM
Commit 4e77e7c6
Authored Feb 23, 2021 by Rewon Child

Add option to log zeros

Parent: 872e38ea
Showing 5 changed files with 78 additions and 20 deletions (+78, -20)
megatron/arguments.py             +2   -0
megatron/optimizer/__init__.py    +2   -2
megatron/optimizer/clip_grads.py  +35  -0
megatron/optimizer/optimizer.py   +23  -8
megatron/training.py              +16  -10
megatron/arguments.py

@@ -283,6 +283,8 @@ def _add_logging_args(parser):
     group.add_argument('--log-params-norm', action='store_true',
                        help='If set, calculate and log parameters norm.')
+    group.add_argument('--log-zeros', action='store_true',
+                       help='If set, calculate and log the number of zeros in gradient.')
     group.add_argument('--tensorboard-log-interval', type=int, default=1,
                        help='Report to tensorboard interval.')
     group.add_argument('--tensorboard-queue-size', type=int, default=1000,
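The new flag follows the standard argparse store_true pattern: it defaults to False and flips args.log_zeros to True only when --log-zeros appears on the training command line. A minimal, self-contained sketch of that pattern (the parser and group below are stand-ins, not Megatron's actual argument builder):

import argparse

# Stand-in parser mirroring the store_true flag added above.
parser = argparse.ArgumentParser(description='logging-flag sketch')
group = parser.add_argument_group(title='logging')
group.add_argument('--log-zeros', action='store_true',
                   help='If set, calculate and log the number of zeros in gradient.')

args = parser.parse_args(['--log-zeros'])
print(args.log_zeros)  # True; False when the flag is omitted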
megatron/optimizer/__init__.py

@@ -83,7 +83,7 @@ def get_megatron_optimizer(model):
                                        hysteresis=args.hysteresis)
         # Megatron optimizer.
         return FP16OptimizerWithFP16Params(optimizer, grad_scaler,
-                                           args.clip_grad)
+                                           args.clip_grad, args.log_zeros)

     # FP32.
-    return FP32Optimizer(optimizer, args.clip_grad)
+    return FP32Optimizer(optimizer, args.clip_grad, args.log_zeros)
megatron/optimizer/clip_grads.py

@@ -118,3 +118,38 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
                              clip_coeff)
     return total_norm
+
+
+def count_zeros_fp32(parameters):
+
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+
+    # Filter parameters based on:
+    #   - grad should not be none
+    #   - parameter should not be shared
+    #   - should not be a replica due to tensor model parallelism
+    grads_to_count = []
+    for param in parameters:
+        grad_not_none = param.grad is not None
+        is_not_shared = param_is_not_shared(param)
+        is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
+        if grad_not_none and is_not_shared and is_not_tp_duplicate:
+            grad = param.grad.detach()
+            grads_to_count.append(grad)
+
+    # Accumulator for the zero count.
+    total_num_zeros = 0.0
+
+    # Count the zero elements in each gradient.
+    for grad in grads_to_count:
+        num_zeros = grad.numel() - torch.count_nonzero(grad)
+        total_num_zeros = num_zeros + total_num_zeros
+
+    # Sum across all model-parallel GPUs.
+    torch.distributed.all_reduce(total_num_zeros,
+                                 op=torch.distributed.ReduceOp.SUM,
+                                 group=mpu.get_model_parallel_group())
+    total_num_zeros = total_num_zeros.item()
+
+    return total_num_zeros
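The heart of count_zeros_fp32 is the per-tensor count grad.numel() - torch.count_nonzero(grad), accumulated over the filtered gradients and then all-reduced across the model-parallel group. A minimal single-process sketch of just that counting step, using illustrative tensors and skipping the distributed reduction:

import torch

# Two illustrative gradients with 2 and 3 zero elements respectively.
grads = [torch.tensor([0.0, 1.5, 0.0, -2.0]),
         torch.tensor([[0.0, 0.0], [3.0, 0.0]])]

total_num_zeros = 0.0
for grad in grads:
    # numel() minus the nonzero count gives the number of exact zeros.
    num_zeros = grad.numel() - torch.count_nonzero(grad)
    total_num_zeros = num_zeros + total_num_zeros

print(total_num_zeros.item())  # 5 zeros in total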
megatron/optimizer/optimizer.py

@@ -27,7 +27,7 @@ from megatron import get_timers
 from megatron import mpu
 from megatron import print_rank_0
-from .clip_grads import clip_grad_norm_fp32
+from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32


 def _zero_grad_group_helper(group, set_to_none):

@@ -65,13 +65,21 @@ class MegatronOptimizer(ABC):
         self.optimizer = optimizer
         assert self.optimizer, 'no optimizer is provided.'

-    def clip_grad_norm(self, clip_grad):
+    def get_parameters(self):
         params = []
         for param_group in self.optimizer.param_groups:
             for param in param_group['params']:
                 params.append(param)
+        return params
+
+    def clip_grad_norm(self, clip_grad):
+        params = self.get_parameters()
         return clip_grad_norm_fp32(params, clip_grad)

+    def count_zeros(self):
+        params = self.get_parameters()
+        return count_zeros_fp32(params)
+
     @abstractmethod
     def zero_grad(self, set_to_none=True):
         pass

@@ -131,11 +139,12 @@ class MegatronOptimizer(ABC):
 class FP16OptimizerWithFP16Params(MegatronOptimizer):

-    def __init__(self, optimizer, grad_scaler, clip_grad):
+    def __init__(self, optimizer, grad_scaler, clip_grad, log_zeros):
         super(FP16OptimizerWithFP16Params, self).__init__(optimizer)

         self.grad_scaler = grad_scaler
         self.clip_grad = clip_grad
+        self.log_zeros = log_zeros

         # Tensor used to determine if a nan/inf has happened.
         # Any non-zero value indicates inf/nan.

@@ -289,7 +298,6 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
     def reload_model_params(self):
         self._copy_model_params_to_main_params()

-
     @torch.no_grad()
     def step(self):

@@ -311,7 +319,7 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
         # If we found inf/nan, skip the update.
         if found_inf_flag:
-            return False, None
+            return False, None, None

         # Clip the main gradients.
         timers('optimizer-clip-main-grad').start()

@@ -320,6 +328,9 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
             grad_norm = self.clip_grad_norm(self.clip_grad)
         timers('optimizer-clip-main-grad').stop()

+        # Count the zeros in the grads.
+        num_zeros = self.count_zeros() if self.log_zeros else None
+
         # Step the optimizer.
         self.optimizer.step()

@@ -329,7 +340,7 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
         timers('optimizer-copy-main-to-model-params').stop()

         # Successful update.
-        return True, grad_norm
+        return True, grad_norm, num_zeros

     def state_dict(self):

@@ -370,10 +381,11 @@ class FP16OptimizerWithFP16Params(MegatronOptimizer):
 class FP32Optimizer(MegatronOptimizer):

-    def __init__(self, optimizer, clip_grad):
+    def __init__(self, optimizer, clip_grad, log_zeros):
         super(FP32Optimizer, self).__init__(optimizer)

         self.clip_grad = clip_grad
+        self.log_zeros = log_zeros
         self._scale = torch.cuda.FloatTensor([1.0])

@@ -398,11 +410,14 @@ class FP32Optimizer(MegatronOptimizer):
         if self.clip_grad > 0.0:
             grad_norm = self.clip_grad_norm(self.clip_grad)

+        # Count the zeros in the grads.
+        num_zeros = self.count_zeros() if self.log_zeros else None
+
         # Update parameters.
         self.optimizer.step()

         # No overflow for FP32 optimizer.
-        return True, grad_norm
+        return True, grad_norm, num_zeros

     def reload_model_params(self):
megatron/training.py

@@ -621,7 +621,7 @@ def train_step(forward_step_func, data_iterator,
     # Update parameters.
     timers('optimizer').start()
-    update_successfull, grad_norm = optimizer.step()
+    update_successfull, grad_norm, num_zeros = optimizer.step()
     timers('optimizer').stop()

     # Update learning rate.

@@ -640,13 +640,13 @@ def train_step(forward_step_func, data_iterator,
         for key in losses_reduced[0]:
             losses_reduced_for_key = [x[key] for x in losses_reduced]
             loss_reduced[key] = sum(losses_reduced_for_key) / len(losses_reduced_for_key)
-        return loss_reduced, skipped_iter, grad_norm
-    return {}, skipped_iter, grad_norm
+        return loss_reduced, skipped_iter, grad_norm, num_zeros
+    return {}, skipped_iter, grad_norm, num_zeros


 def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                  loss_scale, report_memory_flag, skipped_iter,
-                 grad_norm, params_norm):
+                 grad_norm, params_norm, num_zeros):
     """Log training information such as losses, timing, ...."""
     args = get_args()
     timers = get_timers()

@@ -734,6 +734,10 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
             writer.add_scalar('grad-norm', grad_norm, iteration)
             writer.add_scalar('grad-norm vs samples', grad_norm,
                               args.consumed_train_samples)
+        if num_zeros is not None:
+            writer.add_scalar('num-zeros', num_zeros, iteration)
+            writer.add_scalar('num-zeros vs samples', num_zeros,
+                              args.consumed_train_samples)
         if params_norm is not None:
             writer.add_scalar('params-norm', params_norm, iteration)
             writer.add_scalar('params-norm vs samples', params_norm,

@@ -768,6 +772,8 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         log_string += ' loss scale: {:.1f} |'.format(loss_scale)
         if grad_norm is not None:
             log_string += ' grad norm: {:.3f} |'.format(grad_norm)
+        if num_zeros is not None:
+            log_string += ' num zeros: {:.1f} |'.format(num_zeros)
         if params_norm is not None:
             log_string += ' params norm: {:.3f} |'.format(params_norm)
         log_string += ' number of skipped iterations: {:3d} |'.format(

@@ -822,7 +828,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
     report_memory_flag = True
     while iteration < args.train_iters:
         update_num_microbatches(args.consumed_train_samples)
-        loss_dict, skipped_iter, grad_norm = train_step(forward_step_func,
-                                                        train_data_iterator,
-                                                        model,
-                                                        optimizer,
+        loss_dict, skipped_iter, grad_norm, num_zeros = train_step(forward_step_func,
+                                                                   train_data_iterator,
+                                                                   model,
+                                                                   optimizer,

@@ -841,7 +847,7 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
                                           optimizer.param_groups[0]['lr'],
                                           iteration, loss_scale,
                                           report_memory_flag, skipped_iter,
-                                          grad_norm, params_norm)
+                                          grad_norm, params_norm, num_zeros)

         # Autoresume
         if args.adlr_autoresume and \
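With --log-zeros enabled, the count surfaces in two places: a ' num zeros: ... |' field in the periodic log line and the 'num-zeros' / 'num-zeros vs samples' TensorBoard scalars. A standalone sketch of the same add_scalar pattern, with a throwaway writer and made-up values (the log directory, iteration, and sample count are hypothetical):

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir='runs/log-zeros-demo')  # hypothetical log dir

num_zeros = 12345.0        # would come from optimizer.step() when --log-zeros is set
iteration = 100
consumed_train_samples = 51200  # hypothetical running sample count

if num_zeros is not None:
    writer.add_scalar('num-zeros', num_zeros, iteration)
    writer.add_scalar('num-zeros vs samples', num_zeros, consumed_train_samples)

writer.close()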