OpenDAS / Megatron-LM · Commit a58653dc

Commit a58653dc, authored Feb 28, 2022 by Lawrence McAfee
Parent: 371a8828

    clip grad reduce across world. [ not just model group ]

Showing 2 changed files, with 48 additions and 9 deletions (+48 -9):

    megatron/optimizer/clip_grads.py   +33 -3
    megatron/optimizer/optimizer.py    +15 -6
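The change itself is small: the SUM all-reduce of the accumulated gradient norm in `clip_grad_norm_fp32` loses its `group=mpu.get_model_parallel_group()` argument, so the reduction now runs over the default process group, i.e. every rank in the world, rather than only the model-parallel group. As a rough, hypothetical illustration (not code from this commit; it assumes `torch.distributed` is already initialized), the only difference is which process group participates in the sum:

    import torch
    import torch.distributed as dist

    def sum_reduce(norm_contrib: torch.Tensor, subgroup=None) -> torch.Tensor:
        # group=None (the default) means the default process group, i.e. all
        # ranks in the world take part in the SUM; passing a subgroup (for
        # example a model-parallel group) restricts the reduction to it.
        dist.all_reduce(norm_contrib, op=dist.ReduceOp.SUM, group=subgroup)
        return norm_contrib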
megatron/optimizer/clip_grads.py
...
...
@@ -79,10 +79,15 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    # })
    # <<<
    # pax(1, {
    # >>>
    # pax(0, {
    #     "[LOC]" : "[** BEFORE CALC NORM **]",
    #     "max_norm" : max_norm,
    #     "parameters" : parameters,
    #     "grads" : grads,
    #     "grads_for_norm" : grads_for_norm,
    # })
    # <<<

    # Norm parameters.
    max_norm = float(max_norm)
...
...
@@ -127,12 +132,37 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
                grad_norm = torch.norm(grad, norm_type)
                total_norm += grad_norm ** norm_type

        # >>>
        # pax(0, {
        #     "[LOC]" : "[** CALC NORM **]",
        #     "max_norm" : max_norm,
        #     "norm_type" : norm_type,
        #     "grad_norm" : tp(grad_norm),
        #     "total_norm" : tp(total_norm),
        # })
        # <<<

        # Sum across all model-parallel GPUs.
        # >>>
        # torch.distributed.all_reduce(total_norm,
        #                              op=torch.distributed.ReduceOp.SUM,
        #                              group=mpu.get_model_parallel_group())
        # +++
        torch.distributed.all_reduce(total_norm,
                                     op=torch.distributed.ReduceOp.SUM)
        # <<<
        total_norm = total_norm.item() ** (1.0 / norm_type)

    # >>>
    # pax(1, {
    #     "[LOC]" : "[** CALC NORM **]",
    #     "max_norm" : max_norm,
    #     "norm_type" : norm_type,
    #     "grad_norm" : tp(grad_norm),
    #     "total_norm" : tp(total_norm),
    # })
    # <<<

    # Scale.
    clip_coeff = max_norm / (total_norm + 1.0e-6)
    if clip_coeff < 1.0:
...
...
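Taken together, the clip_grads.py hunks accumulate each rank's `grad_norm ** norm_type`, SUM-reduce the total across every rank, take the `norm_type`-th root, and then scale the gradients if the global norm exceeds `max_norm`. A self-contained sketch of that flow (hypothetical function name, not the repository's `clip_grad_norm_fp32`; it assumes `torch.distributed` is initialized and that each rank holds only its own shard of the gradients):

    import torch
    import torch.distributed as dist

    def clip_sharded_grads_(grads, max_norm, norm_type=2):
        # Local contribution: sum of ||g||^norm_type over this rank's gradients.
        total_norm = torch.zeros(1, dtype=torch.float32, device=grads[0].device)
        for g in grads:
            total_norm += torch.norm(g.float(), norm_type) ** norm_type
        # As in the patched code: no group= argument, so the SUM runs over the
        # default (world) process group, not just the model-parallel group.
        dist.all_reduce(total_norm, op=dist.ReduceOp.SUM)
        total_norm = total_norm.item() ** (1.0 / norm_type)
        # Scale gradients in place if the global norm exceeds max_norm.
        clip_coeff = max_norm / (total_norm + 1.0e-6)
        if clip_coeff < 1.0:
            for g in grads:
                g.mul_(clip_coeff)
        return total_norm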
megatron/optimizer/optimizer.py
...
...
@@ -103,7 +103,7 @@ class MegatronOptimizer(ABC):
        # >>>
        # pax(0, {
        #     "clip_grad" : clip_grad,
        #     # "params": [ (p.tensor_model_parallel, tp(p)) for p in params ],
        #     "grads" : [ p.grad for p in params ],
        # })
        # <<<
...
...
@@ -314,11 +314,12 @@ class BaseFloat16Optimizer(MegatronOptimizer):
            return False, None, None

        # >>>
        pax(0, {
            "[LOC]" : "[** BEFORE CLIP **]",
            "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
            "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
        })
        # pax(0, {
        #     "[LOC]" : "[** BEFORE CLIP **]",
        #     "clip_grad" : self.clip_grad,
        #     # "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
        #     "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
        # })
        # <<<

        # Clip the main gradients.
...
...
@@ -328,6 +329,14 @@ class BaseFloat16Optimizer(MegatronOptimizer):
            grad_norm = self.clip_grad_norm(self.clip_grad)
        timers('optimizer-clip-main-grad').stop()

        # >>>
        pax(1, {
            "[LOC]" : "[** BEFORE NONZERO **]",
            # "param_group / params" : [ p for g in self.optimizer.param_groups for p in g["params"] ],
            "param_group / grads" : [ p.grad for g in self.optimizer.param_groups for p in g["params"] ],
        })
        # <<<

        # count the zeros in the grads
        num_zeros_in_grad = self.count_zeros() if \
            self.log_num_zeros_in_grad else None
...
...
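The live `pax(...)` calls added in optimizer.py are debug probes from the author's own tooling and are not defined anywhere in this commit. The stand-in below is only a guess at the pattern they suggest (dump a labeled dict of values, then stop the run); its name and signature are hypothetical, not the real API:

    import sys
    import torch.distributed as dist

    def pax_stub(rank, entries):
        # Hypothetical debug probe: print the labeled entries from one rank,
        # then stop every rank so the dump can be inspected.
        if not dist.is_initialized() or dist.get_rank() == rank:
            for key, value in entries.items():
                print(f"{key} : {value}")
        sys.exit(0)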