OpenDAS / Megatron-LM · Commits

Commit e68ea218
Authored Dec 29, 2020 by mohammad

    further refactor, matching old results

Parent: b0a3fdfe

Showing 1 changed file with 11 additions and 11 deletions.

megatron/optimizer/optimizer.py  (+11 / -11)
@@ -68,8 +68,8 @@ def _clip_grad_norm(parameters, max_norm, norm_type=2):
     # - grad should not be none
     # - parameter should not be shared
     # - should not be a replica due to tensor model parallelism
-    params_with_grads = []
-    params_for_norm = []
+    grads = []
+    grads_for_norm = []
     for param in parameters:
         # Make sure the grads are in fp32
         assert param.grad.type() == 'torch.cuda.FloatTensor'
@@ -77,10 +77,11 @@ def _clip_grad_norm(parameters, max_norm, norm_type=2):
         is_not_shared = not hasattr(param, 'shared') or not param.shared
         is_not_tp_duplicate = param.tensor_model_parallel or \
             (mpu.get_tensor_model_parallel_rank() == 0)
+        grad = param.grad.detach()
         if grad_not_none:
-            params_with_grads.append(param)
+            grads.append(grad)
         if grad_not_none and is_not_shared and is_not_tp_duplicate:
-            params_for_norm.append(param)
+            grads_for_norm.append(grad)

     # Norm parameters.
     max_norm = float(max_norm)
@@ -89,8 +90,7 @@ def _clip_grad_norm(parameters, max_norm, norm_type=2):

     # Calculate norm.
     if norm_type == inf:
-        total_norm = max(param.grad.detach().abs().max()
-                         for param in params_for_norm)
+        total_norm = max(grad.abs().max() for grad in grads_for_norm)
         total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
         # Take max across all model-parallel GPUs.
         torch.distributed.all_reduce(total_norm_cuda,
@@ -99,9 +99,9 @@ def _clip_grad_norm(parameters, max_norm, norm_type=2):
         total_norm = total_norm_cuda[0].item()

     else:
-        for param in params_for_norm:
-            param_norm = torch.norm(param.grad.detach(), norm_type)
-            total_norm += param_norm.item() ** norm_type
+        for grad in grads_for_norm:
+            grad_norm = torch.norm(grad, norm_type)
+            total_norm += grad_norm.item() ** norm_type
         # Sum across all model-parallel GPUs.
         total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
         torch.distributed.all_reduce(total_norm_cuda,
@@ -112,8 +112,8 @@ def _clip_grad_norm(parameters, max_norm, norm_type=2):

     # Scale.
     clip_coef = max_norm / (total_norm + 1.0e-6)
     if clip_coef < 1.0:
-        for param in params_with_grads:
-            param.grad.detach().mul_(clip_coef)
+        for grad in grads:
+            grad.mul_(clip_coef)

     return total_norm
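The hunks above all touch _clip_grad_norm in megatron/optimizer/optimizer.py: instead of keeping lists of parameters and calling param.grad.detach() again inside the norm and scale loops, the function now detaches each gradient once and carries the tensors themselves (grads and grads_for_norm). Below is a minimal, single-process sketch of that pattern, for illustration only; the name clip_grad_norm_sketch is made up for this example, and the fp32 assertion, the mpu shared/tensor-model-parallel checks, and the torch.distributed.all_reduce across the model-parallel group are all omitted, so this is not the Megatron-LM implementation.

    from math import inf

    import torch


    def clip_grad_norm_sketch(parameters, max_norm, norm_type=2):
        # Collect detached gradient tensors once, mirroring the refactor above.
        # A detached tensor shares storage with param.grad, so scaling it in
        # place also scales the gradient the optimizer will see.
        grads = []
        grads_for_norm = []
        for param in parameters:
            if param.grad is None:
                continue
            grad = param.grad.detach()
            grads.append(grad)
            # Megatron-LM additionally skips shared parameters and
            # tensor-model-parallel duplicates here; without model
            # parallelism every gradient counts toward the norm.
            grads_for_norm.append(grad)

        max_norm = float(max_norm)
        norm_type = float(norm_type)

        # Calculate norm (no model-parallel all_reduce in this sketch).
        if norm_type == inf:
            total_norm = max(grad.abs().max().item() for grad in grads_for_norm)
        else:
            total_norm = 0.0
            for grad in grads_for_norm:
                grad_norm = torch.norm(grad, norm_type)
                total_norm += grad_norm.item() ** norm_type
            total_norm = total_norm ** (1.0 / norm_type)

        # Scale in place if the norm exceeds the threshold.
        clip_coef = max_norm / (total_norm + 1.0e-6)
        if clip_coef < 1.0:
            for grad in grads:
                grad.mul_(clip_coef)
        return total_norm


    # Example usage (hypothetical toy model; CPU is fine for the sketch):
    model = torch.nn.Linear(4, 2)
    model(torch.randn(8, 4)).sum().backward()
    grad_norm = clip_grad_norm_sketch(model.parameters(), max_norm=1.0)

Per the commit message, the refactor is expected to match the old results: because a detached tensor shares storage with param.grad, the in-place mul_ on the collected grads still scales the gradients the optimizer sees; only where the detach happens has moved.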