OpenDAS / Megatron-LM · commit e858d338

Authored Mar 23, 2022 by Lawrence McAfee

    moved 'get_main_grads_for_grad_norm()'; for fp16/fp32 sharing.

Parent: 11581195

Showing 1 changed file with 19 additions and 32 deletions:

    megatron/optimizer/optimizer.py  (+19, -32)
megatron/optimizer/optimizer.py  (view file @ e858d338)
@@ -93,6 +93,7 @@ class MegatronOptimizer(ABC):
         assert self.params_have_main_grad, \
             "use of contiguous buffer requires that params have main grad"
 
     def get_parameters(self):
         params = []
         for param_group in self.optimizer.param_groups:
@@ -100,9 +101,25 @@ class MegatronOptimizer(ABC):
                 params.append(param)
         return params
 
-    @abstractmethod
     def get_main_grads_for_grad_norm(self):
-        pass
+
+        # Filter parameters based on:
+        #   - grad should not be none
+        #   - parameter should not be shared
+        #   - should not be a replica due to tensor model parallelism
+        params = self.get_parameters()
+        grads_for_norm = []
+        for param in params:
+            grad = param.grad
+            grad_not_none = grad is not None
+            is_not_shared = param_is_not_shared(param)
+            is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
+            if grad_not_none and is_not_shared and is_not_tp_duplicate:
+                grads_for_norm.append(grad)
+
+        return grads_for_norm
 
     def get_model_parallel_group(self):
         '''Default returned here, but the distributed optimizer overrides this.'''
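The hunk above is the substance of the move: get_main_grads_for_grad_norm() goes from an abstract stub on MegatronOptimizer to a concrete base-class method. The sketch below is a minimal, self-contained restatement of that filter for illustration only; it is not the Megatron API, and the *_stub predicates are hypothetical stand-ins for param_is_not_shared() and param_is_not_tensor_parallel_duplicate(), which in the real code consult shared-parameter flags and tensor-model-parallel state.

import torch

def param_is_not_shared_stub(param):
    # Hypothetical stand-in: treat a param as shared only if it carries a
    # truthy 'shared' attribute.
    return not getattr(param, 'shared', False)

def param_is_not_tp_duplicate_stub(param):
    # Hypothetical stand-in: in this single-process sketch nothing is a
    # tensor-parallel replica unless explicitly flagged.
    return not getattr(param, 'tensor_parallel_duplicate', False)

def get_main_grads_for_grad_norm(params):
    # Keep only grads that should contribute to the global grad norm:
    #   - grad is not None
    #   - parameter is not shared
    #   - parameter is not a tensor-model-parallel replica
    grads_for_norm = []
    for param in params:
        grad = param.grad
        if grad is not None \
                and param_is_not_shared_stub(param) \
                and param_is_not_tp_duplicate_stub(param):
            grads_for_norm.append(grad)
    return grads_for_norm

# Tiny usage example.
p1 = torch.nn.Parameter(torch.ones(2)); p1.grad = torch.ones(2)
p2 = torch.nn.Parameter(torch.ones(2))                      # no grad -> dropped
p3 = torch.nn.Parameter(torch.ones(2)); p3.grad = torch.ones(2); p3.shared = True
print(len(get_main_grads_for_grad_norm([p1, p2, p3])))      # -> 1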
@@ -544,36 +561,6 @@ class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
             _zero_grad_group_helper(group, set_to_none)
 
-    def get_main_grads_for_grad_norm(self):
-
-        # Filter parameters based on:
-        #   - grad should not be none
-        #   - parameter should not be shared
-        #   - should not be a replica due to tensor model parallelism
-        params = self.get_parameters()
-        # grads = []
-        grads_for_norm = []
-        for param in params:
-            grad = param.grad
-            grad_not_none = grad is not None
-            is_not_shared = param_is_not_shared(param)
-            is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
-            # if grad_not_none:
-            #     grad = param.grad.detach()
-            # if grad_not_none:
-            #     # Make sure the grads are in fp32
-            #     assert param.grad.type() == 'torch.cuda.FloatTensor'
-            #     grads.append(grad)
-            if grad_not_none and is_not_shared and is_not_tp_duplicate:
-                grads_for_norm.append(grad)
-
-        # pax(0, {"grads_for_norm": [
-        #     str(tuple(g.shape))
-        #     for g in grads_for_norm
-        # ]})
-
-        return grads_for_norm
 
     def _collect_main_grad_data_for_unscaling(self):
         main_grads = []
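Taken together, the hunks delete the fp16-specific copy of the method and leave a single implementation on the MegatronOptimizer base class, which is the "fp16/fp32 sharing" named in the commit message: both the float16 and fp32 optimizer wrappers can now reach the same gradient-filtering path when computing the grad norm. Below is a hedged sketch of that sharing pattern; the class names and the grad_norm() helper are hypothetical illustrations, not Megatron's MegatronOptimizer or Float16OptimizerWithFloat16Params.

import torch

class BaseOptimizerSketch:
    # Hypothetical stand-in for a base optimizer wrapper that owns a torch
    # optimizer and exposes the shared gradient-filtering helper.
    def __init__(self, optimizer):
        self.optimizer = optimizer

    def get_parameters(self):
        params = []
        for param_group in self.optimizer.param_groups:
            for param in param_group['params']:
                params.append(param)
        return params

    def get_main_grads_for_grad_norm(self):
        # Simplified filter: the shared-parameter and tensor-parallel checks
        # from the real method are omitted in this single-process sketch.
        return [p.grad for p in self.get_parameters() if p.grad is not None]

    def grad_norm(self):
        # Hypothetical helper: 2-norm over the filtered grads.
        grads = self.get_main_grads_for_grad_norm()
        return torch.norm(torch.stack([torch.norm(g) for g in grads]))

class Fp32OptimizerSketch(BaseOptimizerSketch):
    pass  # inherits the shared helper unchanged

class Fp16OptimizerSketch(BaseOptimizerSketch):
    pass  # a real fp16 wrapper would filter fp32 main grads; same helper applies

# Usage: both subclasses go through the one base-class implementation.
model = torch.nn.Linear(4, 4)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
model(torch.randn(2, 4)).sum().backward()
print(Fp32OptimizerSketch(opt).grad_norm())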