OpenDAS / Megatron-LM

Commit 6f2bff5c, authored Jul 16, 2021 by Lawrence McAfee

    (conditionally) release grad/main_grad memory in copy_model_grads_to_main_grads, after copy

Parent: 90e0a0dd
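The change threads args.use_contiguous_buffers_in_ddp through the optimizer wrappers so that, once the model-side gradients have been copied into the fp32 main gradients, their memory can be released right away, except when the DDP wrapper owns a contiguous gradient buffer whose memory persists across iterations and must not be dropped. Below is a minimal sketch of that pattern with hypothetical names (flat model_param/main_param pairs rather than Megatron's actual parameter groups); it is an illustration of the idea, not the code in this commit.

    def copy_and_release(model_main_pairs, use_contiguous_buffers_in_ddp):
        """Sketch: copy model-side grads into fp32 main grads, then drop the
        model-side references that are safe to release."""
        for model_param, main_param in model_main_pairs:
            if getattr(model_param, 'main_grad', None) is not None:
                # Gradient was accumulated into an fp32 main_grad buffer.
                main_param.grad = model_param.main_grad.float()
            elif model_param.grad is not None:
                main_param.grad = model_param.grad.float()

            # The model-side grad is no longer needed once the fp32 copy exists.
            model_param.grad = None

            # With contiguous DDP buffers, main_grad is a slice of a shared,
            # persistent buffer and must be kept for the next iteration.
            if not use_contiguous_buffers_in_ddp:
                model_param.main_grad = None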
Showing 2 changed files, with 32 additions and 6 deletions:

    megatron/optimizer/__init__.py     +3   -1
    megatron/optimizer/optimizer.py    +29  -5
megatron/optimizer/__init__.py (view file @ 6f2bff5c)

@@ -100,10 +100,12 @@ def get_megatron_optimizer(model):
                                                  args.clip_grad,
                                                  args.log_num_zeros_in_grad,
                                                  params_have_main_grad,
+                                                 args.use_contiguous_buffers_in_ddp,
                                                  args.bf16,
                                                  grad_scaler)

     # FP32.
     return FP32Optimizer(optimizer, args.clip_grad,
                          args.log_num_zeros_in_grad,
-                         params_have_main_grad)
+                         params_have_main_grad,
+                         args.use_contiguous_buffers_in_ddp)
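Note that this is a signature change for both optimizer wrappers, so any code that constructs them directly must now pass the extra flag. A hypothetical direct construction of FP32Optimizer is shown below; the toy model, hyperparameters, and flag values are placeholders, not taken from the commit, and a CUDA device is assumed since the wrapper allocates its scale tensor on the GPU.

    import torch
    from megatron.optimizer.optimizer import FP32Optimizer

    model = torch.nn.Linear(16, 16).cuda()   # placeholder model
    base_optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    optimizer = FP32Optimizer(base_optimizer,
                              clip_grad=1.0,
                              log_num_zeros_in_grad=False,
                              params_have_main_grad=False,
                              use_contiguous_buffers_in_ddp=False)

With params_have_main_grad=False, the new flag has no effect in FP32Optimizer.step; it only matters when the DDP wrapper accumulates gradients into param.main_grad.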
megatron/optimizer/optimizer.py (view file @ 6f2bff5c)

@@ -68,7 +68,9 @@ class MegatronOptimizer(ABC):
     def __init__(self, optimizer, clip_grad,
                  log_num_zeros_in_grad,
-                 params_have_main_grad):
+                 params_have_main_grad,
+                 use_contiguous_buffers_in_ddp):

         """Input optimizer is the base optimizer for example Adam."""
         self.optimizer = optimizer
         assert self.optimizer, 'no optimizer is provided.'

@@ -76,6 +78,7 @@ class MegatronOptimizer(ABC):
         self.clip_grad = clip_grad
         self.log_num_zeros_in_grad = log_num_zeros_in_grad
         self.params_have_main_grad = params_have_main_grad
+        self.use_contiguous_buffers_in_ddp = use_contiguous_buffers_in_ddp

     def get_parameters(self):

@@ -187,11 +190,12 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
     """

     def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
-                 params_have_main_grad, bf16, grad_scaler):
+                 params_have_main_grad, use_contiguous_buffers_in_ddp,
+                 bf16, grad_scaler):

         super(Float16OptimizerWithFloat16Params, self).__init__(
             optimizer, clip_grad, log_num_zeros_in_grad,
-            params_have_main_grad)
+            params_have_main_grad, use_contiguous_buffers_in_ddp)

         self.bf16 = bf16
         self.grad_scaler = grad_scaler

@@ -305,12 +309,25 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
                 else:
                     if model_param.grad is not None:
                         main_param.grad = model_param.grad.float()

+                # Safe to deallocate model's grad/main_grad after copying.
+                # (If using contiguous buffers, main_grad's memory should
+                # persist and therefore should not be deallocated.)
+                model_param.grad = None
+                if not self.use_contiguous_buffers_in_ddp:
+                    model_param.main_grad = None
+
         # For fp32 grads, we need to reset the grads to main grad.
         if self.params_have_main_grad:
             for model_group in self.fp32_from_fp32_groups:
                 for model_param in model_group:
                     model_param.grad = model_param.main_grad

+                    # Safe to de-reference model's main_grad after copying.
+                    # (If using contiguous buffers, main_grad's memory should
+                    # persist and therefore should not be deallocated.)
+                    if not self.use_contiguous_buffers_in_ddp:
+                        model_param.main_grad = None

     def _unscale_main_grads_and_check_for_nan(self):
         main_grads = []

@@ -464,11 +481,12 @@ class FP32Optimizer(MegatronOptimizer):
     def __init__(self, optimizer, clip_grad,
                  log_num_zeros_in_grad,
-                 params_have_main_grad):
+                 params_have_main_grad,
+                 use_contiguous_buffers_in_ddp):

         super(FP32Optimizer, self).__init__(
             optimizer, clip_grad,
-            log_num_zeros_in_grad, params_have_main_grad)
+            log_num_zeros_in_grad, params_have_main_grad,
+            use_contiguous_buffers_in_ddp)

         self._scale = torch.cuda.FloatTensor([1.0])

@@ -495,6 +513,12 @@ class FP32Optimizer(MegatronOptimizer):
                 for param in param_group['params']:
                     param.grad = param.main_grad

+                    # Safe to de-reference model's main_grad after copying.
+                    # (If using contiguous buffers, main_grad's memory should
+                    # persist and therefore should not be deallocated.)
+                    if not self.use_contiguous_buffers_in_ddp:
+                        param.main_grad = None
+
         # Clip gradients.
         grad_norm = None
         if self.clip_grad > 0.0:
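The comments added above hinge on one property of the contiguous-buffer path: each parameter's main_grad is a view into a single preallocated gradient buffer that the DDP wrapper reuses every iteration, so dropping the per-parameter references would free nothing and would leave the next accumulation step without its views. The snippet below is a minimal, self-contained illustration of that layout; the buffer construction and shapes are illustrative only, not Megatron's actual DDP code.

    import torch

    params = [torch.nn.Parameter(torch.randn(4, 4)) for _ in range(3)]

    # One contiguous fp32 buffer covering all gradients.
    total_numel = sum(p.numel() for p in params)
    grad_buffer = torch.zeros(total_numel, dtype=torch.float32)

    # Each parameter's main_grad is a view into the shared buffer.
    offset = 0
    for p in params:
        p.main_grad = grad_buffer[offset:offset + p.numel()].view_as(p)
        offset += p.numel()

    # Setting p.main_grad = None here would only drop the view; the underlying
    # buffer persists and is reused next iteration, which is why the commit
    # skips the release when use_contiguous_buffers_in_ddp is set. Without
    # contiguous buffers, each main_grad owns its own storage, so releasing
    # the reference actually frees memory.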