OpenDAS / Megatron-LM · Commits

Commit 2c3cb9fc
Authored Mar 24, 2022 by Lawrence McAfee

    many edits; working towards first draft.

Parent: 867105c2
Showing 2 changed files with 456 additions and 271 deletions.
megatron/optimizer/distrib_optimizer.py    +426  -259
megatron/optimizer/optimizer.py            +30   -12
megatron/optimizer/distrib_optimizer.py
This diff is collapsed.
megatron/optimizer/optimizer.py
@@ -323,6 +323,22 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
         self._copy_model_params_to_main_params()

+    # >>>
+    # def zero_grad(self, set_to_none=True):
+    #     """We only need to zero the model related parameters, i.e.,
+    #     float16_groups & fp32_from_fp32_groups. We additionally zero
+    #     fp32_from_float16_groups as a memory optimization to reduce
+    #     fragmentation; in the case of set_to_none==True, the space
+    #     used by this field can be safely deallocated at this point."""
+    #     for group in self.float16_groups:
+    #         _zero_grad_group_helper(group, set_to_none)
+    #     for group in self.fp32_from_float16_groups:
+    #         _zero_grad_group_helper(group, set_to_none)
+    #     for group in self.fp32_from_fp32_groups:
+    #         _zero_grad_group_helper(group, set_to_none)
+    # <<<

     def _unscale_main_grads_and_check_for_nan(self):
         # Collect main grads.
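The docstring in the commented-out block above explains the set_to_none choice: dropping the gradient tensor, rather than zeroing it in place, lets the allocator reclaim that memory and reduces fragmentation. A minimal PyTorch illustration of the difference (not part of this commit; the parameter p is hypothetical):

import torch

p = torch.nn.Parameter(torch.empty(1024, 1024))
p.grad = torch.randn_like(p)   # gradient buffer is allocated

# Zeroing in place keeps the buffer allocated; only its values change.
p.grad.zero_()

# Setting to None releases the buffer; the next backward() re-allocates it.
p.grad = None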
@@ -552,18 +568,20 @@ class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
         self.optimizer.load_state_dict(self.optimizer.state_dict())

-    def zero_grad(self, set_to_none=True):
-        """We only need to zero the model related parameters, i.e.,
-        float16_groups & fp32_from_fp32_groups. We additionally zero
-        fp32_from_float16_groups as a memory optimization to reduce
-        fragmentation; in the case of set_to_none==True, the space
-        used by this field can be safely deallocated at this point."""
-        for group in self.float16_groups:
-            _zero_grad_group_helper(group, set_to_none)
-        for group in self.fp32_from_float16_groups:
-            _zero_grad_group_helper(group, set_to_none)
-        for group in self.fp32_from_fp32_groups:
-            _zero_grad_group_helper(group, set_to_none)
+    # >>>
+    # def zero_grad(self, set_to_none=True):
+    #     """We only need to zero the model related parameters, i.e.,
+    #     float16_groups & fp32_from_fp32_groups. We additionally zero
+    #     fp32_from_float16_groups as a memory optimization to reduce
+    #     fragmentation; in the case of set_to_none==True, the space
+    #     used by this field can be safely deallocated at this point."""
+    #     for group in self.float16_groups:
+    #         _zero_grad_group_helper(group, set_to_none)
+    #     for group in self.fp32_from_float16_groups:
+    #         _zero_grad_group_helper(group, set_to_none)
+    #     for group in self.fp32_from_fp32_groups:
+    #         _zero_grad_group_helper(group, set_to_none)
+    # <<<

     def _collect_main_grad_data_for_unscaling(self):
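Both versions of zero_grad delegate to _zero_grad_group_helper, which is defined elsewhere in megatron/optimizer/optimizer.py and does not appear in this diff. A sketch of the conventional pattern such a helper follows (mirroring torch.optim.Optimizer.zero_grad; the body below is an assumption, not the file's exact code):

def _zero_grad_group_helper(group, set_to_none):
    # Zero (or drop) the .grad of every parameter in one parameter group.
    # Sketch only: assumes the same semantics as torch.optim's zero_grad.
    for param in group:
        if param.grad is not None:
            if set_to_none:
                param.grad = None
            else:
                # Detach the grad from any autograd graph before zeroing in place.
                if param.grad.grad_fn is not None:
                    param.grad.detach_()
                else:
                    param.grad.requires_grad_(False)
                param.grad.zero_()

The commit calls this helper once per group for float16_groups, fp32_from_float16_groups, and fp32_from_fp32_groups, so all three gradient copies used by the mixed-precision optimizer are cleared together.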