OpenDAS / Megatron-LM / Commits / 64b9d28c
"src/vscode:/vscode.git/clone" did not exist on "c77ac246c1650f44f0b39b663c14207e7e669bdd"
Commit 64b9d28c, authored Mar 09, 2022 by Lawrence McAfee
renamed reduce_grads/gather_params -> reduce_model_grads/gather_model_params
parent d58d1762
Showing 3 changed files with 17 additions and 24 deletions:
megatron/optimizer/distrib_optimizer.py    +8  -18
megatron/optimizer/optimizer.py            +4   -3
megatron/training.py                       +5   -3
megatron/optimizer/distrib_optimizer.py (view file @ 64b9d28c)
...

@@ -381,21 +381,11 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
         return gbuf_view_items

-    # def reduce_grads(self, model):
-    def reduce_grads(self, args, timers):
-        # >>>
-        # from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
-        # from megatron import get_args
-        # from megatron import get_timers
-        # from megatron.model import DistributedDataParallel as LocalDDP
-        # from megatron.model import Float16Module
-        # from megatron.utils import unwrap_model
-        # args = get_args()
-        # timers = get_timers()
-        # <<<
+    def reduce_model_grads(self, args, timers):
         '''Note: this is a different order of reduction, versus the non-
         distributed optimizer, which reduces: 1) all grads, 2) embedding
         grads.
         '''

         # All-reduce embedding grads.
         timers('backward-embedding-all-reduce').start()

...

@@ -420,7 +410,7 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
         timers('backward-params-all-reduce').stop()

-    def gather_params(self, args, timers, ITERATION):
+    def gather_model_params(self, args, timers, ITERATION):

         # >>>
         # timers = get_timers()

...

@@ -454,8 +444,8 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
         # pax(0, {"gbuf_view_items": gbuf_view_items})

         # >>>
-        # self.debug_main(ITERATION, "after/inside gather_params.", 0)
-        # self.debug_model(ITERATION, "after/inside gather_params.", 0)
+        # self.debug_main(ITERATION, "after/inside gather_model_params.", 0)
+        # self.debug_model(ITERATION, "after/inside gather_model_params.", 0)
         # if ITERATION == 2:
         #     pax(1, {

...
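Note: the docstring in the hunk above describes a reversed reduction order for the distributed optimizer: embedding grads are all-reduced first, then the remaining parameter grads are reduced (cf. the 'backward-embedding-all-reduce' and 'backward-params-all-reduce' timer sections). A minimal sketch of that ordering, assuming placeholder names (embedding_grads, grad_buffer, data_parallel_group) rather than the actual Megatron-LM internals:

# Minimal sketch of the reduction order described above; placeholder names,
# not the real Megatron-LM implementation.
import torch.distributed as dist

def reduce_model_grads_sketch(embedding_grads, grad_buffer, data_parallel_group):
    # 1) All-reduce the shared embedding grads first
    #    (mirrors the 'backward-embedding-all-reduce' timer section).
    for grad in embedding_grads:
        dist.all_reduce(grad, group=data_parallel_group)

    # 2) Then reduce the main (flat) gradient buffer
    #    (mirrors the 'backward-params-all-reduce' timer section).
    dist.all_reduce(grad_buffer, group=data_parallel_group)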
megatron/optimizer/optimizer.py (view file @ 64b9d28c)
...

@@ -180,7 +180,9 @@ class MegatronOptimizer(ABC):
     def step(self, args, timers):
         pass

-    def gather_params(self, args, timers, ITERATION):
+    def gather_model_params(self, args, timers, ITERATION):
+        '''For the case of a non-distributed-optimizer, there is nothing to
+        do here.'''
         pass

     def allreduce_word_embedding_grads(self):

...

@@ -236,8 +238,7 @@ class MegatronOptimizer(ABC):
         self.allreduce_word_embedding_grads()
         self.allreduce_position_embedding_grads()

-    # def reduce_grads(self, model):
-    def reduce_grads(self, args, timers):
+    def reduce_model_grads(self, args, timers):

         # pax(0, {
         #     "*models" : self.models,

...
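The hunks above leave gather_model_params as a no-op on the base class, so train_step can call the same hooks whether or not the distributed optimizer is in use. A stripped-down, hypothetical illustration of that hook pattern (names and bodies simplified, not the actual class):

# Hypothetical, simplified illustration of the optimizer hook pattern above.
from abc import ABC, abstractmethod

class OptimizerHooksSketch(ABC):
    @abstractmethod
    def step(self, args, timers):
        ...

    def reduce_model_grads(self, args, timers):
        # Base-class version: all-reduce the grads shared across model-parallel
        # ranks (embedding grads); body elided in this sketch.
        ...

    def gather_model_params(self, args, timers, iteration):
        # Nothing to gather for a non-distributed optimizer; the distributed
        # optimizer overrides this to collect updated parameter shards.
        pass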
megatron/training.py (view file @ 64b9d28c)
...

@@ -437,7 +437,7 @@ def train_step(forward_step_func, data_iterator,
     # >>>
     # Reduce gradients.
-    optimizer.reduce_grads(args, timers)
+    optimizer.reduce_model_grads(args, timers)
     # <<<

     # Update parameters.

...

@@ -447,7 +447,7 @@ def train_step(forward_step_func, data_iterator,
     # >>>
     # Gather params.
-    optimizer.gather_params(args, timers, ITERATION)
+    optimizer.gather_model_params(args, timers, ITERATION)
     # <<<

     # >>>

...

@@ -464,7 +464,9 @@ def train_step(forward_step_func, data_iterator,
     else:
         skipped_iter = 1

-    # Empty unused memory
+    # >>>
+    # Empty unused memory.
+    # <<<
     if args.empty_unused_memory_level >= 2:
         torch.cuda.empty_cache()

...
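Taken together, the renamed hooks bracket the optimizer step inside train_step(): reduce the model grads, step the optimizer, then gather the model params. A rough outline of just that sequence (everything else in train_step, including timers, skipped-iteration bookkeeping, and memory cleanup, is elided):

# Rough outline of the call order in train_step() after this commit;
# surrounding logic is elided and return-value handling is simplified.
def train_step_outline(optimizer, args, timers, ITERATION):
    # 1) Reduce gradients across data-parallel ranks.
    optimizer.reduce_model_grads(args, timers)

    # 2) Update parameters.
    optimizer.step(args, timers)

    # 3) Gather updated parameters back to the model.
    optimizer.gather_model_params(args, timers, ITERATION)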