OpenDAS / Megatron-LM · Commits

Commit eaa0c1df, authored Feb 15, 2022 by Lawrence McAfee

updated zero_grad
parent 33626179
Showing 1 changed file with 31 additions and 11 deletions.
megatron/optimizer/optimizer.py  (+31, -11)  · view file @ eaa0c1df
...
@@ -1143,11 +1143,11 @@ class Float16DistributedOptimizer(BaseFloat16Optimizer):
        self.optimizer.load_state_dict(self.optimizer.state_dict())

        # >>>
        pax(0, {
            "model_gbuf_shards" : self.model_gbuf_shards,
            "opt_group_shards" : self.opt_group_shards,
            "main_param_shards" : self.main_param_shards,
        })
        # pax(0, {
        #     "model_gbuf_shards" : self.model_gbuf_shards,
        #     "opt_group_shards" : self.opt_group_shards,
        #     "main_param_shards" : self.main_param_shards,
        # })
        # <<<

    # def get_loss_scale(self):
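Note: pax is not defined in this file and is not part of this diff; it looks like an ad-hoc debug helper used throughout this work-in-progress branch to dump a dict of values and halt. A purely hypothetical stand-in with the same call shape, useful only for following the control flow above (name, signature, and behavior are assumptions, not the author's implementation):

import torch

def pax(rank, values):
    # Hypothetical stand-in for the debug helper used in this branch:
    # print the given values on one rank, then stop so they can be inspected.
    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == rank:
        for key, value in values.items():
            print(f"{key} = {value}")
    raise SystemExit("pax: stopping after debug dump.")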
...
@@ -1162,21 +1162,34 @@ class Float16DistributedOptimizer(BaseFloat16Optimizer):
    def state_dict(self):
        raise Exception("hi.")

    # def zero_grad(self, set_to_none=True):
    #     params = []
    #     for model_param_group in self.model_param_groups:
    #         params.extend(model_param_group["offset_map"].keys())
    #     for main_group in self.optimizer.param_groups:
    #         params.extend(main_group["params"])
    #     # _zero_grad_group_helper(params, set_to_none)
    #     _zero_grad_group_helper(params, set_to_none = False)
    #     # pax(0, {
    #     #     "model_param_groups" : self.model_param_groups,
    #     #     "params" : params,
    #     # })
    def zero_grad(self, set_to_none=True):
        params = []
        for model_param_group in self.model_param_groups:
            params.extend(model_param_group["offset_map"].keys())
        for model in self.models:
            for dtype, param_map in model._grad_buffer_param_index_map.items():
                params.extend(param_map.keys())
        for main_group in self.optimizer.param_groups:
            params.extend(main_group["params"])
        # _zero_grad_group_helper(params, set_to_none)
        _zero_grad_group_helper(params, set_to_none = False)
        # pax(0, {
        #     "model_param_groups" : self.model_param_groups,
        #     "params" : params,
        # })
        # pax(0, {"params": params})

    # def reduce_gradients(self, model):
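For context, the updated zero_grad delegates the actual zeroing to _zero_grad_group_helper, which is defined elsewhere in megatron/optimizer/optimizer.py and is not shown in this diff. A minimal sketch of what that helper does in upstream Megatron-LM (adapted from torch.optim; treat the exact body as an assumption for this branch):

def _zero_grad_group_helper(group, set_to_none):
    # Zero the gradient of each parameter in the group, either by dropping
    # the tensor (set_to_none=True) or by zeroing it in place.
    for param in group:
        if param.grad is not None:
            if set_to_none:
                param.grad = None
            else:
                if param.grad.grad_fn is not None:
                    param.grad.detach_()
                else:
                    param.grad.requires_grad_(False)
                param.grad.zero_()

Calling it with set_to_none = False, as in the new zero_grad above, zeroes the tensors in place, presumably because the gradients are views into the contiguous grad buffer and must stay allocated rather than be dropped.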
...
@@ -1289,6 +1302,13 @@ class Float16DistributedOptimizer(BaseFloat16Optimizer):
        # ** only contiguous grad buffer supported, for now [ TEMPORARY ] **
        assert args.use_contiguous_buffers_in_local_ddp

        for model_index, model in enumerate(self.models):
            pax(0, {
                "model_index" : model_index,
                "model" : model,
            })

        world_sizes = []
        for r in self.world_shard_infos:
            # world_sizes.append(sum(g["size"] for g in r))
...