OpenDAS / Megatron-LM · Commits

Commit ac5ef637
Authored Feb 18, 2022 by Lawrence McAfee

    small fixes; training & learning!

Parent: a4f41882
Changes: 1 changed file, with 18 additions and 23 deletions.

megatron/optimizer/optimizer.py  (+18, -23)
...
@@ -1083,35 +1083,31 @@ class Float16DistributedOptimizer(BaseFloat16Optimizer):
# "model_shard" : str(model_shard),
# "model_shard" : str(model_shard),
# })
# })
pax
(
0
,
{
#
pax(0, {
"opt_group_shards"
:
self
.
opt_group_shards
,
#
"opt_group_shards" : self.opt_group_shards,
"main_param_shards"
:
self
.
main_param_shards
,
#
"main_param_shards" : self.main_param_shards,
})
#
})
def
_copy_model_grads_to_main_grads
(
self
):
def
_copy_model_grads_to_main_grads
(
self
):
for
group_index
,
group_shard
in
enumerate
(
self
.
opt_group_shards
):
for
group_index
,
group_shard
in
enumerate
(
self
.
opt_group_shards
):
for
model_param
,
main_shard
in
group_shard
[
"param_map"
].
items
():
for
model_param
,
main_shard
in
group_shard
[
"param_map"
].
items
():
model_index
,
gbuf_
dtype
=
self
.
param_gbuf_map
[
param
]
model_index
,
dtype
=
self
.
param_gbuf_map
[
model_
param
]
model_shard
=
self
.
model_gbuf_shards
\
model_shard
=
self
.
model_gbuf_shards
\
[
model_index
][
gbuf_
dtype
][
"param_map"
][
param
][
"world"
]
[
model_index
][
dtype
][
"param_map"
][
model_
param
][
"
gbuf_
world"
]
assert
main_shard
.
size
==
model_shard
.
size
assert
main_shard
.
size
==
model_shard
.
size
# Copy from DDP's contiguous buffer to main shard's grad.
# Copy from DDP's contiguous buffer to main shard's grad.
model_grad_tensor
=
\
model_grad
=
self
.
models
[
model_index
].
_grad_buffers
[
dtype
].
data
self
.
models
[
model_index
].
_grad_buffers
[
gbuf_dtype
].
data
main_grad
=
self
.
main_param_shards
[
group_index
].
grad
main_grad_tensor
=
\
self
.
main_param_shards
[
group_index
].
grad
# Copy sub-range within tensor.
# Copy sub-range within tensor.
model_grad_view
=
\
model_view
=
model_grad
[
model_shard
.
start
:
model_shard
.
end
]
model_grad_tensor
[
model_shard
.
start
:
model_shard
.
end
]
main_view
=
main_grad
[
main_shard
.
start
:
main_shard
.
end
]
main_grad_view
=
\
main_grad_tensor
[
main_shard
.
start
:
main_shard
.
end
]
main_
grad_
view
.
detach
().
copy_
(
model_
grad_
view
)
main_view
.
detach
().
copy_
(
model_view
)
# pax(0, {
# pax(0, {
# "group_index" : group_index,
# "group_index" : group_index,
...
@@ -1146,22 +1142,21 @@ class Float16DistributedOptimizer(BaseFloat16Optimizer):
     def _copy_main_params_to_model_params(self):
         for group_index, group_shard in enumerate(self.opt_group_shards):
-            for param, main_shard in group_shard["param_map"].items():
+            for model_param, main_shard in group_shard["param_map"].items():

-                model_index, gbuf_dtype = self.param_gbuf_map[param]
+                model_index, dtype = self.param_gbuf_map[model_param]
                 model_shard = self.model_gbuf_shards \
-                    [model_index][gbuf_dtype]["param_map"][param]["world"]
+                    [model_index][dtype]["param_map"][model_param]["gbuf_world"]
                 assert main_shard.size == model_shard.size

                 # Use DDP's contiguous buffer to temporarily hold params.
-                model_tensor = \
-                    self.models[model_index]._grad_buffers[gbuf_dtype].data
-                main_tensor = self.main_param_shards[group_index]
+                model_param = self.models[model_index]._grad_buffers[dtype].data
+                main_param = self.main_param_shards[group_index]

                 # Copy sub-range within tensor.
-                model_view = model_tensor[model_shard.start:model_shard.end]
-                main_view = main_tensor[main_shard.start:main_shard.end]
+                model_view = model_param[model_shard.start:model_shard.end]
+                main_view = main_param[main_shard.start:main_shard.end]
                 model_view.detach().copy_(main_view)
...
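The second hunk applies the same renames to the reverse path: after the optimizer step, `_copy_main_params_to_model_params` copies each updated fp32 main-param shard back into the model's contiguous `_grad_buffers` tensor, which, per the in-code comment, temporarily holds param values. A matching sketch of that downcast copy, continuing the invented names from above:

    import torch

    # The contiguous fp16 buffer is reused as scratch space for the
    # updated parameter values (sizes invented, as before).
    param_buffer = torch.zeros(16, dtype=torch.float16)
    main_param_shard = torch.randn(4, dtype=torch.float32)  # post-step fp32 values

    start, end = 8, 12  # this rank's range within the buffer
    model_view = param_buffer[start:end]
    model_view.detach().copy_(main_param_shard)  # implicit fp32 -> fp16 downcast

Reusing the grad buffer avoids a second persistent fp16 allocation: grads have already been consumed by the time the updated params need to be staged.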