OpenDAS / Megatron-LM · Commits · f6811e28
"vscode:/vscode.git/clone" did not exist on "b8fe2b4837c4ccf9f71c65d86ff336a98f45d1c3"
Commit f6811e28, authored Mar 11, 2022 by Lawrence McAfee

guard 'gather_params()' with 'if update_successful:'

Parent: c64098ef
Showing 3 changed files with 33 additions and 2 deletions (+33 -2):

  megatron/optimizer/distrib_optimizer.py   +8  -1
  megatron/optimizer/optimizer.py           +23 -0
  megatron/training.py                      +2  -1
megatron/optimizer/distrib_optimizer.py @ f6811e28

@@ -177,12 +177,19 @@ class DistributedOptimizer(MixedPrecisionOptimizer):

    def allocate_main_param_shards(cls, opt_group_shards):

        # Allocator method.
        # >>>
        allocate_shard = lambda shard_size, dtype : torch.empty(
            (shard_size,),
            dtype = dtype,
            device = torch.cuda.current_device(),
            requires_grad = True)
        # allocate_shard = lambda shard_size, dtype : torch.zeros(
        #     (shard_size,),
        #     dtype = dtype,
        #     device = torch.cuda.current_device(),
        #     requires_grad = True)
        # <<<

        # Allocate each group's param/grad shard.
        for group_index, group_shard in enumerate(opt_group_shards):
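A note on the allocator change above: torch.empty skips the zero-fill that torch.zeros performs, which is only safe if the shard is fully initialized afterwards (presumably by copying values in from the model parameters). The sketch below is illustrative only and not Megatron's code; the allocate_shard helper, the CPU fallback, and the copy step are assumptions.

import torch

# Illustrative allocator in the spirit of the lambda above, with a CPU
# fallback so the sketch runs without a GPU.
device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"

def allocate_shard(shard_size, dtype):
    # Uninitialized memory: cheaper than torch.zeros since no fill kernel runs.
    return torch.empty((shard_size,), dtype=dtype, device=device,
                       requires_grad=True)

# Hypothetical usage: the shard is defined entirely by the copy below, so the
# garbage values left by torch.empty are never observed.
model_param = torch.randn(1024, device=device)
main_shard = allocate_shard(model_param.numel(), torch.float32)
with torch.no_grad():
    main_shard.copy_(model_param.view(-1).float())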
megatron/optimizer/optimizer.py @ f6811e28

@@ -330,6 +330,25 @@ class MixedPrecisionOptimizer(MegatronOptimizer):

        # Check for nan.
        found_inf_flag = (self.found_inf.item() > 0)

        # >>>
        # if self.grad_scaler.scale <= 131072:
        #     pax(0, {
        #         # "grad_scaler" : self.grad_scaler,
        #         # "found_inf_flag" : found_inf_flag,
        #         "model_params" : [
        #             p
        #             for m in self.models
        #             for p in m.parameters()
        #         ],
        #         "model_grads" : [
        #             p.main_grad
        #             for m in self.models
        #             for p in m.parameters()
        #         ],
        #         # "main_grads" : main_grads,
        #         })
        # <<<

        return found_inf_flag
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
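For context, found_inf_flag above is the result of unscaling the main gradients and checking them for inf/NaN; the commented-out pax(...) block is just a debug dump of the model params and grads. A minimal, self-contained sketch of the unscale-and-check pattern (the helper name and tensor handling here are assumptions, not Megatron's implementation):

import torch

def unscale_and_check_for_inf(grads, loss_scale):
    # Undo the loss scaling in place and flag any non-finite gradient values.
    found_inf = torch.zeros(1)
    inv_scale = 1.0 / loss_scale
    for grad in grads:
        grad.mul_(inv_scale)
        if not torch.isfinite(grad).all():
            found_inf.fill_(1.0)
    # Same style as the hunk above: a float flag converted to a Python bool.
    return found_inf.item() > 0

grads = [torch.randn(4) * 2**16, torch.tensor([float("inf")])]
print(unscale_and_check_for_inf(grads, loss_scale=2**16))  # True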
@@ -411,6 +430,10 @@ class MixedPrecisionOptimizer(MegatronOptimizer):

            found_inf_flag = self._unscale_main_grads_and_check_for_nan()
            timers('optimizer-unscale-and-check-inf').stop()

            # >>>
            # <<<

            # We are done with scaling gradients
            # so we can update the loss scale.
            self.grad_scaler.update(found_inf_flag)
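The grad_scaler.update(found_inf_flag) call at the end of this hunk drives dynamic loss scaling: a step with inf/NaN gradients backs the scale off, and a long enough run of clean steps grows it again. A toy sketch of that policy (class name, constants, and fields are assumptions, not Megatron's grad scaler):

class ToyDynamicScaler:
    """Minimal dynamic loss scaler illustrating what update(found_inf_flag) does."""

    def __init__(self, scale=2.0**16, growth_interval=1000,
                 backoff_factor=0.5, growth_factor=2.0, min_scale=1.0):
        self.scale = scale
        self.growth_interval = growth_interval
        self.backoff_factor = backoff_factor
        self.growth_factor = growth_factor
        self.min_scale = min_scale
        self._clean_steps = 0

    def update(self, found_inf_flag):
        if found_inf_flag:
            # Overflow: shrink the scale and restart the growth window.
            self.scale = max(self.scale * self.backoff_factor, self.min_scale)
            self._clean_steps = 0
        else:
            # Clean step: grow the scale after growth_interval clean steps.
            self._clean_steps += 1
            if self._clean_steps == self.growth_interval:
                self.scale *= self.growth_factor
                self._clean_steps = 0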
megatron/training.py @ f6811e28

@@ -453,7 +453,8 @@ def train_step(forward_step_func, data_iterator,

     # >>>
     # Gather params.
-    optimizer.gather_model_params(args, timers, ITERATION)
+    if update_successful:
+        optimizer.gather_model_params(args, timers, ITERATION)
     # <<<
     # >>>
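This training.py hunk is the point of the commit: when the mixed-precision optimizer skips its update (update_successful is False, typically because the grad scaler saw inf/NaN gradients), the distributed optimizer has nothing new to gather back into the model, so gather_model_params must be skipped as well. A schematic sketch of the surrounding train_step control flow (the step() return values and the gather signature are simplified assumptions):

def train_step_sketch(optimizer, args, timers, iteration):
    # Schematic only: optimizer.step() reports whether parameters were
    # actually updated; a skipped step (e.g. inf/NaN grads under dynamic
    # loss scaling) leaves them untouched.
    update_successful, grad_norm, num_zeros_in_grad = optimizer.step()

    # Guard added by this commit: only gather updated parameter shards from
    # the distributed optimizer back into the model after a real update.
    if update_successful:
        optimizer.gather_model_params(args, timers, iteration)

    return update_successful, grad_norm, num_zeros_in_grad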