OpenDAS / deepspeed · Commits

Unverified commit 29853c3e
less scary overflow notice (#833)

Authored Mar 10, 2021 by Stas Bekman; committed by GitHub on Mar 10, 2021
Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
Parent: dd03cff2

Showing 5 changed files, with 18 additions and 18 deletions:
deepspeed/runtime/fp16/fused_optimizer.py     +4 -4
deepspeed/runtime/fp16/loss_scaler.py         +1 -1
deepspeed/runtime/fp16/unfused_optimizer.py   +8 -8
deepspeed/runtime/zero/stage1.py              +4 -4
deepspeed/runtime/zero/stage2.py              +1 -1
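All five call sites report the same fp16 dynamic loss scaling event: when gradients overflow, the optimizer skips the step and reduces the loss scale before the next iteration. That is routine behavior, not an error, which is the point of the reworded message. Below is a minimal sketch of the mechanism, assuming a simplified halve-on-overflow / grow-after-N-clean-steps policy; SimpleDynamicScaler and its methods are illustrative and are not DeepSpeed's actual LossScaler API.

    # Illustrative sketch only -- not DeepSpeed's LossScaler implementation.
    import torch


    class SimpleDynamicScaler:
        """Halve the loss scale on overflow; grow it after a run of clean steps."""
        def __init__(self, init_scale=2**16, growth_interval=1000):
            self.cur_scale = float(init_scale)
            self.growth_interval = growth_interval
            self.clean_steps = 0

        def has_overflow(self, params):
            # Any inf/NaN gradient means the current scale pushed fp16 out of range.
            return any(p.grad is not None and not torch.isfinite(p.grad).all()
                       for p in params)

        def update_scale(self, overflow):
            prev_scale = self.cur_scale
            if overflow:
                self.cur_scale = max(self.cur_scale / 2.0, 1.0)
                self.clean_steps = 0
                # The message this commit rewords: routine, not an error.
                print("fp16 dynamic loss scale overflow! Skipping step. "
                      "Attempted loss scale: {}, reducing to {}".format(
                          prev_scale, self.cur_scale))
            else:
                self.clean_steps += 1
                if self.clean_steps % self.growth_interval == 0:
                    self.cur_scale *= 2.0

An occasional overflow is expected while the scaler probes for the largest safe scale, hence "less scary" wording.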
deepspeed/runtime/fp16/fused_optimizer.py

@@ -153,10 +153,10 @@ class FP16_Optimizer(object):
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.cur_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                       self.cur_scale))
             return self.overflow

         combined_scale = self.unscale_and_clip_grads(grads_groups_flat,
                                                      norm_groups,
deepspeed/runtime/fp16/loss_scaler.py

@@ -213,7 +213,7 @@ if __name__ == "__main__":
             optimizer.step()
         # Otherwise, don't do anything -- ie, skip iteration
         else:
-            print('OVERFLOW!')
+            print('fp16 dynamic loss scale overflow!')

         # Update loss scale for next iteration
         loss_scaler.update_scale(has_overflow)
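The context lines around this hunk show the canonical usage loop: scale the loss, test the gradients, step only on clean iterations, and always update the scale. A hedged sketch of that loop using the illustrative scaler above (model, optimizer, loss_fn, and data_loader are assumed to exist):

    # Shape of the loop in the loss_scaler.py example (illustrative names).
    scaler = SimpleDynamicScaler()
    for inputs, targets in data_loader:
        optimizer.zero_grad()
        loss = loss_fn(model(inputs), targets)
        (loss * scaler.cur_scale).backward()  # scale the loss before backward
        has_overflow = scaler.has_overflow(model.parameters())
        if not has_overflow:
            # Unscale gradients, then update as usual.
            for p in model.parameters():
                if p.grad is not None:
                    p.grad.div_(scaler.cur_scale)
            optimizer.step()
        # Otherwise, don't do anything -- ie, skip iteration.
        # Update loss scale for next iteration.
        scaler.update_scale(has_overflow)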
deepspeed/runtime/fp16/unfused_optimizer.py

@@ -139,10 +139,10 @@ class FP16_UnfusedOptimizer(object):
         self._update_scale(self.overflow)
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.cur_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                       self.cur_scale))
             return self.overflow

         combined_scale = self.unscale_and_clip_grads(norm_groups, apply_scale=False)
@@ -165,10 +165,10 @@ class FP16_UnfusedOptimizer(object):
         self._update_scale(self.overflow)
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.cur_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                       self.cur_scale))
             return self.overflow

         norm_groups = []
deepspeed/runtime/zero/stage1.py

@@ -630,10 +630,10 @@ class FP16_DeepSpeedZeroOptimizer_Stage1(object):
         if self.overflow:
             self.zero_grad()
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.loss_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                       self.loss_scale))
             return self.overflow

         norm_groups = []
deepspeed/runtime/zero/stage2.py

@@ -1355,7 +1355,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
             see_memory_usage('After overflow after clearing gradients')

             logger.info(
-                "[deepscale] OVERFLOW! Rank {} Skipping step. Attempted loss scale: {}, "
+                "[deepspeed] fp16 dynamic loss scale overflow! Rank {} Skipping step. Attempted loss scale: {}, "
                 "reducing to {}".format(dist.get_rank(),
                                         prev_scale,
                                         self.loss_scale))