Unverified Commit 29853c3e authored by Stas Bekman, committed by GitHub

less scary overflow notice (#833)


Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
parent dd03cff2
@@ -153,10 +153,10 @@ class FP16_Optimizer(object):
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.cur_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                       self.cur_scale))
             return self.overflow
         combined_scale = self.unscale_and_clip_grads(grads_groups_flat,
                                                      norm_groups,
...
@@ -213,7 +213,7 @@ if __name__ == "__main__":
             optimizer.step()
         # Otherwise, don't do anything -- ie, skip iteration
         else:
-            print('OVERFLOW!')
+            print('fp16 dynamic loss scale overflow!')
         # Update loss scale for next iteration
         loss_scaler.update_scale(has_overflow)
...
@@ -139,10 +139,10 @@ class FP16_UnfusedOptimizer(object):
         self._update_scale(self.overflow)
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.cur_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                       self.cur_scale))
             return self.overflow
         combined_scale = self.unscale_and_clip_grads(norm_groups, apply_scale=False)
@@ -165,10 +165,10 @@ class FP16_UnfusedOptimizer(object):
         self._update_scale(self.overflow)
         if self.overflow:
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.cur_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                       self.cur_scale))
             return self.overflow
         norm_groups = []
...
@@ -630,10 +630,10 @@ class FP16_DeepSpeedZeroOptimizer_Stage1(object):
         if self.overflow:
             self.zero_grad()
             if self.verbose:
-                logger.info("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                            "scale: {}, reducing to {}".format(
-                                prev_scale,
-                                self.loss_scale))
+                logger.info(
+                    "[deepspeed] fp16 dynamic loss scale overflow! Skipping step. Attempted loss "
+                    "scale: {}, reducing to {}".format(prev_scale,
+                                                       self.loss_scale))
             return self.overflow
         norm_groups = []
...
@@ -1355,7 +1355,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
             see_memory_usage('After overflow after clearing gradients')
             logger.info(
-                "[deepscale] OVERFLOW! Rank {} Skipping step. Attempted loss scale: {}, "
+                "[deepspeed] fp16 dynamic loss scale overflow! Rank {} Skipping step. Attempted loss scale: {}, "
                 "reducing to {}".format(dist.get_rank(),
                                         prev_scale,
                                         self.loss_scale))
...