"git@developer.sourcefind.cn:one/TransferBench.git" did not exist on "a576c7612d85d24780d26f382a046ab45d2b1bf7"
Unverified commit 2d52809c authored by Cao Yuhang, committed by GitHub

Add dynamic scale (#585)

* add dynamic scale

* add type check of loss scale

* fix lint

* minor fix
parent 03214fd4
@@ -6,7 +6,7 @@ from .checkpoint import (_load_checkpoint, load_checkpoint, load_state_dict,
 from .dist_utils import (allreduce_grads, allreduce_params, get_dist_info,
                          init_dist, master_only)
 from .epoch_based_runner import EpochBasedRunner, Runner
-from .fp16_utils import auto_fp16, force_fp32, wrap_fp16_model
+from .fp16_utils import LossScaler, auto_fp16, force_fp32, wrap_fp16_model
 from .hooks import (HOOKS, CheckpointHook, ClosureHook, DistSamplerSeedHook,
                     EMAHook, Fp16OptimizerHook, Hook, IterTimerHook,
                     LoggerHook, LrUpdaterHook, MlflowLoggerHook, OptimizerHook,
@@ -33,5 +33,5 @@ __all__ = [
     'build_optimizer', 'build_optimizer_constructor', 'IterLoader',
     'set_random_seed', 'auto_fp16', 'force_fp32', 'wrap_fp16_model',
     'Fp16OptimizerHook', 'SyncBuffersHook', 'EMAHook', 'build_runner',
-    'RUNNERS', 'allreduce_grads', 'allreduce_params'
+    'RUNNERS', 'allreduce_grads', 'allreduce_params', 'LossScaler'
 ]
@@ -264,3 +264,87 @@ def patch_forward_method(func, src_type, dst_type, convert_output=True):
         return output
 
     return new_forward
+
+
+class LossScaler:
+    """Class that manages loss scaling in mixed precision training which
+    supports both dynamic and static mode.
+
+    The implementation refers to
+    https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/loss_scaler.py.
+    Dynamic loss scaling is used by supplying ``mode='dynamic'``.
+    It is important to understand how :class:`LossScaler` operates.
+    Loss scaling is designed to combat the problem of underflowing
+    gradients encountered when training fp16 networks for a long time.
+    Dynamic loss scaling begins by attempting a very high loss
+    scale. Ironically, this may result in OVERflowing gradients.
+    If overflowing gradients are encountered, :class:`Fp16OptimizerHook` then
+    skips the update step for this particular iteration/minibatch,
+    and :class:`LossScaler` adjusts the loss scale to a lower value.
+    If a certain number of iterations occur without overflowing gradients
+    being detected, :class:`LossScaler` increases the loss scale once more.
+    In this way :class:`LossScaler` attempts to "ride the edge" of always
+    using the highest loss scale possible without incurring overflow.
+
+    Args:
+        init_scale (float): Initial loss scale value, default: 2**32.
+        scale_factor (float): Factor used when adjusting the loss scale.
+            Default: 2.
+        mode (str): Loss scaling mode. 'dynamic' or 'static'.
+        scale_window (int): Number of consecutive iterations without an
+            overflow to wait before increasing the loss scale. Default: 1000.
+    """
+
+    def __init__(self,
+                 init_scale=2**32,
+                 mode='dynamic',
+                 scale_factor=2.,
+                 scale_window=1000):
+        self.cur_scale = init_scale
+        self.cur_iter = 0
+        assert mode in ('dynamic',
+                        'static'), 'mode can only be dynamic or static'
+        self.mode = mode
+        self.last_overflow_iter = -1
+        self.scale_factor = scale_factor
+        self.scale_window = scale_window
+
+    def has_overflow(self, params):
+        """Check if any gradient of ``params`` contains inf or NaN."""
+        if self.mode != 'dynamic':
+            return False
+        for p in params:
+            if p.grad is not None and LossScaler._has_inf_or_nan(p.grad.data):
+                return True
+        return False
+
+    @staticmethod
+    def _has_inf_or_nan(x):
+        """Check if a tensor contains inf or NaN values."""
+        try:
+            cpu_sum = float(x.float().sum())
+        except RuntimeError as instance:
+            if 'value cannot be converted' not in instance.args[0]:
+                raise
+            return True
+        else:
+            if cpu_sum == float('inf') or cpu_sum == -float('inf') \
+                    or cpu_sum != cpu_sum:
+                return True
+            return False
+
+    def update_scale(self, overflow):
+        """Update the loss scale: lower it on overflow, raise it after
+        ``scale_window`` consecutive iterations without overflow."""
+        if self.mode != 'dynamic':
+            return
+        if overflow:
+            self.cur_scale = max(self.cur_scale / self.scale_factor, 1)
+            self.last_overflow_iter = self.cur_iter
+        else:
+            if (self.cur_iter - self.last_overflow_iter) % \
+                    self.scale_window == 0:
+                self.cur_scale *= self.scale_factor
+        self.cur_iter += 1
+
+    @property
+    def loss_scale(self):
+        return self.cur_scale
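For orientation, here is a minimal sketch (not part of the commit) that drives the dynamic schedule of the `LossScaler` added above: the scale is divided by `scale_factor` whenever an overflow is reported and multiplied by it again after `scale_window` consecutive clean iterations. The import path is an assumption based on the `__init__.py` change in this commit, and the small `scale_window` is chosen only to make the behaviour visible.

```python
# Illustration only, not part of the patch. Assumes the package exposes
# LossScaler as added to __init__.py in this commit (e.g. mmcv.runner).
from mmcv.runner import LossScaler

scaler = LossScaler(init_scale=2**16, mode='dynamic', scale_window=4)

for it in range(12):
    overflow = it < 2  # pretend only the first two iterations overflow
    # a real training loop would unscale gradients and step the optimizer
    # when overflow is False; here we only drive the scale schedule
    scaler.update_scale(overflow)
    print(it, overflow, scaler.loss_scale)

# Expected: 65536 -> 32768 -> 16384 on the two overflows, then the scale
# doubles again once every 4 (scale_window) overflow-free iterations.
```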
@@ -6,7 +6,7 @@ from itertools import chain
 
 from torch.nn.utils import clip_grad
 
 from ..dist_utils import allreduce_grads
-from ..fp16_utils import wrap_fp16_model
+from ..fp16_utils import LossScaler, wrap_fp16_model
 from .hook import HOOKS, Hook
 
@@ -48,7 +48,8 @@ class Fp16OptimizerHook(OptimizerHook):
     Refer to https://arxiv.org/abs/1710.03740 for more details.
 
     Args:
-        loss_scale (float): Scale factor multiplied with loss.
+        loss_scale (float | str): Scale factor multiplied with loss. If
+            'dynamic' is specified, then dynamic loss scaling will be used.
     """
 
     def __init__(self,
@@ -60,8 +61,13 @@ class Fp16OptimizerHook(OptimizerHook):
         self.grad_clip = grad_clip
         self.coalesce = coalesce
         self.bucket_size_mb = bucket_size_mb
-        self.loss_scale = loss_scale
         self.distributed = distributed
+        if loss_scale == 'dynamic':
+            self.loss_scaler = LossScaler(mode='dynamic')
+        elif isinstance(loss_scale, float):
+            self.loss_scaler = LossScaler(init_scale=loss_scale, mode='static')
+        else:
+            raise ValueError('loss_scale must be of type float or str')
 
     def before_run(self, runner):
         """Preparing steps before Mixed Precision Training.
@@ -100,7 +106,8 @@ class Fp16OptimizerHook(OptimizerHook):
             fp16_param.data.copy_(fp32_param.data)
 
     def after_train_iter(self, runner):
-        """Backward optimization steps for Mixed Precision Training.
+        """Backward optimization steps for Mixed Precision Training. For
+        dynamic loss scaling, refer to ``LossScaler`` in fp16_utils.py.
 
         1. Scale the loss by a scale factor.
         2. Backward the loss to obtain the gradients (fp16).
@@ -112,9 +119,10 @@ class Fp16OptimizerHook(OptimizerHook):
         runner.model.zero_grad()
         runner.optimizer.zero_grad()
         # scale the loss value
-        scaled_loss = runner.outputs['loss'] * self.loss_scale
+        scaled_loss = runner.outputs['loss'] * self.loss_scaler.loss_scale
         scaled_loss.backward()
+
         # copy fp16 grads in the model to fp32 params in the optimizer
         fp32_weights = []
         for param_group in runner.optimizer.param_groups:
             fp32_weights += param_group['params']
@@ -122,17 +130,21 @@ class Fp16OptimizerHook(OptimizerHook):
         # allreduce grads
         if self.distributed:
             allreduce_grads(fp32_weights, self.coalesce, self.bucket_size_mb)
-        # scale the gradients back
-        for param in fp32_weights:
-            if param.grad is not None:
-                param.grad.div_(self.loss_scale)
-        if self.grad_clip is not None:
-            grad_norm = self.clip_grads(fp32_weights)
-            if grad_norm is not None:
-                # Add grad norm to the logger
-                runner.log_buffer.update({'grad_norm': float(grad_norm)},
-                                         runner.outputs['num_samples'])
-        # update fp32 params
-        runner.optimizer.step()
-        # copy fp32 params to the fp16 model
-        self.copy_params_to_fp16(runner.model, fp32_weights)
+
+        has_overflow = self.loss_scaler.has_overflow(fp32_weights)
+        # if overflow has occurred, skip this iteration
+        if not has_overflow:
+            # scale the gradients back
+            for param in fp32_weights:
+                if param.grad is not None:
+                    param.grad.div_(self.loss_scaler.loss_scale)
+            if self.grad_clip is not None:
+                self.clip_grads(fp32_weights)
+            # update fp32 params
+            runner.optimizer.step()
+            # copy fp32 params to the fp16 model
+            self.copy_params_to_fp16(runner.model, fp32_weights)
+        self.loss_scaler.update_scale(has_overflow)
+        if has_overflow:
+            runner.logger.warning('Check overflow, downscale loss scale '
+                                  f'to {self.loss_scaler.cur_scale}')
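To show what the new `loss_scale` handling means for users, here is a hedged sketch of constructing the hook after this change. The keyword name `loss_scale` and the `ValueError` message come from the diff above; the import path and the presence of defaults for the other constructor arguments are assumptions.

```python
# Sketch only; assumes Fp16OptimizerHook is importable from mmcv.runner as
# exported in __init__.py, and that its other arguments have defaults.
from mmcv.runner import Fp16OptimizerHook

# static scaling: the loss is always multiplied by a fixed float
static_hook = Fp16OptimizerHook(loss_scale=512.)

# dynamic scaling: the new string value switches on LossScaler's dynamic mode
dynamic_hook = Fp16OptimizerHook(loss_scale='dynamic')

# anything else is rejected by the new type check in __init__
try:
    Fp16OptimizerHook(loss_scale=512)  # an int is neither float nor 'dynamic'
except ValueError as err:
    print(err)  # loss_scale must be of type float or str
```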