Commit 007c5947 authored by Deyu Fu

Revert to Fused* naming and clean up accordingly:

FusedSGD now works as before
FusedAdam now works with O1/O2 and no longer fuses scaling and casting
Removed special backend handling for FusedAdam
Moved and updated the FusedAdam test into run_optimizers
Removed legacy tests for optimizers.FP16_Optimizer and FusedAdam in run_mixed_adam
parent 37a1c121
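
As context for the change above, here is a minimal usage sketch (not part of the commit) of how the reworked FusedAdam is meant to be driven through amp at O1/O2; the model, sizes, and opt_level are illustrative assumptions.

# Hedged sketch: FusedAdam under amp after this change. Loss scaling and
# master-weight handling live in amp; FusedAdam.step() takes no special arguments.
import torch
from apex import amp
from apex.optimizers import FusedAdam

model = torch.nn.Linear(1024, 16).cuda()            # illustrative model
optimizer = FusedAdam(model.parameters(), lr=1e-3)

# opt_level "O2" is one example; per the commit message O1 also works.
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

x = torch.randn(64, 1024, device="cuda")
loss = model(x).float().sum()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()                          # amp scales/unscales gradients
optimizer.step()
optimizer.zero_grad()
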
@@ -10,7 +10,6 @@ from ._process_optimizer import _process_optimizer
from apex.fp16_utils import convert_network
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
-from ..optimizers import FusedAdam
from ..parallel import DistributedDataParallel as apex_DDP
from ..parallel.LARC import LARC
......
@@ -3,7 +3,7 @@ from ..fp16_utils import master_params_to_model_params
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import maybe_print
import torch
-from ..optimizers import FusedAdam, FusedSGD
+from ..optimizers import FusedSGD
class AmpOptimizerState(object):
@@ -241,67 +241,8 @@ def post_backward_no_master_weights(self, scaler):
post_backward_models_are_masters(scaler, params, stashed_grads)
#####################################################################################
# FusedAdam versions
#####################################################################################
def prepare_backward_with_master_weights_FusedAdam(self):
stash = self._amp_stash
self._amp_lazy_init()
def post_backward_with_master_weights_FusedAdam(self, scaler):
stash = self._amp_stash
self._amp_lazy_init()
stash.scale = scaler.loss_scale()
stash.grads = [[param.grad.data for param in group] for group in stash.fp16_groups]
stash.output_params = [[param for param in group] for group in stash.fp16_groups]
norm_groups = []
skip = False
for grad_group in stash.grads:
norm, _ = multi_tensor_applier(
stash.multi_tensor_l2norm,
stash.dummy_overflow_buf,
[grad_group],
False)
# Still syncing here for now.
norm = float(norm)
norm_groups.append(norm)
if norm == float('inf') or norm == -float('inf') or norm != norm:
skip = True
if skip:
scaler._overflow_buf.fill_(1.)
scaler._has_overflow = True
stash.grad_norms = norm_groups
def prepare_backward_no_master_weights_FusedAdam(self):
stash = self._amp_stash
self._amp_lazy_init()
def post_backward_no_master_weights_FusedAdam(self, scaler):
stash = self._amp_stash
self._amp_lazy_init()
stash.scale = scaler.loss_scale()
stash.grads = None
stash.output_params = None
stash.grad_norms = None
#####################################################################################
# FusedSGD versions
# Eat this ugly code duplication for now. First make it work, then make it clean.
# It's difficult to anticipate what can be unified between the FusedAdam and FusedSGD
# implementations until I have them both working.
#####################################################################################
# FusedSGD never explicitly materializes the fp32 gradients for "fp32 from fp16" master params
@@ -406,7 +347,7 @@ def _process_optimizer(optimizer, properties):
if closure is not None:
raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
retval = old_step()
-if not (isinstance(self, FusedAdam) or isinstance(self, FusedSGD)):
+if not isinstance(self, FusedSGD):
self._master_params_to_model_params()
# Clear the master grads that wouldn't be zeroed by model.zero_grad()
for param in self._amp_stash.all_fp32_from_fp16_params:
@@ -432,12 +373,7 @@ def _process_optimizer(optimizer, properties):
param.grad = None
optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)
-if isinstance(optimizer, FusedAdam):
-optimizer._prepare_amp_backward = types.MethodType(
-prepare_backward_with_master_weights_FusedAdam, optimizer)
-optimizer._post_amp_backward = types.MethodType(
-post_backward_with_master_weights_FusedAdam, optimizer)
-elif isinstance(optimizer, FusedSGD):
+if isinstance(optimizer, FusedSGD):
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_with_master_weights_FusedSGD, optimizer)
optimizer._post_amp_backward = types.MethodType(
@@ -451,12 +387,7 @@ def _process_optimizer(optimizer, properties):
optimizer._lazy_init_maybe_master_weights = types.MethodType(
lazy_init_no_master_weights, optimizer)
-if isinstance(optimizer, FusedAdam):
-optimizer._prepare_amp_backward = types.MethodType(
-prepare_backward_no_master_weights_FusedAdam, optimizer)
-optimizer._post_amp_backward = types.MethodType(
-post_backward_no_master_weights_FusedAdam, optimizer)
-elif isinstance(optimizer, FusedSGD):
+if isinstance(optimizer, FusedSGD):
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_no_master_weights_FusedSGD, optimizer)
optimizer._post_amp_backward = types.MethodType(
......
@@ -113,7 +113,7 @@ def scale_loss(loss,
for optimizer in optimizers:
optimizer._amp_stash.params_have_scaled_gradients = True
else:
-# FusedAdam and FusedSGD may take care of unscaling as part of their step() methods.
+# FusedSGD may take care of unscaling as part of their step() methods.
# if not isinstance(optimizers, FP16_Optimizer_for_fused):
loss_scaler.clear_overflow_state()
for optimizer in optimizers:
......
from .fused_sgd import FusedSGD
from .fused_adam import FusedAdam
+from .fused_novograd import FusedNovoGrad
from .fp16_optimizer import FP16_Optimizer
-from .sgd import SGD
-from .adam import Adam
-from .novograd import NovoGrad
import torch
from apex.multi_tensor_apply import multi_tensor_applier
from amp_C import multi_tensor_adam
class Adam(torch.optim.Optimizer):
"""Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
This version of fused adam implements 2 fusion:
- Fusion of operations within adam optimizer
- Apply operation on a list of tensor in single multi-tensor kernel by group
It is a breaking change over last version, as API changes and it no longer fuse grad norm and loss scaling.
It has been proposed in `Adam: A Method for Stochastic Optimization`_.
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False) NOT SUPPORTED in FusedAdam!
eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
adds eps to the bias-corrected second moment estimate before
evaluating square root instead of adding it to the square root of
second moment estimate as in the original paper. (default: False)
.. _Adam\: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params, lr=1e-3, bias_correction = True,
betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False,
weight_decay=0., amsgrad=False):
if amsgrad:
raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay)
super(Adam, self).__init__(params, defaults)
self.eps_mode = 0 if eps_inside_sqrt else 1
self.dummy_overflow_buf = torch.cuda.IntTensor([0])
def step(self, closure=None, grads=None, output_params=None, scale=None, grad_norms=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
if any(p is not None for p in [grads, output_params, scale, grad_norms]):
raise RuntimeError('Adam has been updated, please use with AMP for mixed precision. '
'For legacy code using fp16_optimizer, use FusedAdam.')
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
# assume same step across group now to simplify things
# per parameter step can be easily support by making it tensor, or pass list into kernel
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
# create lists for multi-tensor apply
p_list, g_list, m1_list, m2_list = [], [], [], []
for p in group['params']:
if p.grad is None:
continue
if p.grad.data.is_sparse:
raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
p_list.append(p.data)
g_list.append(p.grad.data)
m1_list.append(state['exp_avg'])
m2_list.append(state['exp_avg_sq'])
multi_tensor_applier(multi_tensor_adam,
self.dummy_overflow_buf,
[g_list, p_list, m1_list, m2_list],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
self.eps_mode,
bias_correction,
group['weight_decay'])
return loss
@@ -35,8 +35,8 @@ class FP16_Optimizer(object):
dynamic_loss_args=None,
verbose=True):
-print("\nfp16_optimizer is designed to work with apex.optimizers.Fused*, and will be removed in future")
-print("To update, use updated optimizers without Fused prefix with AMP.")
+print("\nfp16_optimizer is designed to only work with apex.optimizers, and will be removed in future")
+print("To update, use updated optimizers with AMP.")
# The fused optimizer does all the work. We need this layer for two reason:
# 1. maintain same user API from apex.fp16_utils
# 2. keep common stuff here in case we need to add new fused optimizer later
......
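
The comment above explains why the FP16_Optimizer wrapper is kept: it preserves the apex.fp16_utils-style user API. Below is a rough sketch of that legacy API, mirroring the run_mixed_adam tests removed later in this commit; the model, sizes, and loss scale are illustrative, and this path is deprecated per the print statements above.

# Hedged sketch of the deprecated wrapper API this layer preserves.
import torch
import apex

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = apex.optimizers.FusedAdam(model.parameters())
optimizer = apex.optimizers.FP16_Optimizer(optimizer, static_loss_scale=128.0)

x = torch.randn(64, 1024, dtype=torch.float16, device='cuda')
loss = model(x).sum()
optimizer.backward(loss)   # wrapper applies the loss scale and manages master grads
optimizer.step()
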
-import types
import torch
-import importlib
from apex.multi_tensor_apply import multi_tensor_applier
+from amp_C import multi_tensor_adam
class FusedAdam(torch.optim.Optimizer):
"""Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
+This version of fused adam implements 2 fusion:
+- Fusion of operations within adam optimizer
+- Apply operation on a list of tensor in single multi-tensor kernel by group
+It is a breaking change over last version, as API changes and it no longer fuse grad norm and loss scaling.
It has been proposed in `Adam: A Method for Stochastic Optimization`_.
Arguments:
@@ -26,8 +30,6 @@ class FusedAdam(torch.optim.Optimizer):
adds eps to the bias-corrected second moment estimate before
evaluating square root instead of adding it to the square root of
second moment estimate as in the original paper. (default: False)
-use_mt (boolean, optional): use multi tensor apply for lower launch
-latency. (default: False)
.. _Adam\: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
@@ -35,165 +37,75 @@ class FusedAdam(torch.optim.Optimizer):
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params, def __init__(self, params, lr=1e-3, bias_correction = True,
lr=1e-3, bias_correction = True,
betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False, betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False,
weight_decay=0., max_grad_norm=0., amsgrad=False, use_mt=False, weight_decay=0., amsgrad=False):
amp_scale_adjustment=1.0):
print("\nFusedAdam will be removed in future. To update, use apex.optimizers.Adam with AMP.")
global fused_adam_cuda
fused_adam_cuda = importlib.import_module("fused_adam_cuda")
self._use_multi_tensor = False
if use_mt:
if not multi_tensor_applier.available:
print("Warning: multi_tensor_applier is unavailable")
else:
self._use_multi_tensor = True
self._overflow_buf = torch.cuda.IntTensor([0])
self._amp_scale_adjustment = amp_scale_adjustment
if amsgrad: if amsgrad:
raise RuntimeError('FusedAdam does not support the AMSGrad variant.') raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction, defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay, betas=betas, eps=eps, weight_decay=weight_decay)
max_grad_norm=max_grad_norm)
super(FusedAdam, self).__init__(params, defaults) super(FusedAdam, self).__init__(params, defaults)
self.eps_mode = 0 if eps_inside_sqrt else 1 self.eps_mode = 0 if eps_inside_sqrt else 1
self.dummy_overflow_buf = torch.cuda.IntTensor([0])
def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None): def step(self, closure=None, grads=None, output_params=None, scale=None, grad_norms=None):
"""Performs a single optimization step. """Performs a single optimization step.
Arguments: Arguments:
closure (callable, optional): A closure that reevaluates the model closure (callable, optional): A closure that reevaluates the model
and returns the loss. and returns the loss.
grads (list of tensors, optional): weight gradient to use for the
optimizer update. If gradients have type torch.half, parameters
are expected to be in type torch.float. (default: None)
output params (list of tensors, optional): A reduced precision copy
of the updated weights written out in addition to the regular
updated weights. Have to be of same type as gradients. (default: None)
scale (float, optional): factor to divide gradient tensor values
by before applying to weights. (default: 1)
""" """
if any(p is not None for p in [grads, output_params, scale, grad_norms]):
raise RuntimeError('FusedAdam has been updated, please use with AMP for mixed precision.')
loss = None loss = None
if closure is not None: if closure is not None:
loss = closure() loss = closure()
if hasattr(self, "_amp_stash"): for group in self.param_groups:
grads = self._amp_stash.grads
output_params = self._amp_stash.output_params
scale = self._amp_stash.scale*self._amp_scale_adjustment
grad_norms = self._amp_stash.grad_norms
if grads is None:
grads_group = [None]*len(self.param_groups)
# backward compatibility
# assuming a list/generator of parameter means single group
elif isinstance(grads, types.GeneratorType):
grads_group = [grads]
elif type(grads[0])!=list:
grads_group = [grads]
else:
grads_group = grads
if output_params is None:
output_params_group = [None]*len(self.param_groups)
elif isinstance(output_params, types.GeneratorType):
output_params_group = [output_params]
elif type(output_params[0])!=list:
output_params_group = [output_params]
else:
output_params_group = output_params
if grad_norms is None:
grad_norms = [None]*len(self.param_groups)
for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group, output_params_group, grad_norms):
if grads_this_group is None:
grads_this_group = [None]*len(group['params'])
if output_params_this_group is None:
output_params_this_group = [None]*len(group['params'])
# compute combined scale factor for this group
combined_scale = scale
if group['max_grad_norm'] > 0:
# norm is in fact norm*scale
clip = ((grad_norm / scale) + 1e-6) / group['max_grad_norm']
if clip > 1:
combined_scale = clip * scale
bias_correction = 1 if group['bias_correction'] else 0 bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
# assume same step across group now to simplify things
# per parameter step can be easily support by making it tensor, or pass list into kernel
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
if self._use_multi_tensor: # create lists for multi-tensor apply
if output_params: p_list, g_list, m1_list, m2_list = [], [], [], []
tensorlists = [[],[],[],[],[]]
else:
tensorlists = [[],[],[],[]]
for p, grad, output_param in zip(group['params'], grads_this_group, output_params_this_group): for p in group['params']:
#note: p.grad should not ever be set for correct operation of mixed precision optimizer that sometimes sends None gradients if p.grad is None:
if p.grad is None and grad is None:
continue continue
if grad is None: if p.grad.data.is_sparse:
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead') raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p] state = self.state[p]
# State initialization # State initialization
if len(state) == 0: if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values # Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data) state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values # Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data) state['exp_avg_sq'] = torch.zeros_like(p.data)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] p_list.append(p.data)
beta1, beta2 = group['betas'] g_list.append(p.grad.data)
m1_list.append(state['exp_avg'])
state['step'] += 1 m2_list.append(state['exp_avg_sq'])
out_p = torch.tensor([], dtype = torch.float) if output_param is None else output_param multi_tensor_applier(multi_tensor_adam,
if self._use_multi_tensor: self.dummy_overflow_buf,
pl = [p.data, exp_avg, exp_avg_sq, grad] [g_list, p_list, m1_list, m2_list],
if output_param is not None: group['lr'],
pl.append(out_p) beta1,
beta2,
for tl, t in zip(tensorlists, pl): group['eps'],
tl.append(t) group['step'],
else: self.eps_mode,
fused_adam_cuda.adam(p.data, bias_correction,
out_p, group['weight_decay'])
exp_avg,
exp_avg_sq,
grad,
group['lr'],
beta1,
beta2,
group['eps'],
combined_scale,
state['step'],
self.eps_mode,
bias_correction,
group['weight_decay'])
if self._use_multi_tensor:
multi_tensor_applier(
fused_adam_cuda.adam_mt,
self._overflow_buf,
tensorlists,
group['lr'],
beta1,
beta2,
group['eps'],
combined_scale,
state['step'],
self.eps_mode,
bias_correction,
group['weight_decay'])
return loss return loss
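
For reference, a small sketch of how the reworked step() above is exercised without amp; it mirrors the updated run_optimizers test later in the diff, and the tensor size and hyperparameters are illustrative.

# Hedged sketch: the new FusedAdam is driven like torch.optim.Adam for fp32 params;
# the legacy grads=/scale=/output_params= arguments now raise a RuntimeError.
import torch
import apex

param = torch.nn.Parameter(torch.rand(278011, device='cuda'))
optimizer = apex.optimizers.FusedAdam([param], lr=5e-4, betas=(0.9, 0.999), eps=1e-8)

param.grad = torch.rand_like(param)
optimizer.step()           # one multi_tensor_adam launch per parameter group
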
import torch
from apex.multi_tensor_apply import multi_tensor_applier
-class NovoGrad(torch.optim.Optimizer):
+class FusedNovoGrad(torch.optim.Optimizer):
"""Implements NovoGrad algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
@@ -47,19 +47,19 @@ class NovoGrad(torch.optim.Optimizer):
grad_averaging=True, norm_type=2, init_zero=False,
set_grad_none=True):
if amsgrad:
-raise RuntimeError('NovoGrad does not support the AMSGrad variant.')
+raise RuntimeError('FusedNovoGrad does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay,
grad_averaging=grad_averaging, norm_type=norm_type,
init_zero=init_zero)
-super(NovoGrad, self).__init__(params, defaults)
+super(FusedNovoGrad, self).__init__(params, defaults)
if multi_tensor_applier.available:
import amp_C
# Skip buffer
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
self.multi_tensor_novograd = amp_C.multi_tensor_novograd
else:
-raise RuntimeError('apex.optimizers.NovoGrad requires cuda extensions')
+raise RuntimeError('apex.optimizers.FusedNovoGrad requires cuda extensions')
self.moment_mode = 0 if reg_inside_moment else 1
self.set_grad_none = set_grad_none
@@ -70,7 +70,7 @@ class NovoGrad(torch.optim.Optimizer):
for p in group['params']:
p.grad = None
else:
-super(NovoGrad, self).zero_grad()
+super(FusedNovoGrad, self).zero_grad()
def step(self, closure=None):
"""Performs a single optimization step.
@@ -103,7 +103,7 @@ class NovoGrad(torch.optim.Optimizer):
if p.grad is None:
continue
if p.grad.data.is_sparse:
-raise RuntimeError('NovoGrad does not support sparse gradients, please consider SparseAdam instead')
+raise RuntimeError('FusedNovoGrad does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
@@ -120,7 +120,7 @@ class NovoGrad(torch.optim.Optimizer):
p_32.append(p.data)
m_32.append(state['exp_avg'])
else:
-raise RuntimeError('NovoGrad only support fp16 and fp32.')
+raise RuntimeError('FusedNovoGrad only support fp16 and fp32.')
# we store per weight norm as one tensor for one group/precision combination
# different from optim.Adam, we store norm here(not ^2) so we can unify calculation for norm types
@@ -137,7 +137,7 @@ class NovoGrad(torch.optim.Optimizer):
v_16 = [torch.sum(torch.pow(g, 2)).sqrt().item() for g in g_16]
v_32 = [torch.sum(torch.pow(g, 2)).sqrt().item() for g in g_32]
else:
-raise RuntimeError('NovoGrad only support l2/inf norm now.')
+raise RuntimeError('FusedNovoGrad only support l2/inf norm now.')
group['exp_avg_sq'][0] = torch.cuda.FloatTensor(v_16)
group['exp_avg_sq'][1] = torch.cuda.FloatTensor(v_32)
else:
......
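
The renamed FusedNovoGrad keeps the same construction and step interface; here is a minimal sketch assuming default hyperparameters, with an illustrative model. The constructor raises a RuntimeError if the cuda extensions (amp_C) are not built, per the check shown above.

# Hedged sketch: constructing and stepping the renamed FusedNovoGrad on fp32 params.
import torch
import apex

model = torch.nn.Linear(1024, 16).cuda()
optimizer = apex.optimizers.FusedNovoGrad(model.parameters(), lr=1e-3)

loss = model(torch.randn(64, 1024, device='cuda')).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
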
@@ -53,9 +53,6 @@ class FusedSGD(Optimizer):
weight_decay=0, nesterov=False,
wd_after_momentum=False,
materialize_master_grads=True):
-print("\nFusedSGD will be removed in future. To update, use apex.optimizers.SGD with AMP.")
if lr is not required and lr < 0.0:
raise ValueError("Invalid learning rate: {}".format(lr))
if momentum < 0.0:
......
import torch
from apex.multi_tensor_apply import multi_tensor_applier
class SGD(torch.optim.Optimizer):
r"""Implements stochastic gradient descent (optionally with momentum).
Nesterov momentum is based on the formula from
`On the importance of initialization and momentum in deep learning`__.
Args:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float): learning rate
momentum (float, optional): momentum factor (default: 0)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
dampening (float, optional): dampening for momentum (default: 0)
nesterov (bool, optional): enables Nesterov momentum (default: False)
set_grad_none (bool, optional): whether set grad to None when zero_grad()
method is called. (default: True)
Example:
>>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
>>> optimizer.zero_grad()
>>> loss_fn(model(input), target).backward()
>>> optimizer.step()
__ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf
.. note::
The implementation of SGD with Momentum/Nesterov subtly differs from
Sutskever et. al. and implementations in some other frameworks.
Considering the specific case of Momentum, the update can be written as
.. math::
v = \rho * v + g \\
p = p - lr * v
where p, g, v and :math:`\rho` denote the parameters, gradient,
velocity, and momentum respectively.
This is in contrast to Sutskever et. al. and
other frameworks which employ an update of the form
.. math::
v = \rho * v + lr * g \\
p = p - v
The Nesterov version is analogously modified.
"""
def __init__(self, params, lr=0.1, momentum=0., dampening=0.,
weight_decay=0., nesterov=False, wd_after_momentum=False,
set_grad_none=True):
if lr < 0.0:
raise ValueError("Invalid learning rate: {}".format(lr))
if momentum < 0.0:
raise ValueError("Invalid momentum value: {}".format(momentum))
if weight_decay < 0.0:
raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
weight_decay=weight_decay, nesterov=nesterov)
self.wd_after_momentum = wd_after_momentum
self.set_grad_none = set_grad_none
if multi_tensor_applier.available:
import amp_C
# Skip buffer
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
self.multi_tensor_axpby = amp_C.multi_tensor_axpby
self.multi_tensor_sgd = amp_C.multi_tensor_sgd
else:
raise RuntimeError('apex.optimizers.SGD requires cuda extensions')
if nesterov and (momentum <= 0 or dampening != 0):
raise ValueError("Nesterov momentum requires a momentum and zero dampening")
super(SGD, self).__init__(params, defaults)
def __setstate__(self, state):
super(SGD, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('nesterov', False)
def zero_grad(self):
if self.set_grad_none:
for group in self.param_groups:
for p in group['params']:
p.grad = None
else:
super(SGD, self).zero_grad()
def get_momentums(self, params):
momentums = []
first_run = True
for p in params:
param_state = self.state[p]
# torch.optim.SGD initializes momentum in the main loop, we have
# to do it here, and track whether or not we've done so, so that
# momentum application can be skipped in the main kernel.
if 'momentum_buffer' not in param_state:
first_run = True
buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
momentums.append(buf)
else:
first_run = False
momentums.append(param_state['momentum_buffer'])
return momentums, first_run
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
weight_decay = group['weight_decay']
momentum = group['momentum']
dampening = group['dampening']
nesterov = group['nesterov']
for group_dtype in [torch.float16, torch.float32]:
grad_list = [p.grad for p in group['params'] if (p.dtype == group_dtype and p.grad is not None)]
if len(grad_list) == 0:
continue
param_list = [p for p in group['params'] if (p.dtype == group_dtype and p.grad is not None)]
if momentum != 0:
momentum_list, first_run = self.get_momentums(param_list)
multi_tensor_applier(
self.multi_tensor_sgd,
self._dummy_overflow_buf,
[grad_list, param_list, momentum_list],
weight_decay,
momentum,
dampening,
group['lr'],
nesterov,
first_run,
self.wd_after_momentum,
1.0)
else:
# show how to implement SGD using axpby, without writing new multi_tensor kernel
# only enabled now in no momentum case, since it saves creating momentum for us
# keep momentum != 0 code below for completeness
if weight_decay != 0 and not self.wd_after_momentum:
multi_tensor_applier(
self.multi_tensor_axpby,
self._dummy_overflow_buf,
[grad_list, param_list, grad_list],
1.,
weight_decay,
2)
if momentum != 0: # always False
if not first_run:
multi_tensor_applier(
self.multi_tensor_axpby,
self._dummy_overflow_buf,
[momentum_list, grad_list, momentum_list],
momentum,
1.-dampening,
2)
if nesterov:
multi_tensor_applier(
self.multi_tensor_axpby,
self._dummy_overflow_buf,
[grad_list, momentum_list, grad_list],
1.,
momentum,
2)
else:
grad_list = momentum_list
if weight_decay != 0 and self.wd_after_momentum:
multi_tensor_applier(
self.multi_tensor_axpby,
self._dummy_overflow_buf,
[grad_list, param_list, grad_list],
1.,
weight_decay,
2)
multi_tensor_applier(
self.multi_tensor_axpby,
self._dummy_overflow_buf,
[param_list, grad_list, param_list],
1.,
-group['lr'],
2)
return loss
import unittest
import torch
import apex
import os
class TestFP16Optimizer(unittest.TestCase):
def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
self.max_abs_diff = max_abs_diff
self.max_rel_diff = max_rel_diff
self.iters = iters
torch.cuda.manual_seed(13337)
N, D_in, D_out = 64, 1024, 16
self.N = N
self.D_in = D_in
self.D_out = D_out
self.x = torch.randn((N, D_in), dtype=torch.float16, device='cuda')
self.ref_model = torch.nn.Linear(D_in, D_out).cuda().half()
self.tst_model = torch.nn.Linear(D_in, D_out).cuda().half()
for p,q in zip(self.tst_model.parameters(), self.ref_model.parameters()):
p.data.copy_(q.data)
def get_max_diff(self, ref_param, tst_param):
max_abs_diff = max_rel_diff = 0
for p_ref, p_tst in zip(ref_param, tst_param):
max_abs_diff_p = (p_ref - p_tst).abs().max().item()
max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()
if max_abs_diff_p > max_abs_diff: max_abs_diff = max_abs_diff_p
if max_rel_diff_p > max_rel_diff: max_rel_diff = max_rel_diff_p
return max_abs_diff, max_rel_diff
def test_fp16_optimizer(self):
ref_optim = torch.optim.Adam(self.ref_model.parameters())
ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
for i in range(self.iters):
ref_loss = self.ref_model(self.x).sum()
ref_optim.backward(ref_loss)
ref_optim.step()
tst_loss = self.tst_model(self.x).sum()
tst_optim.backward(tst_loss)
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_loss_scaling(self):
ref_optim = torch.optim.Adam(self.ref_model.parameters())
ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, static_loss_scale=128.0, verbose=False)
tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
tst_optim = apex.optimizers.FP16_Optimizer(tst_optim, static_loss_scale=128.0)
for i in range(self.iters):
ref_loss = self.ref_model(self.x).sum()
ref_optim.backward(ref_loss)
ref_optim.step()
tst_loss = self.tst_model(self.x).sum()
tst_optim.backward(tst_loss)
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_parameter_groups(self):
ref_groups = [{'params': [self.ref_model.weight]},{'params': [self.ref_model.bias]}]
ref_optim = torch.optim.Adam(ref_groups)
ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
tst_groups = [{'params': [self.tst_model.weight]},{'params': [self.tst_model.bias]}]
tst_optim = apex.optimizers.FusedAdam(tst_groups)
tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
for i in range(self.iters):
ref_loss = self.ref_model(self.x).sum()
ref_optim.backward(ref_loss)
ref_optim.step()
tst_loss = self.tst_model(self.x).sum()
tst_optim.backward(tst_loss)
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_grad_clip(self):
ref_optim = torch.optim.Adam(self.ref_model.parameters())
ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters(), max_grad_norm=0.01)
tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
for i in range(self.iters):
ref_loss = self.ref_model(self.x).sum()
ref_optim.backward(ref_loss)
ref_optim.clip_master_grads(0.01)
ref_optim.step()
tst_loss = self.tst_model(self.x).sum()
tst_optim.backward(tst_loss)
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
@unittest.skip('Not support grad being None')
def test_grad_None(self):
self.fail()
@unittest.skip('Not support same weight decay as pytorch')
def test_weight_decay(self):
self.fail()
@unittest.skip('Not support empty parameter groups')
def test_group_empty(self):
self.fail()
if __name__ == '__main__':
script_path = os.path.dirname(os.path.realpath(__file__))
unittest.main()
import unittest
import os
import random
import torch
import apex
class TestFusedAdam(unittest.TestCase):
def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
self.max_abs_diff = max_abs_diff
self.max_rel_diff = max_rel_diff
self.iters = iters
torch.cuda.manual_seed(9876)
def tearDown(self):
pass
def gen_param_optim(self, tensors, ref_adam_option, tst_adam_option=None):
ref_param = []
tst_param = []
for tensor in tensors:
ref_param.append(torch.nn.Parameter(tensor.clone()))
tst_param.append(torch.nn.Parameter(tensor.clone()))
ref_optim = torch.optim.Adam(ref_param, **ref_adam_option)
if tst_adam_option:
tst_optim = apex.optimizers.FusedAdam(tst_param, **tst_adam_option)
else:
tst_optim = apex.optimizers.FusedAdam(tst_param, **ref_adam_option)
return (ref_param, tst_param, ref_optim, tst_optim)
def gen_grad(self, ref_param, tst_param):
for p_ref, p_tst in zip(ref_param, tst_param):
p_ref.grad = torch.rand_like(p_ref)
p_tst.grad = p_ref.grad
def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
half_grads = []
for p_ref, p_tst in zip(ref_param, tst_param):
half_grads.append(torch.rand_like(p_ref).half())
p_ref.grad = half_grads[-1].float() / scale
return half_grads
def get_max_diff(self, ref_param, tst_param):
max_abs_diff = max_rel_diff = 0
for p_ref, p_tst in zip(ref_param, tst_param):
max_abs_diff_p = (p_ref - p_tst.type(p_ref.type())).abs().max().item()
max_rel_diff_p = ((p_ref - p_tst.type(p_ref.type())) / p_ref).abs().max().item()
if max_abs_diff_p > max_abs_diff: max_abs_diff = max_abs_diff_p
if max_rel_diff_p > max_rel_diff: max_rel_diff = max_rel_diff_p
return max_abs_diff, max_rel_diff
def gen_single_type_test(self, param_type=torch.float):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=param_type, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
for i in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_double(self):
self.gen_single_type_test(param_type=torch.double)
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
def test_half(self):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
for i in range(self.iters):
half_grads = self.gen_mixed_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step(grads=half_grads)
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensors = []
for size in sizes:
tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim(tensors, adam_option)
for i in range(self.iters):
half_grads = self.gen_mixed_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step(grads=half_grads)
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_scale(self):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
for i in range(self.iters):
scale = random.random() * 1000
half_grads = self.gen_mixed_grad(ref_param, tst_param, scale)
ref_optim.step()
tst_optim.step(grads=half_grads, scale=scale)
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_fp16_output(self):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
fp16_param = torch.nn.Parameter(tensor.clone().half())
for i in range(self.iters):
half_grads = self.gen_mixed_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step(grads=half_grads, output_params=[fp16_param])
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
max_abs_diff, max_rel_diff = self.get_max_diff(tst_param, \
[fp16_param.float()])
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_adam_option(self):
nelem = 1
adam_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
for i in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_multi_tensor(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
ref_adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tst_adam_option = dict(ref_adam_option, **{'use_mt':True})
tensors = []
fp16_params = []
for size in sizes:
tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
fp16_params.append(torch.nn.Parameter(tensors[-1].clone().half()))
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim(tensors, ref_adam_option, tst_adam_option)
for i in range(self.iters):
half_grads = self.gen_mixed_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step(grads=half_grads, output_params=fp16_params)
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
max_abs_diff, max_rel_diff = self.get_max_diff(tst_param, \
fp16_params)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
if __name__ == '__main__':
script_path = os.path.dirname(os.path.realpath(__file__))
unittest.main()
@@ -23,7 +23,7 @@ class TestFusedAdam(unittest.TestCase):
tst_param.append(torch.nn.Parameter(tensor.clone()))
ref_optim = torch.optim.Adam(ref_param, **adam_option)
-tst_optim = apex.optimizers.Adam(tst_param, **adam_option)
+tst_optim = apex.optimizers.FusedAdam(tst_param, **adam_option)
return (ref_param, tst_param, ref_optim, tst_optim)
......
import unittest
import sys
-test_dirs = ["run_amp", "run_fp16util", "run_mixed_adam", "run_fused_layer_norm"]
+test_dirs = ["run_amp", "run_fp16util", "run_fused_layer_norm", "run_optimizers"]
runner = unittest.TextTestRunner(verbosity=2)
......
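
The hunk above only shows the test_dirs list and the runner; below is a hedged sketch of the discovery loop such a driver typically runs (the loop body itself is not part of the shown diff and is an assumption).

# Hedged sketch: discover and run each test directory, exiting nonzero on failure.
import sys
import unittest

test_dirs = ["run_amp", "run_fp16util", "run_fused_layer_norm", "run_optimizers"]
runner = unittest.TextTestRunner(verbosity=2)

errcode = 0
for test_dir in test_dirs:
    suite = unittest.TestLoader().discover(test_dir)
    print("\nExecuting tests from " + test_dir)
    result = runner.run(suite)
    if not result.wasSuccessful():
        errcode = 1
sys.exit(errcode)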