Commit 007c5947 authored by Deyu Fu

Revert to Fused* naming and clean up accordingly:

FusedSGD now works as before
FusedAdam now works with O1/O2 and no longer fuses scaling and casting (see the usage sketch below)
Removed special backend handling for FusedAdam
Moved and updated the FusedAdam test into run_optimizers
Removed legacy tests for optimizers.FP16_Optimizer and FusedAdam from run_mixed_adam
parent 37a1c121
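For illustration, a minimal usage sketch of the post-commit workflow (not part of the diff; the model, data, and hyperparameters below are placeholders): FusedAdam is driven through AMP opt levels, and loss scaling/casting is handled by AMP rather than being fused into the optimizer's step().

import torch
from apex import amp
from apex.optimizers import FusedAdam

model = torch.nn.Linear(1024, 16).cuda()            # placeholder model
optimizer = FusedAdam(model.parameters(), lr=1e-3)

# O1 or O2 both work after this change; AMP owns casting and loss scaling.
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

x = torch.randn(64, 1024, device="cuda")            # placeholder batch
loss = model(x).sum()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()            # no grads=/scale= arguments anymore
optimizer.zero_grad()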
......@@ -10,7 +10,6 @@ from ._process_optimizer import _process_optimizer
from apex.fp16_utils import convert_network
from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
from ..optimizers import FusedAdam
from ..parallel import DistributedDataParallel as apex_DDP
from ..parallel.LARC import LARC
......
......@@ -3,7 +3,7 @@ from ..fp16_utils import master_params_to_model_params
from ..multi_tensor_apply import multi_tensor_applier
from ._amp_state import maybe_print
import torch
from ..optimizers import FusedAdam, FusedSGD
from ..optimizers import FusedSGD
class AmpOptimizerState(object):
......@@ -241,67 +241,8 @@ def post_backward_no_master_weights(self, scaler):
post_backward_models_are_masters(scaler, params, stashed_grads)
#####################################################################################
# FusedAdam versions
#####################################################################################
def prepare_backward_with_master_weights_FusedAdam(self):
stash = self._amp_stash
self._amp_lazy_init()
def post_backward_with_master_weights_FusedAdam(self, scaler):
stash = self._amp_stash
self._amp_lazy_init()
stash.scale = scaler.loss_scale()
stash.grads = [[param.grad.data for param in group] for group in stash.fp16_groups]
stash.output_params = [[param for param in group] for group in stash.fp16_groups]
norm_groups = []
skip = False
for grad_group in stash.grads:
norm, _ = multi_tensor_applier(
stash.multi_tensor_l2norm,
stash.dummy_overflow_buf,
[grad_group],
False)
# Still syncing here for now.
norm = float(norm)
norm_groups.append(norm)
if norm == float('inf') or norm == -float('inf') or norm != norm:
skip = True
if skip:
scaler._overflow_buf.fill_(1.)
scaler._has_overflow = True
stash.grad_norms = norm_groups
def prepare_backward_no_master_weights_FusedAdam(self):
stash = self._amp_stash
self._amp_lazy_init()
def post_backward_no_master_weights_FusedAdam(self, scaler):
stash = self._amp_stash
self._amp_lazy_init()
stash.scale = scaler.loss_scale()
stash.grads = None
stash.output_params = None
stash.grad_norms = None
#####################################################################################
# FusedSGD versions
# Eat this ugly code duplication for now. First make it work, then make it clean.
# It's difficult to anticipate what can be unified between the FusedAdam and FusedSGD
# implementations until I have them both working.
#####################################################################################
# FusedSGD never explicitly materializes the fp32 gradients for "fp32 from fp16" master params
......@@ -406,7 +347,7 @@ def _process_optimizer(optimizer, properties):
if closure is not None:
raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
retval = old_step()
if not (isinstance(self, FusedAdam) or isinstance(self, FusedSGD)):
if not isinstance(self, FusedSGD):
self._master_params_to_model_params()
# Clear the master grads that wouldn't be zeroed by model.zero_grad()
for param in self._amp_stash.all_fp32_from_fp16_params:
......@@ -432,12 +373,7 @@ def _process_optimizer(optimizer, properties):
param.grad = None
optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)
if isinstance(optimizer, FusedAdam):
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_with_master_weights_FusedAdam, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_with_master_weights_FusedAdam, optimizer)
elif isinstance(optimizer, FusedSGD):
if isinstance(optimizer, FusedSGD):
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_with_master_weights_FusedSGD, optimizer)
optimizer._post_amp_backward = types.MethodType(
......@@ -451,12 +387,7 @@ def _process_optimizer(optimizer, properties):
optimizer._lazy_init_maybe_master_weights = types.MethodType(
lazy_init_no_master_weights, optimizer)
if isinstance(optimizer, FusedAdam):
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_no_master_weights_FusedAdam, optimizer)
optimizer._post_amp_backward = types.MethodType(
post_backward_no_master_weights_FusedAdam, optimizer)
elif isinstance(optimizer, FusedSGD):
if isinstance(optimizer, FusedSGD):
optimizer._prepare_amp_backward = types.MethodType(
prepare_backward_no_master_weights_FusedSGD, optimizer)
optimizer._post_amp_backward = types.MethodType(
......
......@@ -113,7 +113,7 @@ def scale_loss(loss,
for optimizer in optimizers:
optimizer._amp_stash.params_have_scaled_gradients = True
else:
# FusedAdam and FusedSGD may take care of unscaling as part of their step() methods.
# FusedSGD may take care of unscaling as part of its step() method.
# if not isinstance(optimizers, FP16_Optimizer_for_fused):
loss_scaler.clear_overflow_state()
for optimizer in optimizers:
......
from .fused_sgd import FusedSGD
from .fused_adam import FusedAdam
from .fused_novograd import FusedNovoGrad
from .fp16_optimizer import FP16_Optimizer
from .sgd import SGD
from .adam import Adam
from .novograd import NovoGrad
import torch
from apex.multi_tensor_apply import multi_tensor_applier
from amp_C import multi_tensor_adam
class Adam(torch.optim.Optimizer):
"""Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
This version of fused Adam implements two fusions:
- Fusion of the operations within the Adam optimizer
- Applying the update to a list of tensors in a single multi-tensor kernel per group
This is a breaking change from the previous version: the API changes, and it no longer fuses grad norm computation and loss scaling.
It has been proposed in `Adam: A Method for Stochastic Optimization`_.
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False) NOT SUPPORTED in FusedAdam!
eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
adds eps to the bias-corrected second moment estimate before
evaluating square root instead of adding it to the square root of
second moment estimate as in the original paper. (default: False)
.. _Adam\: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params, lr=1e-3, bias_correction = True,
betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False,
weight_decay=0., amsgrad=False):
if amsgrad:
raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay)
super(Adam, self).__init__(params, defaults)
self.eps_mode = 0 if eps_inside_sqrt else 1
self.dummy_overflow_buf = torch.cuda.IntTensor([0])
def step(self, closure=None, grads=None, output_params=None, scale=None, grad_norms=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
if any(p is not None for p in [grads, output_params, scale, grad_norms]):
raise RuntimeError('Adam has been updated, please use with AMP for mixed precision. '
'For legacy code using fp16_optimizer, use FusedAdam.')
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
# Assume the same step across the group for now to simplify things;
# per-parameter step can easily be supported by making it a tensor, or by passing a list into the kernel.
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
# create lists for multi-tensor apply
p_list, g_list, m1_list, m2_list = [], [], [], []
for p in group['params']:
if p.grad is None:
continue
if p.grad.data.is_sparse:
raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
p_list.append(p.data)
g_list.append(p.grad.data)
m1_list.append(state['exp_avg'])
m2_list.append(state['exp_avg_sq'])
multi_tensor_applier(multi_tensor_adam,
self.dummy_overflow_buf,
[g_list, p_list, m1_list, m2_list],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
self.eps_mode,
bias_correction,
group['weight_decay'])
return loss
......@@ -35,8 +35,8 @@ class FP16_Optimizer(object):
dynamic_loss_args=None,
verbose=True):
print("\nfp16_optimizer is designed to work with apex.optimizers.Fused*, and will be removed in future")
print("To update, use updated optimizers without Fused prefix with AMP.")
print("\nfp16_optimizer is designed to only work with apex.optimizers, and will be removed in future")
print("To update, use updated optimizers with AMP.")
# The fused optimizer does all the work. We need this layer for two reasons:
# 1. maintain the same user API as apex.fp16_utils
# 2. keep common code here in case we need to add a new fused optimizer later
......
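For reference, the legacy wrapping pattern this shim preserves, reconstructed from the tests further down in this commit (a sketch; the model and loss scale are placeholders, and the deprecation message above says new code should move to AMP instead):

import torch
import apex

model = torch.nn.Linear(1024, 16).cuda().half()     # placeholder fp16 model
optimizer = apex.optimizers.FusedAdam(model.parameters())
optimizer = apex.optimizers.FP16_Optimizer(optimizer, static_loss_scale=128.0)

x = torch.randn(64, 1024, dtype=torch.float16, device='cuda')
loss = model(x).sum()
optimizer.backward(loss)    # fp16_utils-style API: the wrapper owns backward/unscaling
optimizer.step()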
import types
import torch
import importlib
from apex.multi_tensor_apply import multi_tensor_applier
from amp_C import multi_tensor_adam
class FusedAdam(torch.optim.Optimizer):
"""Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
This version of fused Adam implements two fusions:
- Fusion of the operations within the Adam optimizer
- Applying the update to a list of tensors in a single multi-tensor kernel per group
This is a breaking change from the previous version: the API changes, and it no longer fuses grad norm computation and loss scaling.
It has been proposed in `Adam: A Method for Stochastic Optimization`_.
Arguments:
......@@ -26,8 +30,6 @@ class FusedAdam(torch.optim.Optimizer):
adds eps to the bias-corrected second moment estimate before
evaluating square root instead of adding it to the square root of
second moment estimate as in the original paper. (default: False)
use_mt (boolean, optional): use multi tensor apply for lower launch
latency. (default: False)
.. _Adam\: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
......@@ -35,165 +37,75 @@ class FusedAdam(torch.optim.Optimizer):
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params,
lr=1e-3, bias_correction = True,
def __init__(self, params, lr=1e-3, bias_correction = True,
betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False,
weight_decay=0., max_grad_norm=0., amsgrad=False, use_mt=False,
amp_scale_adjustment=1.0):
print("\nFusedAdam will be removed in future. To update, use apex.optimizers.Adam with AMP.")
global fused_adam_cuda
fused_adam_cuda = importlib.import_module("fused_adam_cuda")
self._use_multi_tensor = False
if use_mt:
if not multi_tensor_applier.available:
print("Warning: multi_tensor_applier is unavailable")
else:
self._use_multi_tensor = True
self._overflow_buf = torch.cuda.IntTensor([0])
self._amp_scale_adjustment = amp_scale_adjustment
weight_decay=0., amsgrad=False):
if amsgrad:
raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
betas=betas, eps=eps, weight_decay=weight_decay)
super(FusedAdam, self).__init__(params, defaults)
self.eps_mode = 0 if eps_inside_sqrt else 1
self.dummy_overflow_buf = torch.cuda.IntTensor([0])
def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None):
def step(self, closure=None, grads=None, output_params=None, scale=None, grad_norms=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
grads (list of tensors, optional): weight gradient to use for the
optimizer update. If gradients have type torch.half, parameters
are expected to be in type torch.float. (default: None)
output params (list of tensors, optional): A reduced precision copy
of the updated weights written out in addition to the regular
updated weights. Have to be of same type as gradients. (default: None)
scale (float, optional): factor to divide gradient tensor values
by before applying to weights. (default: 1)
"""
if any(p is not None for p in [grads, output_params, scale, grad_norms]):
raise RuntimeError('FusedAdam has been updated, please use with AMP for mixed precision.')
loss = None
if closure is not None:
loss = closure()
if hasattr(self, "_amp_stash"):
grads = self._amp_stash.grads
output_params = self._amp_stash.output_params
scale = self._amp_stash.scale*self._amp_scale_adjustment
grad_norms = self._amp_stash.grad_norms
if grads is None:
grads_group = [None]*len(self.param_groups)
# backward compatibility
# assuming a list/generator of parameter means single group
elif isinstance(grads, types.GeneratorType):
grads_group = [grads]
elif type(grads[0])!=list:
grads_group = [grads]
else:
grads_group = grads
if output_params is None:
output_params_group = [None]*len(self.param_groups)
elif isinstance(output_params, types.GeneratorType):
output_params_group = [output_params]
elif type(output_params[0])!=list:
output_params_group = [output_params]
else:
output_params_group = output_params
if grad_norms is None:
grad_norms = [None]*len(self.param_groups)
for group, grads_this_group, output_params_this_group, grad_norm in zip(self.param_groups, grads_group, output_params_group, grad_norms):
if grads_this_group is None:
grads_this_group = [None]*len(group['params'])
if output_params_this_group is None:
output_params_this_group = [None]*len(group['params'])
# compute combined scale factor for this group
combined_scale = scale
if group['max_grad_norm'] > 0:
# norm is in fact norm*scale
clip = ((grad_norm / scale) + 1e-6) / group['max_grad_norm']
if clip > 1:
combined_scale = clip * scale
for group in self.param_groups:
bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
# Assume the same step across the group for now to simplify things;
# per-parameter step can easily be supported by making it a tensor, or by passing a list into the kernel.
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
if self._use_multi_tensor:
if output_params:
tensorlists = [[],[],[],[],[]]
else:
tensorlists = [[],[],[],[]]
# create lists for multi-tensor apply
p_list, g_list, m1_list, m2_list = [], [], [], []
for p, grad, output_param in zip(group['params'], grads_this_group, output_params_this_group):
#note: p.grad should not ever be set for correct operation of mixed precision optimizer that sometimes sends None gradients
if p.grad is None and grad is None:
for p in group['params']:
if p.grad is None:
continue
if grad is None:
grad = p.grad.data
if grad.is_sparse:
if p.grad.data.is_sparse:
raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
out_p = torch.tensor([], dtype = torch.float) if output_param is None else output_param
if self._use_multi_tensor:
pl = [p.data, exp_avg, exp_avg_sq, grad]
if output_param is not None:
pl.append(out_p)
for tl, t in zip(tensorlists, pl):
tl.append(t)
else:
fused_adam_cuda.adam(p.data,
out_p,
exp_avg,
exp_avg_sq,
grad,
group['lr'],
beta1,
beta2,
group['eps'],
combined_scale,
state['step'],
self.eps_mode,
bias_correction,
group['weight_decay'])
if self._use_multi_tensor:
multi_tensor_applier(
fused_adam_cuda.adam_mt,
self._overflow_buf,
tensorlists,
group['lr'],
beta1,
beta2,
group['eps'],
combined_scale,
state['step'],
self.eps_mode,
bias_correction,
group['weight_decay'])
p_list.append(p.data)
g_list.append(p.grad.data)
m1_list.append(state['exp_avg'])
m2_list.append(state['exp_avg_sq'])
multi_tensor_applier(multi_tensor_adam,
self.dummy_overflow_buf,
[g_list, p_list, m1_list, m2_list],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
self.eps_mode,
bias_correction,
group['weight_decay'])
return loss
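For orientation, a rough unfused, per-parameter equivalent of the update that the multi_tensor_adam kernel applies to whole tensor lists in one launch (a sketch assuming standard Adam semantics with bias correction enabled and eps added outside the square root, i.e. the defaults above; the helper name is illustrative, not part of the commit):

import torch

def adam_reference_step(p, g, m, v, lr, beta1, beta2, eps, step, weight_decay):
    # Unfused sketch of one parameter's update; the fused kernel does this for
    # every tensor in [g_list, p_list, m1_list, m2_list] in a single launch.
    if weight_decay != 0:
        g = g + weight_decay * p                      # L2 penalty folded into the gradient
    m.mul_(beta1).add_(g, alpha=1 - beta1)            # exp_avg
    v.mul_(beta2).addcmul_(g, g, value=1 - beta2)     # exp_avg_sq
    bias1 = 1 - beta1 ** step
    bias2 = 1 - beta2 ** step
    denom = (v / bias2).sqrt().add_(eps)              # eps outside the sqrt (eps_inside_sqrt=False)
    p.add_(m / bias1 / denom, alpha=-lr)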
import torch
from apex.multi_tensor_apply import multi_tensor_applier
class NovoGrad(torch.optim.Optimizer):
class FusedNovoGrad(torch.optim.Optimizer):
"""Implements NovoGrad algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
......@@ -47,19 +47,19 @@ class NovoGrad(torch.optim.Optimizer):
grad_averaging=True, norm_type=2, init_zero=False,
set_grad_none=True):
if amsgrad:
raise RuntimeError('NovoGrad does not support the AMSGrad variant.')
raise RuntimeError('FusedNovoGrad does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay,
grad_averaging=grad_averaging, norm_type=norm_type,
init_zero=init_zero)
super(NovoGrad, self).__init__(params, defaults)
super(FusedNovoGrad, self).__init__(params, defaults)
if multi_tensor_applier.available:
import amp_C
# Skip buffer
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
self.multi_tensor_novograd = amp_C.multi_tensor_novograd
else:
raise RuntimeError('apex.optimizers.NovoGrad requires cuda extensions')
raise RuntimeError('apex.optimizers.FusedNovoGrad requires cuda extensions')
self.moment_mode = 0 if reg_inside_moment else 1
self.set_grad_none = set_grad_none
......@@ -70,7 +70,7 @@ class NovoGrad(torch.optim.Optimizer):
for p in group['params']:
p.grad = None
else:
super(NovoGrad, self).zero_grad()
super(FusedNovoGrad, self).zero_grad()
def step(self, closure=None):
"""Performs a single optimization step.
......@@ -103,7 +103,7 @@ class NovoGrad(torch.optim.Optimizer):
if p.grad is None:
continue
if p.grad.data.is_sparse:
raise RuntimeError('NovoGrad does not support sparse gradients, please consider SparseAdam instead')
raise RuntimeError('FusedNovoGrad does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
......@@ -120,7 +120,7 @@ class NovoGrad(torch.optim.Optimizer):
p_32.append(p.data)
m_32.append(state['exp_avg'])
else:
raise RuntimeError('NovoGrad only support fp16 and fp32.')
raise RuntimeError('FusedNovoGrad only support fp16 and fp32.')
# We store the per-weight norm as one tensor per group/precision combination.
# Unlike optim.Adam, we store the norm here (not its square) so we can unify the calculation across norm types.
......@@ -137,7 +137,7 @@ class NovoGrad(torch.optim.Optimizer):
v_16 = [torch.sum(torch.pow(g, 2)).sqrt().item() for g in g_16]
v_32 = [torch.sum(torch.pow(g, 2)).sqrt().item() for g in g_32]
else:
raise RuntimeError('NovoGrad only support l2/inf norm now.')
raise RuntimeError('FusedNovoGrad only support l2/inf norm now.')
group['exp_avg_sq'][0] = torch.cuda.FloatTensor(v_16)
group['exp_avg_sq'][1] = torch.cuda.FloatTensor(v_32)
else:
......
......@@ -53,9 +53,6 @@ class FusedSGD(Optimizer):
weight_decay=0, nesterov=False,
wd_after_momentum=False,
materialize_master_grads=True):
print("\nFusedSGD will be removed in future. To update, use apex.optimizers.SGD with AMP.")
if lr is not required and lr < 0.0:
raise ValueError("Invalid learning rate: {}".format(lr))
if momentum < 0.0:
......
import torch
from apex.multi_tensor_apply import multi_tensor_applier
class SGD(torch.optim.Optimizer):
r"""Implements stochastic gradient descent (optionally with momentum).
Nesterov momentum is based on the formula from
`On the importance of initialization and momentum in deep learning`__.
Args:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float): learning rate
momentum (float, optional): momentum factor (default: 0)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
dampening (float, optional): dampening for momentum (default: 0)
nesterov (bool, optional): enables Nesterov momentum (default: False)
set_grad_none (bool, optional): whether to set grad to None when the zero_grad()
method is called. (default: True)
Example:
>>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
>>> optimizer.zero_grad()
>>> loss_fn(model(input), target).backward()
>>> optimizer.step()
__ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf
.. note::
The implementation of SGD with Momentum/Nesterov subtly differs from
Sutskever et. al. and implementations in some other frameworks.
Considering the specific case of Momentum, the update can be written as
.. math::
v = \rho * v + g \\
p = p - lr * v
where p, g, v and :math:`\rho` denote the parameters, gradient,
velocity, and momentum respectively.
This is in contrast to Sutskever et. al. and
other frameworks which employ an update of the form
.. math::
v = \rho * v + lr * g \\
p = p - v
The Nesterov version is analogously modified.
"""
def __init__(self, params, lr=0.1, momentum=0., dampening=0.,
weight_decay=0., nesterov=False, wd_after_momentum=False,
set_grad_none=True):
if lr < 0.0:
raise ValueError("Invalid learning rate: {}".format(lr))
if momentum < 0.0:
raise ValueError("Invalid momentum value: {}".format(momentum))
if weight_decay < 0.0:
raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
weight_decay=weight_decay, nesterov=nesterov)
self.wd_after_momentum = wd_after_momentum
self.set_grad_none = set_grad_none
if multi_tensor_applier.available:
import amp_C
# Skip buffer
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
self.multi_tensor_axpby = amp_C.multi_tensor_axpby
self.multi_tensor_sgd = amp_C.multi_tensor_sgd
else:
raise RuntimeError('apex.optimizers.SGD requires cuda extensions')
if nesterov and (momentum <= 0 or dampening != 0):
raise ValueError("Nesterov momentum requires a momentum and zero dampening")
super(SGD, self).__init__(params, defaults)
def __setstate__(self, state):
super(SGD, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('nesterov', False)
def zero_grad(self):
if self.set_grad_none:
for group in self.param_groups:
for p in group['params']:
p.grad = None
else:
super(SGD, self).zero_grad()
def get_momentums(self, params):
momentums = []
first_run = True
for p in params:
param_state = self.state[p]
# torch.optim.SGD initializes momentum in the main loop, we have
# to do it here, and track whether or not we've done so, so that
# momentum application can be skipped in the main kernel.
if 'momentum_buffer' not in param_state:
first_run = True
buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
momentums.append(buf)
else:
first_run = False
momentums.append(param_state['momentum_buffer'])
return momentums, first_run
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
weight_decay = group['weight_decay']
momentum = group['momentum']
dampening = group['dampening']
nesterov = group['nesterov']
for group_dtype in [torch.float16, torch.float32]:
grad_list = [p.grad for p in group['params'] if (p.dtype == group_dtype and p.grad is not None)]
if len(grad_list) == 0:
continue
param_list = [p for p in group['params'] if (p.dtype == group_dtype and p.grad is not None)]
if momentum != 0:
momentum_list, first_run = self.get_momentums(param_list)
multi_tensor_applier(
self.multi_tensor_sgd,
self._dummy_overflow_buf,
[grad_list, param_list, momentum_list],
weight_decay,
momentum,
dampening,
group['lr'],
nesterov,
first_run,
self.wd_after_momentum,
1.0)
else:
# Show how to implement SGD using axpby without writing a new multi_tensor kernel.
# Only enabled in the no-momentum case for now, since that saves creating momentum buffers.
# The momentum != 0 code below is kept for completeness.
if weight_decay != 0 and not self.wd_after_momentum:
multi_tensor_applier(
self.multi_tensor_axpby,
self._dummy_overflow_buf,
[grad_list, param_list, grad_list],
1.,
weight_decay,
2)
if momentum != 0: # always False
if not first_run:
multi_tensor_applier(
self.multi_tensor_axpby,
self._dummy_overflow_buf,
[momentum_list, grad_list, momentum_list],
momentum,
1.-dampening,
2)
if nesterov:
multi_tensor_applier(
self.multi_tensor_axpby,
self._dummy_overflow_buf,
[grad_list, momentum_list, grad_list],
1.,
momentum,
2)
else:
grad_list = momentum_list
if weight_decay != 0 and self.wd_after_momentum:
multi_tensor_applier(
self.multi_tensor_axpby,
self._dummy_overflow_buf,
[grad_list, param_list, grad_list],
1.,
weight_decay,
2)
multi_tensor_applier(
self.multi_tensor_axpby,
self._dummy_overflow_buf,
[param_list, grad_list, param_list],
1.,
-group['lr'],
2)
return loss
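For the no-momentum path above, the axpby chain reduces to the following plain per-tensor update (a sketch based on reading multi_tensor_axpby as out = a*x + b*y; the helper name is illustrative, not part of the commit):

import torch

def sgd_no_momentum_reference(params, grads, lr, weight_decay):
    # Plain-PyTorch reading of the axpby-based path when momentum == 0.
    for p, g in zip(params, grads):
        if weight_decay != 0:
            g = g + weight_decay * p    # axpby([grad, param, grad], 1., weight_decay)
        p.add_(g, alpha=-lr)            # axpby([param, grad, param], 1., -lr)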
import unittest
import torch
import apex
import os
class TestFP16Optimizer(unittest.TestCase):
def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
self.max_abs_diff = max_abs_diff
self.max_rel_diff = max_rel_diff
self.iters = iters
torch.cuda.manual_seed(13337)
N, D_in, D_out = 64, 1024, 16
self.N = N
self.D_in = D_in
self.D_out = D_out
self.x = torch.randn((N, D_in), dtype=torch.float16, device='cuda')
self.ref_model = torch.nn.Linear(D_in, D_out).cuda().half()
self.tst_model = torch.nn.Linear(D_in, D_out).cuda().half()
for p,q in zip(self.tst_model.parameters(), self.ref_model.parameters()):
p.data.copy_(q.data)
def get_max_diff(self, ref_param, tst_param):
max_abs_diff = max_rel_diff = 0
for p_ref, p_tst in zip(ref_param, tst_param):
max_abs_diff_p = (p_ref - p_tst).abs().max().item()
max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()
if max_abs_diff_p > max_abs_diff: max_abs_diff = max_abs_diff_p
if max_rel_diff_p > max_rel_diff: max_rel_diff = max_rel_diff_p
return max_abs_diff, max_rel_diff
def test_fp16_optimizer(self):
ref_optim = torch.optim.Adam(self.ref_model.parameters())
ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
for i in range(self.iters):
ref_loss = self.ref_model(self.x).sum()
ref_optim.backward(ref_loss)
ref_optim.step()
tst_loss = self.tst_model(self.x).sum()
tst_optim.backward(tst_loss)
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_loss_scaling(self):
ref_optim = torch.optim.Adam(self.ref_model.parameters())
ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, static_loss_scale=128.0, verbose=False)
tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
tst_optim = apex.optimizers.FP16_Optimizer(tst_optim, static_loss_scale=128.0)
for i in range(self.iters):
ref_loss = self.ref_model(self.x).sum()
ref_optim.backward(ref_loss)
ref_optim.step()
tst_loss = self.tst_model(self.x).sum()
tst_optim.backward(tst_loss)
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_parameter_groups(self):
ref_groups = [{'params': [self.ref_model.weight]},{'params': [self.ref_model.bias]}]
ref_optim = torch.optim.Adam(ref_groups)
ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
tst_groups = [{'params': [self.tst_model.weight]},{'params': [self.tst_model.bias]}]
tst_optim = apex.optimizers.FusedAdam(tst_groups)
tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
for i in range(self.iters):
ref_loss = self.ref_model(self.x).sum()
ref_optim.backward(ref_loss)
ref_optim.step()
tst_loss = self.tst_model(self.x).sum()
tst_optim.backward(tst_loss)
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_grad_clip(self):
ref_optim = torch.optim.Adam(self.ref_model.parameters())
ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters(), max_grad_norm=0.01)
tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
for i in range(self.iters):
ref_loss = self.ref_model(self.x).sum()
ref_optim.backward(ref_loss)
ref_optim.clip_master_grads(0.01)
ref_optim.step()
tst_loss = self.tst_model(self.x).sum()
tst_optim.backward(tst_loss)
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(), self.tst_model.parameters())
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
@unittest.skip('Not support grad being None')
def test_grad_None(self):
self.fail()
@unittest.skip('Not support same weight decay as pytorch')
def test_weight_decay(self):
self.fail()
@unittest.skip('Not support empty parameter groups')
def test_group_empty(self):
self.fail()
if __name__ == '__main__':
script_path = os.path.dirname(os.path.realpath(__file__))
unittest.main()
import unittest
import os
import random
import torch
import apex
class TestFusedAdam(unittest.TestCase):
def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
self.max_abs_diff = max_abs_diff
self.max_rel_diff = max_rel_diff
self.iters = iters
torch.cuda.manual_seed(9876)
def tearDown(self):
pass
def gen_param_optim(self, tensors, ref_adam_option, tst_adam_option=None):
ref_param = []
tst_param = []
for tensor in tensors:
ref_param.append(torch.nn.Parameter(tensor.clone()))
tst_param.append(torch.nn.Parameter(tensor.clone()))
ref_optim = torch.optim.Adam(ref_param, **ref_adam_option)
if tst_adam_option:
tst_optim = apex.optimizers.FusedAdam(tst_param, **tst_adam_option)
else:
tst_optim = apex.optimizers.FusedAdam(tst_param, **ref_adam_option)
return (ref_param, tst_param, ref_optim, tst_optim)
def gen_grad(self, ref_param, tst_param):
for p_ref, p_tst in zip(ref_param, tst_param):
p_ref.grad = torch.rand_like(p_ref)
p_tst.grad = p_ref.grad
def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
half_grads = []
for p_ref, p_tst in zip(ref_param, tst_param):
half_grads.append(torch.rand_like(p_ref).half())
p_ref.grad = half_grads[-1].float() / scale
return half_grads
def get_max_diff(self, ref_param, tst_param):
max_abs_diff = max_rel_diff = 0
for p_ref, p_tst in zip(ref_param, tst_param):
max_abs_diff_p = (p_ref - p_tst.type(p_ref.type())).abs().max().item()
max_rel_diff_p = ((p_ref - p_tst.type(p_ref.type())) / p_ref).abs().max().item()
if max_abs_diff_p > max_abs_diff: max_abs_diff = max_abs_diff_p
if max_rel_diff_p > max_rel_diff: max_rel_diff = max_rel_diff_p
return max_abs_diff, max_rel_diff
def gen_single_type_test(self, param_type=torch.float):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=param_type, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
for i in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_double(self):
self.gen_single_type_test(param_type=torch.double)
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
def test_half(self):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
for i in range(self.iters):
half_grads = self.gen_mixed_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step(grads=half_grads)
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensors = []
for size in sizes:
tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim(tensors, adam_option)
for i in range(self.iters):
half_grads = self.gen_mixed_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step(grads=half_grads)
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_scale(self):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
for i in range(self.iters):
scale = random.random() * 1000
half_grads = self.gen_mixed_grad(ref_param, tst_param, scale)
ref_optim.step()
tst_optim.step(grads=half_grads, scale=scale)
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_fp16_output(self):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
fp16_param = torch.nn.Parameter(tensor.clone().half())
for i in range(self.iters):
half_grads = self.gen_mixed_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step(grads=half_grads, output_params=[fp16_param])
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
max_abs_diff, max_rel_diff = self.get_max_diff(tst_param, \
[fp16_param.float()])
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_adam_option(self):
nelem = 1
adam_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
for i in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_multi_tensor(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
ref_adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tst_adam_option = dict(ref_adam_option, **{'use_mt':True})
tensors = []
fp16_params = []
for size in sizes:
tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
fp16_params.append(torch.nn.Parameter(tensors[-1].clone().half()))
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim(tensors, ref_adam_option, tst_adam_option)
for i in range(self.iters):
half_grads = self.gen_mixed_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step(grads=half_grads, output_params=fp16_params)
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
max_abs_diff, max_rel_diff = self.get_max_diff(tst_param, \
fp16_params)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
if __name__ == '__main__':
script_path = os.path.dirname(os.path.realpath(__file__))
unittest.main()
......@@ -23,7 +23,7 @@ class TestFusedAdam(unittest.TestCase):
tst_param.append(torch.nn.Parameter(tensor.clone()))
ref_optim = torch.optim.Adam(ref_param, **adam_option)
tst_optim = apex.optimizers.Adam(tst_param, **adam_option)
tst_optim = apex.optimizers.FusedAdam(tst_param, **adam_option)
return (ref_param, tst_param, ref_optim, tst_optim)
......
import unittest
import sys
test_dirs = ["run_amp", "run_fp16util", "run_mixed_adam", "run_fused_layer_norm"]
test_dirs = ["run_amp", "run_fp16util", "run_fused_layer_norm", "run_optimizers"]
runner = unittest.TextTestRunner(verbosity=2)
......