"tests/git@developer.sourcefind.cn:OpenDAS/mmcv.git" did not exist on "f59aec8ffb040fa91f5565df5cf7299c61817106"
Commit adad5996 authored by Deyu Fu

keep old fused* name and rename new optimizers without prefix

parent 4d6ed501
__init__.py
 from .fused_sgd import FusedSGD
-from .novograd import FusedNovoGrad
-from .fused_adam_v1 import FusedAdam_v1
-from .adam import FusedAdam
-#from .sgd import FusedSGD
+from .fused_adam import FusedAdam
 from .fp16_optimizer import FP16_Optimizer
+from .sgd import SGD
+from .adam import Adam
+from .novograd import NovoGrad
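For orientation, a minimal sketch of the namespace this leaves behind: both generations are exported side by side, the renamed classes for the AMP path and the Fused* names for existing fp16_optimizer code. The model and learning rate below are illustrative only, and a CUDA build of Apex (--cuda_ext --cpp_ext) is assumed.

import torch
import apex.optimizers as optim

model = torch.nn.Linear(16, 16).cuda()

# Renamed optimizer, intended to be driven by apex.amp:
adam = optim.Adam(model.parameters(), lr=1e-3)

# Legacy name kept for code that still goes through fp16_optimizer:
fused_adam = optim.FusedAdam(model.parameters(), lr=1e-3)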
adam.py
@@ -2,7 +2,7 @@ import torch
 from apex.multi_tensor_apply import multi_tensor_applier
 from amp_C import multi_tensor_adam
-class FusedAdam(torch.optim.Optimizer):
+class Adam(torch.optim.Optimizer):
     """Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
     ``python setup.py install --cuda_ext --cpp_ext``.
@@ -45,7 +45,7 @@ class FusedAdam(torch.optim.Optimizer):
             raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
         defaults = dict(lr=lr, bias_correction=bias_correction,
                         betas=betas, eps=eps, weight_decay=weight_decay)
-        super(FusedAdam, self).__init__(params, defaults)
+        super(Adam, self).__init__(params, defaults)
         self.eps_mode = 0 if eps_inside_sqrt else 1
         self.dummy_overflow_buf = torch.cuda.IntTensor([0])
@@ -57,8 +57,8 @@ class FusedAdam(torch.optim.Optimizer):
         and returns the loss.
         """
         if any(p is not None for p in [grads, output_params, scale, grad_norms]):
-            raise RuntimeError('FusedAdam has been updated, please use with AMP for mixed precision. '
-                               'For legacy code using fp16_optimizer, use FusedAdam_v1.')
+            raise RuntimeError('Adam has been updated, please use with AMP for mixed precision. '
+                               'For legacy code using fp16_optimizer, use FusedAdam.')
         loss = None
         if closure is not None:
             loss = closure()
......
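The new error message above points users at AMP instead of the old grads/output_params/scale arguments. A rough sketch of that intended flow, assuming a CUDA build of Apex with the amp module available (layer sizes, opt_level, and the dummy loss are illustrative):

import torch
import apex
from apex import amp

model = torch.nn.Linear(32, 32).cuda()
optimizer = apex.optimizers.Adam(model.parameters(), lr=1e-3)

# AMP owns casting and loss scaling; Adam.step() takes no extra arguments.
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

inputs = torch.randn(8, 32, device="cuda")
loss = model(inputs).float().pow(2).mean()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
optimizer.zero_grad()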
fp16_optimizer.py
@@ -35,7 +35,8 @@ class FP16_Optimizer(object):
                  dynamic_loss_args=None,
                  verbose=True):
-        print("\nfp16_optimizer will be removed in future. To update, use fused optimizers with AMP.")
+        print("\nfp16_optimizer is designed to work with apex.optimizers.Fused*, and will be removed in future")
+        print("To update, use updated optimizers without Fused prefix with AMP.")
         # The fused optimizer does all the work. We need this layer for two reason:
         # 1. maintain same user API from apex.fp16_utils
         # 2. keep common stuff here in case we need to add new fused optimizer later
......
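For comparison, the legacy path the deprecation message refers to: the wrapper advertises the same user API as apex.fp16_utils.FP16_Optimizer (see the comment in the hunk above), so a training step presumably looks roughly like this. The half-precision model, dummy loss, and static_loss_scale value are illustrative.

import torch
import apex

model = torch.nn.Linear(32, 32).cuda().half()
optimizer = apex.optimizers.FusedAdam(model.parameters(), lr=1e-3)
# Same user API as apex.fp16_utils.FP16_Optimizer, per the comment above.
optimizer = apex.optimizers.FP16_Optimizer(optimizer, static_loss_scale=128.0)

inputs = torch.randn(8, 32, device="cuda").half()
loss = model(inputs).float().pow(2).mean()
optimizer.backward(loss)   # replaces loss.backward(); applies the loss scale
optimizer.step()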
fused_adam.py (renamed from fused_adam_v1.py)
@@ -2,9 +2,9 @@ import types
 import torch
 import importlib
-from ..multi_tensor_apply import multi_tensor_applier
+from apex.multi_tensor_apply import multi_tensor_applier
-class FusedAdam_v1(torch.optim.Optimizer):
+class FusedAdam(torch.optim.Optimizer):
     """Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
     ``python setup.py install --cuda_ext --cpp_ext``.
@@ -40,6 +40,8 @@ class FusedAdam_v1(torch.optim.Optimizer):
                  betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt = False,
                  weight_decay=0., max_grad_norm=0., amsgrad=False, use_mt=False,
                  amp_scale_adjustment=1.0):
+        print("\nFusedAdam will be removed in future. To update, use apex.optimizers.Adam with AMP.")
         global fused_adam_cuda
         fused_adam_cuda = importlib.import_module("fused_adam_cuda")
@@ -58,7 +60,7 @@ class FusedAdam_v1(torch.optim.Optimizer):
         defaults = dict(lr=lr, bias_correction=bias_correction,
                         betas=betas, eps=eps, weight_decay=weight_decay,
                         max_grad_norm=max_grad_norm)
-        super(FusedAdam_v1, self).__init__(params, defaults)
+        super(FusedAdam, self).__init__(params, defaults)
         self.eps_mode = 0 if eps_inside_sqrt else 1
     def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None):
@@ -195,4 +197,3 @@ class FusedAdam_v1(torch.optim.Optimizer):
                                  group['weight_decay'])
         return loss
fused_sgd.py
@@ -53,6 +53,9 @@ class FusedSGD(Optimizer):
                  weight_decay=0, nesterov=False,
                  wd_after_momentum=False,
                  materialize_master_grads=True):
+        print("\nFusedSGD will be removed in future. To update, use apex.optimizers.SGD with AMP.")
         if lr is not required and lr < 0.0:
             raise ValueError("Invalid learning rate: {}".format(lr))
         if momentum < 0.0:
......
novograd.py
@@ -2,7 +2,7 @@ import torch
 from apex.multi_tensor_apply import multi_tensor_applier
 from amp_C import multi_tensor_novograd
-class FusedNovoGrad(torch.optim.Optimizer):
+class NovoGrad(torch.optim.Optimizer):
     """Implements NovoGrad algorithm. Currently GPU-only. Requires Apex to be installed via
     ``python setup.py install --cuda_ext --cpp_ext``.
@@ -48,12 +48,12 @@ class FusedNovoGrad(torch.optim.Optimizer):
                  grad_averaging=True, norm_type=2, init_zero=False,
                  set_grad_none=True):
         if amsgrad:
-            raise RuntimeError('FusedNovoGrad does not support the AMSGrad variant.')
+            raise RuntimeError('NovoGrad does not support the AMSGrad variant.')
         defaults = dict(lr=lr, bias_correction=bias_correction,
                         betas=betas, eps=eps, weight_decay=weight_decay,
                         grad_averaging=grad_averaging, norm_type=norm_type,
                         init_zero=init_zero)
-        super(FusedNovoGrad, self).__init__(params, defaults)
+        super(NovoGrad, self).__init__(params, defaults)
         self.moment_mode = 0 if reg_inside_moment else 1
         self.dummy_overflow_buf = torch.cuda.IntTensor([0])
         self.set_grad_none = set_grad_none
@@ -64,7 +64,7 @@ class FusedNovoGrad(torch.optim.Optimizer):
                 for p in group['params']:
                     p.grad = None
         else:
-            super(FusedNovoGrad, self).zero_grad()
+            super(NovoGrad, self).zero_grad()
     def step(self, closure=None):
         """Performs a single optimization step.
@@ -96,7 +96,7 @@ class FusedNovoGrad(torch.optim.Optimizer):
                 if p.grad is None:
                     continue
                 if p.grad.data.is_sparse:
-                    raise RuntimeError('FusedNovoGrad does not support sparse gradients, please consider SparseAdam instead')
+                    raise RuntimeError('NovoGrad does not support sparse gradients, please consider SparseAdam instead')
                 state = self.state[p]
                 # State initialization
@@ -119,7 +119,7 @@ class FusedNovoGrad(torch.optim.Optimizer):
                 elif group['norm_type'] == 2:
                     m2 = [torch.sum(torch.pow(g, 2)).sqrt().item() for g in g_list]
                 else:
-                    raise RuntimeError('FusedNovoGrad only support l2/inf norm now.')
+                    raise RuntimeError('NovoGrad only support l2/inf norm now.')
                 group['exp_avg_sq'] = torch.cuda.FloatTensor(m2)
             else:
                 assert(len(g_list) == group['exp_avg_sq'].numel())
......
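The constructor arguments visible in the hunks above can be read off directly; a hedged construction sketch follows (the lr value and layer size are illustrative, and the comments reflect a reading of the initialization path shown above):

import torch
import apex.optimizers as optim

model = torch.nn.Linear(16, 16).cuda()

novograd = optim.NovoGrad(
    model.parameters(),
    lr=1e-2,               # illustrative value
    grad_averaging=True,
    norm_type=2,           # per-layer L2 norms feed exp_avg_sq; inf is the other supported option
    init_zero=False,       # False: seed exp_avg_sq from the first gradient norms (see hunk above)
    set_grad_none=True,    # zero_grad() sets p.grad = None instead of zeroing
)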
sgd.py
@@ -4,7 +4,7 @@ from torch.optim import Optimizer
 from amp_C import multi_tensor_axpby
 from apex.multi_tensor_apply import multi_tensor_applier
-class FusedSGD(Optimizer):
+class SGD(Optimizer):
     r"""Implements stochastic gradient descent (optionally with momentum).
     Nesterov momentum is based on the formula from
     `On the importance of initialization and momentum in deep learning`__.
@@ -52,10 +52,10 @@ class FusedSGD(Optimizer):
                         weight_decay=weight_decay, nesterov=nesterov)
         if nesterov and (momentum <= 0 or dampening != 0):
             raise ValueError("Nesterov momentum requires a momentum and zero dampening")
-        super(FusedSGD, self).__init__(params, defaults)
+        super(SGD, self).__init__(params, defaults)
     def __setstate__(self, state):
-        super(FusedSGD, self).__setstate__(state)
+        super(SGD, self).__setstate__(state)
         for group in self.param_groups:
             group.setdefault('nesterov', False)
......
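And the renamed SGD, which keeps the usual torch.optim.SGD-style constraints (the Nesterov check is visible in the hunk above); the values below are illustrative:

import torch
import apex.optimizers as optim

model = torch.nn.Linear(16, 16).cuda()

# nesterov=True requires momentum > 0 and dampening == 0, as enforced above.
sgd = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, nesterov=True)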
tests (TestFP16Optimizer)
@@ -36,7 +36,7 @@ class TestFP16Optimizer(unittest.TestCase):
         ref_optim = torch.optim.Adam(self.ref_model.parameters())
         ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
-        tst_optim = apex.optimizers.FusedAdam_v1(self.tst_model.parameters())
+        tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
         tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
         for i in range(self.iters):
@@ -58,7 +58,7 @@ class TestFP16Optimizer(unittest.TestCase):
         ref_optim = torch.optim.Adam(self.ref_model.parameters())
         ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, static_loss_scale=128.0, verbose=False)
-        tst_optim = apex.optimizers.FusedAdam_v1(self.tst_model.parameters())
+        tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
         tst_optim = apex.optimizers.FP16_Optimizer(tst_optim, static_loss_scale=128.0)
         for i in range(self.iters):
@@ -81,7 +81,7 @@ class TestFP16Optimizer(unittest.TestCase):
         ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
         tst_groups = [{'params': [self.tst_model.weight]},{'params': [self.tst_model.bias]}]
-        tst_optim = apex.optimizers.FusedAdam_v1(tst_groups)
+        tst_optim = apex.optimizers.FusedAdam(tst_groups)
         tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
         for i in range(self.iters):
@@ -101,7 +101,7 @@ class TestFP16Optimizer(unittest.TestCase):
         ref_optim = torch.optim.Adam(self.ref_model.parameters())
         ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
-        tst_optim = apex.optimizers.FusedAdam_v1(self.tst_model.parameters(), max_grad_norm=0.01)
+        tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters(), max_grad_norm=0.01)
         tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
         for i in range(self.iters):
......
tests (TestFusedAdam, legacy optimizer)
@@ -24,10 +24,10 @@ class TestFusedAdam(unittest.TestCase):
         ref_optim = torch.optim.Adam(ref_param, **ref_adam_option)
         if tst_adam_option:
-            tst_optim = apex.optimizers.FusedAdam_v1(tst_param, **tst_adam_option)
+            tst_optim = apex.optimizers.FusedAdam(tst_param, **tst_adam_option)
         else:
-            tst_optim = apex.optimizers.FusedAdam_v1(tst_param, **ref_adam_option)
+            tst_optim = apex.optimizers.FusedAdam(tst_param, **ref_adam_option)
         return (ref_param, tst_param, ref_optim, tst_optim)
     def gen_grad(self, ref_param, tst_param):
......
tests (TestFusedAdam, renamed optimizer)
@@ -23,7 +23,7 @@ class TestFusedAdam(unittest.TestCase):
         tst_param.append(torch.nn.Parameter(tensor.clone()))
         ref_optim = torch.optim.Adam(ref_param, **adam_option)
-        tst_optim = apex.optimizers.FusedAdam(tst_param, **adam_option)
+        tst_optim = apex.optimizers.Adam(tst_param, **adam_option)
         return (ref_param, tst_param, ref_optim, tst_optim)
......