import torch

from apex.multi_tensor_apply import multi_tensor_applier
from amp_C import multi_tensor_adam


class FusedAdam(torch.optim.Optimizer):
    """Implements the Adam algorithm. Currently GPU-only.

    Requires Apex to be installed via
    ``python setup.py install --cuda_ext --cpp_ext``.

    This version of fused Adam implements two fusions:

    - fusion of the elementwise operations within the Adam update;
    - a multi-tensor apply launch that batches the updates for all
      parameters in a group into a single kernel.

    This is a breaking change from the previous version: the API has changed,
    and gradient norm computation and loss scaling are no longer fused into
    the optimizer step.

    It has been proposed in `Adam: A Method for Stochastic Optimization`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of the gradient and its square.
            (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_.
            NOT SUPPORTED in FusedAdam! (default: False)
        eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
            adds eps to the bias-corrected second moment estimate before
            evaluating the square root instead of adding it to the square root
            of the second moment estimate, as in the original paper.
            (default: False)

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(self, params, lr=1e-3, bias_correction=True,
                 betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False,
                 weight_decay=0., amsgrad=False):
        if amsgrad:
            raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
        defaults = dict(lr=lr, bias_correction=bias_correction, betas=betas,
                        eps=eps, weight_decay=weight_decay)
        super(FusedAdam, self).__init__(params, defaults)
        self.eps_mode = 0 if eps_inside_sqrt else 1
        self.dummy_overflow_buf = torch.cuda.IntTensor([0])

    def step(self, closure=None, grads=None, output_params=None,
             scale=None, grad_norms=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        if any(p is not None for p in [grads, output_params, scale, grad_norms]):
            raise RuntimeError('FusedAdam has been updated, please use with AMP for mixed precision.')
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            bias_correction = 1 if group['bias_correction'] else 0
            beta1, beta2 = group['betas']

            # Assume the same step count across the group for now to simplify things.
            # A per-parameter step could easily be supported by making it a tensor,
            # or by passing a list into the kernel.
            if 'step' in group:
                group['step'] += 1
            else:
                group['step'] = 1

            # Create lists for multi-tensor apply.
            p_list, g_list, m1_list, m2_list = [], [], [], []

            for p in group['params']:
                if p.grad is None:
                    continue
                if p.grad.data.is_sparse:
                    raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]
                # State initialization
                if len(state) == 0:
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                p_list.append(p.data)
                g_list.append(p.grad.data)
                m1_list.append(state['exp_avg'])
                m2_list.append(state['exp_avg_sq'])

            multi_tensor_applier(multi_tensor_adam,
                                 self.dummy_overflow_buf,
                                 [g_list, p_list, m1_list, m2_list],
                                 group['lr'],
                                 beta1,
                                 beta2,
                                 group['eps'],
                                 group['step'],
                                 self.eps_mode,
                                 bias_correction,
                                 group['weight_decay'])

        return loss