Commit 690b1f71 authored by Deyu Fu

initial commit to make fused optimizers compatible with AMP

parent 37795aac
-from .fused_adam import FusedAdam
+from .sgd import FusedSGD
+from .novograd import FusedNovoGrad
+from .fused_adam_v1 import FusedAdam_v1
+from .adam import FusedAdam
from .fp16_optimizer import FP16_Optimizer
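
After this commit, the AMP-compatible rewrite keeps the FusedAdam name while the legacy implementation moves to FusedAdam_v1. A minimal import sketch of the two paths (hypothetical usage, not part of the diff):

from apex.optimizers import FusedAdam      # new multi-tensor implementation, meant for AMP
from apex.optimizers import FusedAdam_v1   # legacy implementation, for FP16_Optimizer code paths
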
import torch
from apex.multi_tensor_apply import multi_tensor_applier
from amp_C import multi_tensor_adam
class FusedAdam(torch.optim.Optimizer):
"""Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
This version of fused Adam implements two fusions:
- fusion of the elementwise operations inside the Adam update
- application of the update to a group's parameter list via a single multi-tensor kernel launch
It is a breaking change from the previous version: the API has changed, and it no longer fuses gradient norm computation or loss scaling.
It has been proposed in `Adam: A Method for Stochastic Optimization`_.
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False) NOT SUPPORTED in FusedAdam!
eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
adds eps to the bias-corrected second moment estimate before
evaluating the square root, instead of adding it to the square root of
the second moment estimate as in the original paper. (default: False)
.. _Adam\: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params, lr=1e-3, bias_correction=True,
betas=(0.9, 0.999), eps=1e-8, eps_inside_sqrt=False,
weight_decay=0., amsgrad=False):
if amsgrad:
raise RuntimeError('FusedAdam does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay)
super(FusedAdam, self).__init__(params, defaults)
self.eps_mode = 0 if eps_inside_sqrt else 1
self.dummy_overflow_buf = torch.cuda.IntTensor([0])
def step(self, closure=None, grads=None, output_params=None, scale=None, grad_norms=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
if any(p is not None for p in [grads, output_params, scale, grad_norms]):
raise RuntimeError('FusedAdam has been updated; use it with AMP for mixed precision. '
'For legacy code using fp16_optimizer, use FusedAdam_v1.')
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
# assume the same step across the group for now to simplify things
# a per-parameter step could easily be supported by making it a tensor, or by passing a list into the kernel
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
# create lists for multi-tensor apply
p_list, g_list, m1_list, m2_list = [], [], [], []
for p in group['params']:
if p.grad is None:
continue
if p.grad.data.is_sparse:
raise RuntimeError('FusedAdam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
p_list.append(p.data)
g_list.append(p.grad.data)
m1_list.append(state['exp_avg'])
m2_list.append(state['exp_avg_sq'])
multi_tensor_applier(multi_tensor_adam,
self.dummy_overflow_buf,
[g_list, p_list, m1_list, m2_list],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
self.eps_mode,
bias_correction,
group['weight_decay'])
return loss
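
For context, here is a minimal sketch of driving the rewritten FusedAdam through AMP, as the error message in step() suggests; the model, data, and opt_level are illustrative assumptions, not part of this commit:

import torch
import apex
from apex import amp

model = torch.nn.Linear(1024, 1024).cuda()
optimizer = apex.optimizers.FusedAdam(model.parameters(), lr=1e-3)
# AMP owns loss scaling now, so step() takes no grads/scale arguments.
model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

data = torch.randn(8, 1024, device='cuda')
loss = model(data).sum()
optimizer.zero_grad()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()
optimizer.step()
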
@@ -35,6 +35,7 @@ class FP16_Optimizer(object):
dynamic_loss_args=None,
verbose=True):
print("\nfp16_optimizer will be removed in future. To update, use fused optimizers with AMP.")
# The fused optimizer does all the work. We need this layer for two reason:
# 1. maintain same user API from apex.fp16_utils
# 2. keep common stuff here in case we need to add new fused optimizer later
...
@@ -2,7 +2,7 @@ import types
import torch
import importlib
-class FusedAdam(torch.optim.Optimizer):
+class FusedAdam_v1(torch.optim.Optimizer):
"""Implements Adam algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
@@ -44,7 +44,7 @@ class FusedAdam(torch.optim.Optimizer):
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
-super(FusedAdam, self).__init__(params, defaults)
+super(FusedAdam_v1, self).__init__(params, defaults)
self.eps_mode = 0 if eps_inside_sqrt else 1
def step(self, closure=None, grads=None, output_params=None, scale=1., grad_norms=None):
...
import torch
from apex.multi_tensor_apply import multi_tensor_applier
from amp_C import multi_tensor_novograd
class FusedNovoGrad(torch.optim.Optimizer):
"""Implements NovoGrad algorithm. Currently GPU-only. Requires Apex to be installed via
``python setup.py install --cuda_ext --cpp_ext``.
It has been proposed in `Jasper: An End-to-End Convolutional Neural Acoustic Model`_.
More info: https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html#novograd
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its norm. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
NOT SUPPORTED now! (default: False)
reg_inside_moment (bool, optional): whether to apply regularization (norm
division and L2 weight decay) inside the momentum calculation. If False,
regularization is applied only to the update term. (default: False)
grad_averaging (bool, optional): whether to apply (1-beta1) to grad when
calculating running averages of the gradient. (default: True)
norm_type (int, optional): which norm to compute for each layer.
2 for the L2 norm, 0 for the infinity norm; these are the only
supported types for now. (default: 2)
init_zero (bool, optional): whether to initialize the norm with 0 (start
averaging on the 1st step) or with the first step's norm (start averaging
on the 2nd step). True means initialize with 0. (default: False)
set_grad_none (bool, optional): whether to set grads to None when the
zero_grad() method is called. (default: True)
.. _Jasper\: An End-to-End Convolutional Neural Acoustic Model:
https://arxiv.org/abs/1904.03288
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params, lr=1e-3, bias_correction=True,
betas=(0.9, 0.999), eps=1e-8, weight_decay=0.,
amsgrad=False, reg_inside_moment=False,
grad_averaging=True, norm_type=2, init_zero=False,
set_grad_none=True):
if amsgrad:
raise RuntimeError('FusedNovoGrad does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay,
grad_averaging=grad_averaging, norm_type=norm_type,
init_zero=init_zero)
super(FusedNovoGrad, self).__init__(params, defaults)
self.moment_mode = 0 if reg_inside_moment else 1
self.dummy_overflow_buf = torch.cuda.IntTensor([0])
self.set_grad_none = set_grad_none
def zero_grad(self):
if self.set_grad_none:
for group in self.param_groups:
for p in group['params']:
p.grad = None
else:
super(FusedNovoGrad, self).zero_grad()
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
grad_averaging = 1 if group['grad_averaging'] else 0
# assume the same step across the group for now to simplify things
# a per-parameter step could easily be supported by making it a tensor, or by passing a list into the kernel
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
# create lists for multi-tensor apply
p_list, g_list, m1_list = [], [], []
for p in group['params']:
if p.grad is None:
continue
if p.grad.data.is_sparse:
raise RuntimeError('FusedNovoGrad does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
p_list.append(p.data)
g_list.append(p.grad.data)
m1_list.append(state['exp_avg'])
# we store the per-weight norms as one tensor per group
# different from optim.Adam, we store the norm here (not norm^2) so we can unify the two norm types
if 'exp_avg_sq' not in group:
if group['init_zero']:
group['exp_avg_sq'] = torch.cuda.FloatTensor(len(g_list)).contiguous().fill_(0)
else: # init with the first step's norm, so the first blend has no effect
if group['norm_type'] == 0:
m2 = [torch.max(torch.abs(g)).item() for g in g_list]
elif group['norm_type'] == 2:
m2 = [torch.sum(torch.pow(g, 2)).sqrt().item() for g in g_list]
else:
raise RuntimeError('FusedNovoGrad only supports the L2 and infinity norms for now.')
group['exp_avg_sq'] = torch.cuda.FloatTensor(m2)
else:
assert(len(g_list) == group['exp_avg_sq'].numel())
multi_tensor_applier(multi_tensor_novograd,
self.dummy_overflow_buf,
[g_list, p_list, m1_list],
group['exp_avg_sq'],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
bias_correction,
group['weight_decay'],
grad_averaging,
self.moment_mode,
group['norm_type'])
return loss
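
As a readability aid, a hedged, unfused reference of one NovoGrad step for a single tensor, matching moment mode 1 (regularization on the update term) with grad averaging on; bias correction (folded into step_size in the fused path) is omitted, and the names are illustrative rather than apex API:

import math
import torch

def novograd_step_reference(p, g, m, gn, lr, beta1, beta2, eps, wd, first_step):
    n = g.norm(2).item()                       # per-tensor L2 norm
    # blend with the running norm: gn = sqrt(beta2*gn^2 + (1-beta2)*n^2);
    # on the first step gn is initialized to n, so the blend has no effect
    gn = n if first_step else math.sqrt(beta2 * gn * gn + (1.0 - beta2) * n * n)
    m.mul_(beta1).add_(g, alpha=1.0 - beta1)   # first moment with grad averaging
    update = m / (gn + eps) + wd * p           # regularize the update term only
    p.add_(update, alpha=-lr)
    return gn
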
import torch
from torch.optim import Optimizer
from amp_C import multi_tensor_axpby
from apex.multi_tensor_apply import multi_tensor_applier
class FusedSGD(Optimizer):
r"""Implements stochastic gradient descent (optionally with momentum).
Nesterov momentum is based on the formula from
`On the importance of initialization and momentum in deep learning`__.
Args:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float): learning rate
momentum (float, optional): momentum factor (default: 0)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
dampening (float, optional): dampening for momentum (default: 0)
nesterov (bool, optional): enables Nesterov momentum (default: False)
Example:
>>> optimizer = apex.optimizers.FusedSGD(model.parameters(), lr=0.1, momentum=0.9)
>>> optimizer.zero_grad()
>>> loss_fn(model(input), target).backward()
>>> optimizer.step()
__ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf
.. note::
The implementation of SGD with Momentum/Nesterov subtly differs from
Sutskever et al. and implementations in some other frameworks.
Considering the specific case of Momentum, the update can be written as
.. math::
v = \rho * v + g \\
p = p - lr * v
where p, g, v and :math:`\rho` denote the parameters, gradient,
velocity, and momentum respectively.
This is in contrast to Sutskever et al. and
other frameworks which employ an update of the form
.. math::
v = \rho * v + lr * g \\
p = p - v
The Nesterov version is analogously modified.
"""
def __init__(self, params, lr=0.1, momentum=0., dampening=0.,
weight_decay=0., nesterov=False):
if lr < 0.0:
raise ValueError("Invalid learning rate: {}".format(lr))
if momentum < 0.0:
raise ValueError("Invalid momentum value: {}".format(momentum))
if weight_decay < 0.0:
raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
weight_decay=weight_decay, nesterov=nesterov)
if nesterov and (momentum <= 0 or dampening != 0):
raise ValueError("Nesterov momentum requires a momentum and zero dampening")
super(FusedSGD, self).__init__(params, defaults)
def __setstate__(self, state):
super(FusedSGD, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('nesterov', False)
def zero_grad(self):
for group in self.param_groups:
for p in group['params']:
if p.grad is not None:
p.grad.zero_()
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
weight_decay = group['weight_decay']
momentum = group['momentum']
dampening = group['dampening']
nesterov = group['nesterov']
param_list, grad_list, momentum_list = [], [], []
for p in group['params']:
if p.grad is None:
continue
# create lists for multi tensor apply
param_list.append(p.data)
grad_list.append(p.grad.data)
if momentum != 0:
param_state = self.state[p]
if 'momentum_buffer' not in param_state:
buf = param_state['momentum_buffer'] = torch.clone(p.grad.data).detach()
group['init'] = True
else:
buf = param_state['momentum_buffer']
group['init'] = False
momentum_list.append(buf)
if weight_decay != 0:
multi_tensor_applier(
multi_tensor_axpby,
torch.cuda.IntTensor([0]),  # dummy overflow buf
[grad_list, param_list, grad_list],
1.,
weight_decay,
2)
if momentum != 0:
if not group['init']:
multi_tensor_applier(
multi_tensor_axpby,
torch.cuda.IntTensor([0]),  # dummy overflow buf
[momentum_list, grad_list, momentum_list],
momentum,
1.-dampening,
2)
if nesterov:
multi_tensor_applier(
multi_tensor_axpby,
torch.cuda.IntTensor([0]),  # dummy overflow buf
[grad_list, momentum_list, grad_list],
1.,
momentum,
2)
else:
grad_list = momentum_list
multi_tensor_applier(
multi_tensor_axpby,
torch.cuda.IntTensor([0]),  # dummy overflow buf
[param_list, grad_list, param_list],
1.,
-group['lr'],
2)
return loss
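
The step above expresses SGD as up to four axpby launches. A hedged single-tensor reference of what each launch computes (first_step mirrors group['init']; names are illustrative):

import torch

def fused_sgd_reference(p, g, buf, lr, momentum, dampening, nesterov,
                        weight_decay, first_step):
    if weight_decay != 0:
        g.add_(p, alpha=weight_decay)                          # axpby 1: g = g + wd*p
    if momentum != 0:
        if not first_step:                                     # buf was cloned from g on step 1
            buf.mul_(momentum).add_(g, alpha=1.0 - dampening)  # axpby 2
        if nesterov:
            g.add_(buf, alpha=momentum)                        # axpby 3: g = g + momentum*buf
        else:
            g = buf
    p.add_(g, alpha=-lr)                                       # final axpby: p = p - lr*g
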
@@ -40,6 +40,35 @@ void multi_tensor_lamb_stage2_cuda(
at::Tensor per_tensor_update_norm,
const float step_size);
void multi_tensor_adam_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int eps_mode,
const int bias_correction,
const float weight_decay);
void multi_tensor_novograd_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::Tensor grad_norms,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int bias_correction,
const float weight_decay,
const int grad_averaging,
const int moment_mode,
const int norm_type);
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("multi_tensor_scale", &multi_tensor_scale_cuda,
"Fused overflow check + scale for a list of contiguous tensors");
@@ -51,4 +80,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
"Computes update part of LAMB optimizer");
m.def("multi_tensor_lamb_stage2_cuda", &multi_tensor_lamb_stage2_cuda,
"Completes application of gradient to parameters for LAMB optimizer");
m.def("multi_tensor_adam", &multi_tensor_adam_cuda,
"Compute and apply gradient update to parameters for Adam optimizer");
m.def("multi_tensor_novograd", &multi_tensor_novograd_cuda,
"Compute and apply gradient update to parameters for Adam optimizer");
}
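
A hedged sketch of exercising the new multi_tensor_adam binding directly through the multi-tensor harness; the argument order follows the declaration above, the tensor sizes and hyperparameters are illustrative, and this assumes apex was built with --cuda_ext:

import torch
import amp_C
from apex.multi_tensor_apply import multi_tensor_applier

overflow_buf = torch.cuda.IntTensor([0])
p = torch.randn(1024, device='cuda')
g = torch.randn_like(p)
m = torch.zeros_like(p)   # exp_avg
v = torch.zeros_like(p)   # exp_avg_sq
# tensor lists follow the kernel's address slots: [g, p, m, v]
multi_tensor_applier(amp_C.multi_tensor_adam, overflow_buf,
                     [[g], [p], [m], [v]],
                     1e-3,   # lr
                     0.9,    # beta1
                     0.999,  # beta2
                     1e-8,   # epsilon
                     1,      # step
                     1,      # eps_mode: eps outside sqrt
                     1,      # bias_correction
                     0.0)    # weight_decay
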
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
typedef enum{
ADAM_MODE_0 =0, // eps under square root
ADAM_MODE_1 =1 // eps outside square root
} adamMode_t;
template<typename T>
struct AdamFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<4>& tl,
const float beta1,
const float beta2,
const float eps,
const float step_size,
adamMode_t mode,
const float decay)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
// potentially use to pass in list of scalar
// int tensor_num = tl.start_tensor_this_launch + tensor_loc;
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
T* g = (T*)tl.addresses[0][tensor_loc];
g += chunk_idx*chunk_size;
T* p = (T*)tl.addresses[1][tensor_loc];
p += chunk_idx*chunk_size;
T* m = (T*)tl.addresses[2][tensor_loc];
m += chunk_idx*chunk_size;
T* v = (T*)tl.addresses[3][tensor_loc];
v += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
// see note in multi_tensor_scale_kernel.cu
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
{
T r_g[ILP];
T r_p[ILP];
T r_m[ILP];
T r_v[ILP];
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
r_g[ii] = g[i];
r_p[ii] = p[i];
r_m[ii] = m[i];
r_v[ii] = v[i];
} else {
r_g[ii] = T(0);
r_p[ii] = T(0);
r_m[ii] = T(0);
r_v[ii] = T(0);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
r_m[ii] = beta1 * r_m[ii] + (1-beta1) * r_g[ii];
r_v[ii] = beta2 * r_v[ii] + (1-beta2) * r_g[ii] * r_g[ii];
T denom;
if (mode == ADAM_MODE_0)
denom = sqrtf(r_v[ii] + eps);
else // Mode 1
denom = sqrtf(r_v[ii]) + eps;
T update = (r_m[ii] / denom) + (decay * r_p[ii]);
r_p[ii] = r_p[ii] - (step_size * update);
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
p[i] = r_p[ii];
m[i] = r_m[ii];
v[i] = r_v[ii];
}
}
}
}
};
void multi_tensor_adam_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int eps_mode,
const int bias_correction,
const float weight_decay)
{
using namespace at;
float step_size = 0;
if (bias_correction == 1) {
const float bias_correction1 = 1 - std::pow(beta1, step);
const float bias_correction2 = 1 - std::pow(beta2, step);
step_size = lr * std::sqrt(bias_correction2)/bias_correction1;
}
else {
step_size = lr;
}
// Assume single type across p,g,m1,m2 now
DISPATCH_DOUBLE_FLOAT_AND_HALF(
tensor_lists[0][0].scalar_type(), 0, "adam",
multi_tensor_apply<4>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
AdamFunctor<scalar_t_0>(),
beta1,
beta2,
epsilon,
step_size,
(adamMode_t) eps_mode,
weight_decay); )
AT_CUDA_CHECK(cudaGetLastError());
}
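
A quick numeric check of the step_size folding used above: with eps = 0, scaling the raw m/sqrt(v) update by lr*sqrt(1-beta2^t)/(1-beta1^t) is algebraically identical to the textbook bias-corrected update (values below are arbitrary):

import math

lr, beta1, beta2, t = 1e-3, 0.9, 0.999, 10
m, v = 0.05, 0.002
textbook = lr * (m / (1 - beta1**t)) / math.sqrt(v / (1 - beta2**t))
fused = (lr * math.sqrt(1 - beta2**t) / (1 - beta1**t)) * (m / math.sqrt(v))
assert abs(textbook - fused) < 1e-12
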
@@ -75,6 +75,69 @@ struct L2NormFunctor
}
};
// Probably better to template, but we are not likely to support other norms, so keep it simple for now
template<typename x_t>
struct MaxNormFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<1>& tl,
float* output,
float* output_per_tensor,
bool per_tensor,
int max_chunks_per_tensor)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
x_t* x = (x_t*)tl.addresses[0][tensor_loc];
x += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
__shared__ float s_vals[512];
float vals[ILP]; // = {0}; // this probably works too but I want to be sure...
for(int i = 0; i < ILP; i++)
vals[i] = 0.f;
for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
{
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
float next = static_cast<float>(x[i]);
vals[ii] = fmaxf(fabsf(vals[ii]), fabsf(next));
}
}
}
float val = 0.f;
for(int i = 0; i < ILP; i++)
val = fmaxf(fabsf(val), fabsf(vals[i]));
float final = reduce_block_into_lanes_max_op(s_vals, val);
if(threadIdx.x == 0)
{
if(!isfinite(final))
*noop_gmem = 1; // Blindly fire off a write. These will race but that's ok.
output[blockIdx.x] = fmaxf(fabsf(output[blockIdx.x]), fabsf(final));
if(per_tensor)
output_per_tensor[(tl.start_tensor_this_launch + tensor_loc)*max_chunks_per_tensor + chunk_idx] = final;
}
}
};
__global__ void cleanup(
float* output,
@@ -113,6 +176,63 @@ __global__ void cleanup(
}
}
__global__ void cleanup_v2(
float* output,
float* output_per_tensor,
float* ret,
float* ret_per_tensor,
bool per_tensor,
int max_chunks_per_tensor,
int norm_type,
float alpha,
float beta)
{
__shared__ float vals[512];
if(blockIdx.x == 0)
{
float val = 0;
if(threadIdx.x < 320)
val = output[threadIdx.x];
if (norm_type == 0) {
float final = reduce_block_into_lanes_max_op(vals, val);
if(threadIdx.x == 0)
*ret = alpha * (*ret) + beta * final;
}
else {
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
*ret = sqrt(alpha * (*ret) * (*ret) + beta * final);
}
}
if(per_tensor)
{
float* output_this_tensor = output_per_tensor + blockIdx.x*max_chunks_per_tensor;
if (norm_type == 0) {
float val = 0;
for(int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x)
val = fmaxf(fabsf(val), fabsf(output_this_tensor[i]));
float final = reduce_block_into_lanes_max_op(vals, val);
if(threadIdx.x == 0)
ret_per_tensor[blockIdx.x] = alpha * ret_per_tensor[blockIdx.x] + beta * final;
}
else {
float val = 0;
for(int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x)
val += output_this_tensor[i];
float final = reduce_block_into_lanes(vals, val);
if(threadIdx.x == 0)
ret_per_tensor[blockIdx.x] = sqrt(alpha * ret_per_tensor[blockIdx.x] * ret_per_tensor[blockIdx.x] + beta * final);
}
}
}
std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
int chunk_size,
@@ -178,3 +298,90 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
return std::tuple<at::Tensor, at::Tensor>(ret, ret_per_tensor);
}
// Compute and update grad norm
// Here we use a per-tensor norm, and blend the new norm (n) and the old norm (gn) by
// L-2: gn = sqrt(a * gn^2 + b * n^2)
// L-inf: gn = a * gn + b * n
void multi_tensor_norm_out_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::Tensor out,
const float alpha,
const float beta,
const int norm_type)
{
auto float_options = tensor_lists[0][0].options().dtype(at::kFloat);
// we don't need the global norm, thus empty is used here
auto output = at::empty({320}, float_options);
at::Tensor output_per_tensor;
at::Tensor ret_per_tensor;
int ntensors = tensor_lists[0].size();
int max_chunks_per_tensor = -1;
for(int t = 0; t < ntensors; t++)
{
int max_chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
if(max_chunks_this_tensor > max_chunks_per_tensor)
max_chunks_per_tensor = max_chunks_this_tensor;
}
// Although each element is written once then read, this still needs to be zeroed,
// since trailing elements also participate in cleanup
output_per_tensor = at::zeros({ntensors*max_chunks_per_tensor}, float_options);
if (norm_type == 0) {
DISPATCH_FLOAT_AND_HALF(
tensor_lists[0][0].scalar_type(), 0, "multi_tensor_maxnorm_cuda",
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
MaxNormFunctor<scalar_t_0>(),
output.data<float>(),
output_per_tensor.data<float>(),
true,
max_chunks_per_tensor);)
}
else {
DISPATCH_FLOAT_AND_HALF(
tensor_lists[0][0].scalar_type(), 0, "multi_tensor_l2norm_cuda",
multi_tensor_apply<1>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
L2NormFunctor<scalar_t_0>(),
output.data<float>(),
output_per_tensor.data<float>(),
true,
max_chunks_per_tensor);)
}
AT_CUDA_CHECK(cudaGetLastError());
// AT_CUDA_CHECK(cudaDeviceSynchronize());
// This involves one more small kernel launch, but it will be negligible end to end.
// I could get rid of it by hacking the functor + multi-tensor harness with persistence
// logic, but keeping it simple for now
auto ret = at::empty({1}, output.options());
auto stream = at::cuda::getCurrentCUDAStream();
cleanup_v2<<<ntensors, 512, 0, stream>>>(
output.data<float>(),
output_per_tensor.data<float>(),
ret.data<float>(),
out.data<float>(),
true,
max_chunks_per_tensor,
norm_type,
alpha,
beta);
return ;
}
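
For reference, the per-tensor blend cleanup_v2 applies, restated as a hedged host-side helper (alpha/beta are the a/b from the comment above):

import math

def blend_norm(gn, n, alpha, beta, norm_type):
    # L-inf (norm_type 0): gn = a*gn + b*n
    # L-2   (norm_type 2): gn = sqrt(a*gn^2 + b*n^2)
    if norm_type == 0:
        return alpha * gn + beta * n
    return math.sqrt(alpha * gn * gn + beta * n * n)
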
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
// Another possibility:
// #include <torch/all.h>
#include <assert.h>
#include "type_shim.h"
#include "multi_tensor_apply.cuh"
#define BLOCK_SIZE 512
#define ILP 4
typedef enum{
MOMENT_MODE_0 =0, // Momentum with denom/decay, optional grad averaging after
MOMENT_MODE_1 =1 // Momentum without denom/decay
} momentMode_t;
void multi_tensor_norm_out_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::Tensor out,
const float alpha,
const float beta,
const int norm_type);
template<typename T>
struct NovoGradFunctor
{
__device__ __forceinline__ void operator()(
int chunk_size,
volatile int* noop_gmem,
TensorListMetadata<3>& tl,
const float beta1,
const float beta2,
const float beta3,
const float eps,
const float step_size,
momentMode_t m_mode,
const float decay,
const float* per_tensor_grad_norm)
{
// I'd like this kernel to propagate infs/nans.
// if(*noop_gmem == 1)
// return;
int tensor_loc = tl.block_to_tensor[blockIdx.x];
int tensor_num = tl.start_tensor_this_launch + tensor_loc;
int chunk_idx = tl.block_to_chunk[blockIdx.x];
int n = tl.sizes[tensor_loc];
float grad_norm = per_tensor_grad_norm[tensor_num];
T* g = (T*)tl.addresses[0][tensor_loc];
g += chunk_idx*chunk_size;
T* p = (T*)tl.addresses[1][tensor_loc];
p += chunk_idx*chunk_size;
T* m = (T*)tl.addresses[2][tensor_loc];
m += chunk_idx*chunk_size;
n -= chunk_idx*chunk_size;
// see note in multi_tensor_scale_kernel.cu
for(int i_start = 0;
i_start < n && i_start < chunk_size;
i_start += blockDim.x*ILP)
{
T r_g[ILP];
T r_p[ILP];
T r_m[ILP];
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
r_g[ii] = g[i];
r_p[ii] = p[i];
r_m[ii] = m[i];
} else {
r_g[ii] = T(0);
r_p[ii] = T(0);
r_m[ii] = T(0);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
if (m_mode == MOMENT_MODE_0) {
T denom = grad_norm + eps;
r_g[ii] = (r_g[ii] / denom) + (decay * r_p[ii]);
r_m[ii] = beta1 * r_m[ii] + beta3 * r_g[ii];
r_p[ii] = r_p[ii] - (step_size * r_m[ii]);
}
else {
r_m[ii] = beta1 * r_m[ii] + beta3 * r_g[ii];
T denom = grad_norm + eps;
T update = (r_m[ii] / denom) + (decay * r_p[ii]);
r_p[ii] = r_p[ii] - (step_size * update);
}
}
#pragma unroll
for(int ii = 0; ii < ILP; ii++)
{
int i = i_start + threadIdx.x + ii*blockDim.x;
if(i < n && i < chunk_size)
{
p[i] = r_p[ii];
m[i] = r_m[ii];
}
}
}
}
};
void multi_tensor_novograd_cuda(
int chunk_size,
at::Tensor noop_flag,
std::vector<std::vector<at::Tensor>> tensor_lists,
at::Tensor grad_norms,
const float lr,
const float beta1,
const float beta2,
const float epsilon,
const int step,
const int bias_correction,
const float weight_decay,
const int grad_averaging,
const int moment_mode,
const int norm_type)
{
using namespace at;
// Handle bias correction mode
float step_size = 0;
if (bias_correction == 1) {
const float bias_correction1 = 1 - std::pow(beta1, step);
const float bias_correction2 = 1 - std::pow(beta2, step);
step_size = lr * std::sqrt(bias_correction2)/bias_correction1;
}
else {
step_size = lr;
}
// Handle grad averaging mode
float beta3 = 1;
if (grad_averaging == 1) beta3 = 1 - beta1;
std::vector<std::vector<at::Tensor>> grad_list(tensor_lists.begin(), tensor_lists.begin()+1);
// Compute and update grad norm
// Here we use a per-tensor norm, and blend the new norm (n) and the old norm (gn) by
// L-2: gn = sqrt(a * gn^2 + b * n^2)
// L-inf: gn = a * gn + b * n
multi_tensor_norm_out_cuda(chunk_size, noop_flag, grad_list, grad_norms, beta2, (1.0f - beta2), norm_type);
// Assume a single type across g, p, m1 for now
DISPATCH_DOUBLE_FLOAT_AND_HALF(
tensor_lists[0][0].scalar_type(), 0, "novograd",
multi_tensor_apply<3>(
BLOCK_SIZE,
chunk_size,
noop_flag,
tensor_lists,
NovoGradFunctor<scalar_t_0>(),
beta1,
beta2,
beta3, // 1-beta1 or 1, depending on grad averaging mode
epsilon,
step_size,
(momentMode_t) moment_mode,
weight_decay,
grad_norms.data<float>()); )
AT_CUDA_CHECK(cudaGetLastError());
}
@@ -128,3 +128,53 @@ __device__ __forceinline__ T reduce_block_into_lanes
return final;
}
template<typename T>
__device__ __forceinline__ T reduce_block_into_lanes_max_op
(T *x,
T val,
int lanes=1,
bool share_result=false) // lanes is intended to be <= 32.
{
int tid = threadIdx.x + threadIdx.y*blockDim.x;
int blockSize = blockDim.x*blockDim.y; // blockSize is intended to be a multiple of 32.
if(blockSize >= 64)
{
x[tid] = val;
__syncthreads();
}
#pragma unroll
for(int i = (blockSize >> 1); i >= 64; i >>= 1)
{
if(tid < i)
x[tid] = fmaxf(fabsf(x[tid]), fabsf(x[tid+i]));
__syncthreads();
}
T final;
if(tid < 32)
{
if(blockSize >= 64)
final = fmaxf(fabsf(x[tid]), fabsf(x[tid+32]));
else
final = val;
// __SYNCWARP();
#pragma unroll
for(int i = 16; i >= lanes; i >>= 1)
final = fmaxf(fabsf(final), fabsf(__shfl_down_sync(0xffffffff, final, i)));
}
if(share_result)
{
if(tid < lanes)
x[tid] = final; // EpilogueOp
// Make sure the smem result is visible to all warps.
__syncthreads();
}
return final;
}
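
For intuition, a hedged host-side model of reduce_block_into_lanes_max_op with lanes=1: a tree max-reduction over absolute values, matching the shared-memory halving loop plus the final warp shuffle (assumes a power-of-two thread count):

def block_max_abs(vals):
    x = [abs(v) for v in vals]     # the device code folds fabsf into each max
    stride = len(x) // 2
    while stride >= 1:             # halve the active range each round
        for tid in range(stride):
            x[tid] = max(x[tid], x[tid + stride])
        stride //= 2
    return x[0]
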
@@ -76,7 +76,9 @@ if "--cuda_ext" in sys.argv:
'csrc/multi_tensor_axpby_kernel.cu',
'csrc/multi_tensor_l2norm_kernel.cu',
'csrc/multi_tensor_lamb_stage_1.cu',
-'csrc/multi_tensor_lamb_stage_2.cu'],
+'csrc/multi_tensor_lamb_stage_2.cu',
+'csrc/multi_tensor_adam.cu',
+'csrc/multi_tensor_novograd.cu'],
extra_compile_args={'cxx': ['-O3'],
'nvcc':['-lineinfo',
'-O3',
...
import unittest
import os
import random
import torch
import apex
class TestFusedAdam(unittest.TestCase):
def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
self.max_abs_diff = max_abs_diff
self.max_rel_diff = max_rel_diff
self.iters = iters
torch.cuda.manual_seed(9876)
def tearDown(self):
pass
def gen_param_optim(self, tensors, adam_option):
ref_param = []
tst_param = []
for tensor in tensors:
ref_param.append(torch.nn.Parameter(tensor.clone()))
tst_param.append(torch.nn.Parameter(tensor.clone()))
ref_optim = torch.optim.Adam(ref_param, **adam_option)
tst_optim = apex.optimizers.FusedAdam(tst_param, **adam_option)
return (ref_param, tst_param, ref_optim, tst_optim)
def gen_grad(self, ref_param, tst_param):
for p_ref, p_tst in zip(ref_param, tst_param):
p_ref.grad = torch.rand_like(p_ref)
p_tst.grad = p_ref.grad
def gen_mixed_grad(self, ref_param, tst_param, scale=1.0):
half_grads = []
for p_ref, p_tst in zip(ref_param, tst_param):
half_grads.append(torch.rand_like(p_ref).half())
p_ref.grad = half_grads[-1].float() / scale
return half_grads
def get_max_diff(self, ref_param, tst_param):
max_abs_diff = max_rel_diff = 0
for p_ref, p_tst in zip(ref_param, tst_param):
max_abs_diff_p = (p_ref - p_tst).abs().max().item()
max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()
if max_abs_diff_p > max_abs_diff: max_abs_diff = max_abs_diff_p
if max_rel_diff_p > max_rel_diff: max_rel_diff = max_rel_diff_p
return max_abs_diff, max_rel_diff
def gen_single_type_test(self, param_type=torch.float):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=param_type, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
for i in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_double(self):
self.gen_single_type_test(param_type=torch.double)
def test_float(self):
self.gen_single_type_test(param_type=torch.float)
def test_half(self):
self.gen_single_type_test(param_type=torch.float16)
def test_multi_params(self):
sizes = [[4096, 1024], [4096], [4096, 2048], [32320, 1024], [1]]
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensors = []
for size in sizes:
tensors.append(torch.rand(size, dtype=torch.float, device='cuda'))
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim(tensors, adam_option)
for i in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
@unittest.skip('Fused scaling is no longer supported')
def test_scale(self):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
for i in range(self.iters):
scale = random.random() * 1000
half_grads = self.gen_mixed_grad(ref_param, tst_param, scale)
ref_optim.step()
tst_optim.step(grads=half_grads, scale=scale)
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
@unittest.skip('fp16 output params are no longer supported')
def test_fp16_output(self):
nelem = 278011
adam_option = {'lr':5e-4, 'betas':(0.9, 0.999), 'eps':1e-08,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
fp16_param = torch.nn.Parameter(tensor.clone().half())
for i in range(self.iters):
half_grads = self.gen_mixed_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step(grads=half_grads, output_params=[fp16_param])
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
max_abs_diff, max_rel_diff = self.get_max_diff(tst_param, \
[fp16_param.float()])
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
def test_adam_option(self):
nelem = 1
adam_option = {'lr':0.01, 'betas':(0.6, 0.9), 'eps':3e-06,
'weight_decay':0, 'amsgrad':False}
tensor = torch.rand(nelem, dtype=torch.float, device='cuda')
ref_param, tst_param, ref_optim, tst_optim = \
self.gen_param_optim([tensor], adam_option)
for i in range(self.iters):
self.gen_grad(ref_param, tst_param)
ref_optim.step()
tst_optim.step()
max_abs_diff, max_rel_diff = self.get_max_diff(ref_param, tst_param)
self.assertLessEqual(max_abs_diff, self.max_abs_diff)
self.assertLessEqual(max_rel_diff, self.max_rel_diff)
if __name__ == '__main__':
script_path = os.path.dirname(os.path.realpath(__file__))
unittest.main()
import unittest
import torch
import apex
import os
class TestFP16Optimizer(unittest.TestCase):
def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
@@ -35,7 +36,7 @@ class TestFP16Optimizer(unittest.TestCase):
ref_optim = torch.optim.Adam(self.ref_model.parameters())
ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
-tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
+tst_optim = apex.optimizers.FusedAdam_v1(self.tst_model.parameters())
tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
for i in range(self.iters):
@@ -57,7 +58,7 @@ class TestFP16Optimizer(unittest.TestCase):
ref_optim = torch.optim.Adam(self.ref_model.parameters())
ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, static_loss_scale=128.0, verbose=False)
-tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
+tst_optim = apex.optimizers.FusedAdam_v1(self.tst_model.parameters())
tst_optim = apex.optimizers.FP16_Optimizer(tst_optim, static_loss_scale=128.0)
for i in range(self.iters):
@@ -80,7 +81,7 @@ class TestFP16Optimizer(unittest.TestCase):
ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
tst_groups = [{'params': [self.tst_model.weight]},{'params': [self.tst_model.bias]}]
-tst_optim = apex.optimizers.FusedAdam(tst_groups)
+tst_optim = apex.optimizers.FusedAdam_v1(tst_groups)
tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
for i in range(self.iters):
@@ -100,7 +101,7 @@ class TestFP16Optimizer(unittest.TestCase):
ref_optim = torch.optim.Adam(self.ref_model.parameters())
ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)
-tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters(), max_grad_norm=0.01)
+tst_optim = apex.optimizers.FusedAdam_v1(self.tst_model.parameters(), max_grad_norm=0.01)
tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)
for i in range(self.iters):
...
@@ -23,7 +23,7 @@ class TestFusedAdam(unittest.TestCase):
tst_param.append(torch.nn.Parameter(tensor.clone()))
ref_optim = torch.optim.Adam(ref_param, **adam_option)
-tst_optim = apex.optimizers.FusedAdam(tst_param, **adam_option)
+tst_optim = apex.optimizers.FusedAdam_v1(tst_param, **adam_option)
return (ref_param, tst_param, ref_optim, tst_optim)
...